<a href="https://colab.research.google.com/github/alexv710/debias_cv_data/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import io

In [2]:
# temp_data = pd.io.stata.read_stata("data_candidates_sample_tab6.dta")
# temp_data.to_csv('data_candidates_sample_tab6.csv')

# temp_data = pd.io.stata.read_stata("data_candidates_sample_tab7_tabA9.dta")
# temp_data.to_csv('data_candidates_sample_tab7_tabA9.csv')

# temp_data = pd.io.stata.read_stata("data_recruiters.dta")
# temp_data.to_csv('data_recruiters.csv')

In [3]:
# read the mainsample.dta file for further processing 
data = pd.io.stata.read_stata("data_candidates_mainsample.dta")
data.to_csv('data_candidates_mainsample.csv')

In [4]:
# Import the mainsample via StataReader to use variable_labels (old version of the read_stata)
data_stata = pd.io.stata.StataReader("data_candidates_mainsample.dta")
data_labels = data_stata.variable_labels()

In [5]:
# Write the Labels into a csv
#import csv

# with open('dataLabels.csv', 'w') as f:  # You will need 'wb' mode in Python 2.x
#    w = csv.DictWriter(f, data_labels.keys())
#    w.writeheader()
#    w.writerow(data_labels)

In [6]:
# Remove all columns in the dataset for which no labels exist
# Those are mostly columns needed for p-tests and some matrix calculations in
# Stata. The outputted labels are those that are kept in the data

data = data.drop(columns=['CVAxORIGINE_IM_12', 'CVAxZUS_CUCS', 'CVAxZetI', 'C', 'CVA0', 'p_offre1', 'control_manquant', 'ZouI_pred', 'ZouI0', 'ALE_16'])

# Remove sampling weights (within and out of the experiment)

data = data.drop(columns='POIDS_SEL')

# Remove all the centered features

data = data.drop(columns=['FEMME_c', 'a30m_c', 'a3049_c', 'a50p_c', 'dip_aucun_c', 'dip_bepcap_c', 'dip_bac_c', 'dip_bac2p_c', 'recherche_LD_c', 'recherche_TLD_c'])

# Remove all features that are not directly derivable from the CVs

data = data.drop(columns=['ORIGINE_IM_1', 'ORIGINE_IM_2', 'ORIGINE_IM_12', 'ID_OFFRE', 'ID_CANDIDAT', ])

for key, value in data_labels.items():
  for col in data.columns:
    if key==col:
      if len(value)==0 or value.startswith('ID_OFFRE=='):
        data = data.drop(columns=col)
      else:
        # Print all the columns that are included in the dataset
        print(key + ':      ', value)

CVA:       Treatment: anonymous resume
REFUSAL:       Recruiter refused the experiment
ENTRETIEN:       Interviewed
RECRUTE:       Hired
PREN_MUSULMAN:       Muslim souding name
ZUS_CUCS:       Deprived neighborhood
ZouI:       Minority (immigrant or child of immigrant or residing in deprived neighborhood)
ZetI:       Residing in deprived neighborhood and with foreign bachkground (child of or immi
FEMME:       Female candidate
a50p:       Candidate over 50 years old
a3049:       Candidate between 30 and 49 years old
a26m:       Candidate below 26 years old
dip_aucun:       No diploma
dip_bepcap:       Professional degree
dip_bac:       High school diploma
dip_bac2p:       Upper education degree
dip_bac2:       L2 (diploma 2 years after high school)
dip_bac3p:       At least L3 (diploma 3 years after high school)
duree_expro_offre:       Work experience for the job advertised (in years)
recherche_LD:       Candidate has been looking for a job for at least one year
recherche_TLD:       C

In [7]:
X_ano = pd.DataFrame()
X_unano = pd.DataFrame()

for index, row in data.iterrows():
  if row['CVA'] == 1: 
    X_ano = X_ano.append(row)
  else:
    X_unano = X_unano.append(row)

# Assign the interviewed column to our target vector
y_ano = X_ano['ENTRETIEN']
y_unano = X_unano['ENTRETIEN']

# Drop the Interviewed/hired & anonymized columns (hired is much dependend on the number of jobs available
# and will currently not be considered as target label)
X_ano = data.drop(columns=['ENTRETIEN', 'RECRUTE', 'CVA'])
X_unano = X_unano.drop(columns=['ENTRETIEN', 'RECRUTE', 'CVA'])

In [8]:
X_unano.to_csv('X_unano.csv')

In [9]:
import os
import datetime
import numpy as np
import pandas as pd
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [10]:
# Train, Test split

X_train, X_test, y_train, y_test = train_test_split(X_unano, y_unano, test_size=0.33, random_state=42)

input_shape = X_train[:1].shape

In [22]:
X_train.columns

Index(['ALE_1', 'ALE_10', 'ALE_11', 'ALE_12', 'ALE_13', 'ALE_14', 'ALE_15',
       'ALE_2', 'ALE_3', 'ALE_4', 'ALE_5', 'ALE_6', 'ALE_7', 'ALE_8', 'ALE_9',
       'CDI', 'CNT_sup6m', 'CV_attrayant_', 'CV_experience_',
       'CV_experience_3p', 'CV_formation_', 'CV_hesit', 'CV_hesit_',
       'CV_inactivite', 'CV_note_2', 'CV_note_3', 'CV_note_4', 'CV_note_5',
       'CV_note_7p', 'CV_qualif', 'CV_qualif_inf', 'CV_qualif_sup', 'FEMME',
       'PREN_MUSULMAN', 'REFUSAL', 'SALREV_SMIC_2', 'ZUS_CUCS', 'ZetI', 'ZouI',
       'a26m', 'a3049', 'a50p', 'cadre', 'codage_cv', 'construction',
       'dip_aucun', 'dip_bac', 'dip_bac2', 'dip_bac2p', 'dip_bac3p',
       'dip_bepcap', 'duree_expro_offre', 'effent_200plus', 'eoq',
       'etudes_etranger', 'industrie', 'langue_anglais', 'langue_arabe',
       'langue_autres', 'permis', 'poste_unique', 'profint', 'recherche_LD',
       'recherche_TLD', 'service_m', 'service_nm', 'travail_etranger'],
      dtype='object')

In [30]:
# define the keras model

name="simpleNet"
model = keras.Sequential(
    [
        layers.Dense(32, input_dim=67, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')  
    
    ],name=name
)
model.summary()

Model: "simpleNet"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 32)                2176      
_________________________________________________________________
dense_13 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 17        
Total params: 2,721
Trainable params: 2,721
Non-trainable params: 0
_________________________________________________________________


In [43]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', 'mse'])

In [44]:
model.fit(X_train, y_train, epochs=150, batch_size=10)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150


Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Ep

Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x223e71bdd30>

In [45]:
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))



ValueError: too many values to unpack (expected 2)

In [46]:
batch_size = 128
epochs = 50

model.compile(loss="categorical_crossentropy", optimizer=Adam(epsilon=1e-07, learning_rate=0.001), metrics=["accuracy"])
log_dir= os.path.join('logs','fit_'+name,datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),'')

#Learning Rate Annealer
from tensorflow.keras.callbacks import ReduceLROnPlateau
lrr = ReduceLROnPlateau(monitor='val_accuracy',
                       factor=.01,
                       patience=3,
                       min_lr=1e-7,
                       verbose=1)
#Early stopping
es = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=8, verbose=1, mode="auto", baseline=None, restore_best_weights=True)

# tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history=model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1,callbacks=[es, lrr])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
1/5 [=====>........................] - ETA: 0s - loss: nan - accuracy: 0.9062
Epoch 00004: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 5/50
Epoch 6/50
Epoch 7/50
1/5 [=====>........................] - ETA: 0s - loss: nan - accuracy: 0.9141
Epoch 00007: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-07.
Epoch 8/50
Epoch 9/50
1/5 [=====>........................] - ETA: 0s - loss: nan - accuracy: 0.8672Restoring model weights from the end of the best epoch.
Epoch 00009: early stopping


In [None]:
input_shape[1]

In [None]:
X_train