In [1]:
pip install numpy pandas scikit_learn matplotlib seaborn



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="resume_screen.csv")

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,id,resume_text_256,jd_text_128,job_family,seniority,advance
0,RS00000,"8+ years experience; key skills: linux, contai...",We are hiring a Senior DevOps professional. Mu...,DevOps,Senior,1
1,RS00001,"0+ years experience; key skills: kafka, ci/cd,...",We are hiring a Junior PM professional. Must h...,PM,Junior,0
2,RS00002,"0+ years experience; key skills: bug-tracking,...",We are hiring a Junior QA professional. Must h...,QA,Junior,1
3,RS00003,"2+ years experience; key skills: test-cases, b...",We are hiring a Mid QA professional. Must have...,QA,Mid,1
4,RS00004,"9+ years experience; key skills: testing, bug-...",We are hiring a Senior QA professional. Must h...,QA,Senior,1


In [4]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               2000 non-null   object
 1   resume_text_256  2000 non-null   object
 2   jd_text_128      2000 non-null   object
 3   job_family       2000 non-null   object
 4   seniority        2000 non-null   object
 5   advance          2000 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 93.9+ KB


In [5]:
# Count the missing values in each column.
data.isna().sum()

Unnamed: 0,0
id,0
resume_text_256,0
jd_text_128,0
job_family,0
seniority,0
advance,0


In [6]:
# Display the column labels of the loaded frame.
data.columns

Index(['id', 'resume_text_256', 'jd_text_128', 'job_family', 'seniority',
       'advance'],
      dtype='object')

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

In [8]:
# Target: the binary `advance` label.
Y = data.loc[:, "advance"]

# Features: every remaining column.
X = data.drop(columns="advance")

In [9]:
# Fit an independent TF-IDF vocabulary for each text column. Re-fitting a
# single TfidfVectorizer on the second column would overwrite the vocabulary
# learned from the first, so the fitted object could no longer reproduce the
# resume features. Wrapping both in one ColumnTransformer keeps every fitted
# vocabulary inside `vectorizer`.
vectorizer = ColumnTransformer(
    transformers=[
        # A scalar column name hands the transformer a 1-D series of strings.
        ("resume", TfidfVectorizer(), "resume_text_256"),
        ("jd", TfidfVectorizer(), "jd_text_128"),
    ],
    # Always return a dense array: the Keras model is fed a NumPy matrix.
    sparse_threshold=0,
)

# Resume features come first, followed by job-description features.
# `vectorizer.transform` expects a DataFrame containing both text columns.
# NOTE(review): the vocabularies and IDF weights are fit on all rows, i.e.
# before the train/test split, so test rows influence the features. Consider
# fitting on the training rows only.
X_vectorized = vectorizer.fit_transform(X)

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_vectorized,
    Y,
    random_state=42,
    test_size=0.2,
)

In [11]:
# Seed Python, NumPy and TensorFlow so weight initialisation and dropout
# masks are repeatable from run to run.
keras.utils.set_random_seed(42)

# Number of input features, taken from the training matrix.
input_dim = X_train.shape[1]

# Feed-forward binary classifier.
# The input shape is declared with an explicit Input object: passing
# `input_dim=` to the first Dense layer is deprecated in Keras 3 and emits a
# UserWarning when the layer is constructed.
model = Sequential([
    keras.Input(shape=(input_dim,)),
    # First hidden layer: 64 ReLU units.
    Dense(64, activation='relu'),
    # Dropout to reduce overfitting.
    Dropout(0.5),
    # Second hidden layer: 32 ReLU units.
    Dense(32, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit for binary classification (advance vs. not advance).
    Dense(1, activation='sigmoid'),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
# Train the model for 20 epochs with mini-batches of 32 samples.
# Keep the returned History object so the per-epoch loss and accuracy can be
# inspected or plotted afterwards; assigning it also stops the bare object
# repr from being shown as the cell's output.
history = model.fit(
    X_train,
    Y_train,
    epochs=20,
    batch_size=32,
)

Epoch 1/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.4654 - loss: 0.7012
Epoch 2/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5108 - loss: 0.6923
Epoch 3/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5713 - loss: 0.6811
Epoch 4/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5660 - loss: 0.6782
Epoch 5/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5890 - loss: 0.6684
Epoch 6/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6267 - loss: 0.6542
Epoch 7/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6438 - loss: 0.6309
Epoch 8/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6686 - loss: 0.6221
Epoch 9/20
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7e81468cfec0>

In [14]:
# Score the trained model on the held-out split and report both metrics.
loss, accuracy = model.evaluate(x=X_test, y=Y_test)
print(
    f"Test Loss: {loss:.4f}",
    f"Test Accuracy: {accuracy:.4f}",
    sep="\n",
)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8274 - loss: 0.3522  
Test Loss: 0.3261
Test Accuracy: 0.8550


In [16]:
import json
import pickle
import tensorflow as tf
from tensorflow.keras.models import model_from_json

In [18]:
# Serialise the object currently bound to `vectorizer` to disk.
# NOTE(review): inference must apply the same fitted transform to both text
# columns - confirm that this one object covers both.
with open('tfidf_vectorizer.pkl', mode='wb') as file:
    file.write(pickle.dumps(vectorizer))