In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

import pickle


In [4]:
df = pd.read_csv("../data/resume_features_bert.csv")
df.head()


Unnamed: 0,Category,Resume,skill_match,keyword_overlap,resume_length,experience_score,bert_similarity
0,Data Science,Skills * Programming Languages: Python (pandas...,0.785714,11,1,5,0.500591
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,0.357143,5,1,4,0.38312
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",0.5,7,1,4,0.387008
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,0.5,7,0,24,0.344844
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",0.214286,3,0,3,0.338331


In [5]:
# Shortlisted if BERT similarity is high
df["shortlisted"] = (df["bert_similarity"] >= 0.65).astype(int)

df["shortlisted"].value_counts()


shortlisted
0    962
Name: count, dtype: int64

In [17]:
features = [
    "skill_match",
    "keyword_overlap",
    "resume_length",
    "experience_score",
    "bert_similarity"
]

X = df[features]
y = df["shortlisted"]

# Safety check
assert y.nunique() == 2, "Target has only one class!"


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [20]:
# Create balanced labels using quantile threshold
threshold = df["bert_similarity"].quantile(0.5)

df["shortlisted"] = (df["bert_similarity"] >= threshold).astype(int)

# Check distribution
print(df["shortlisted"].value_counts())


shortlisted
1    484
0    478
Name: count, dtype: int64


In [21]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
rf_prob = rf.predict_proba(X_test)[:, 1]

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print("F1:", f1_score(y_test, rf_pred))
print("ROC-AUC:", roc_auc_score(y_test, rf_prob))



Random Forest
Accuracy: 1.0
F1: 1.0
ROC-AUC: 1.0


In [22]:
gb = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    random_state=42
)

gb.fit(X_train, y_train)

gb_pred = gb.predict(X_test)
gb_prob = gb.predict_proba(X_test)[:, 1]

print("Gradient Boosting")
print("Accuracy:", accuracy_score(y_test, gb_pred))
print("F1:", f1_score(y_test, gb_pred))
print("ROC-AUC:", roc_auc_score(y_test, gb_prob))


Gradient Boosting
Accuracy: 1.0
F1: 1.0
ROC-AUC: 1.0


In [23]:
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train, y_train)

xgb_pred = xgb.predict(X_test)
xgb_prob = xgb.predict_proba(X_test)[:, 1]

print("XGBoost")
print("Accuracy:", accuracy_score(y_test, xgb_pred))
print("F1:", f1_score(y_test, xgb_pred))
print("ROC-AUC:", roc_auc_score(y_test, xgb_prob))


XGBoost
Accuracy: 1.0
F1: 1.0
ROC-AUC: 1.0


In [25]:
with open("../models/final_model.pkl", "wb") as f:
    pickle.dump(xgb, f)

print("✅ Model saved successfully")


✅ Model saved successfully
