In [3]:
!pip3 install pandas

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\alexm\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## 1. Data Preperation

In [4]:
import pandas as pd
df = pd.read_csv("hf://datasets/AzharAli05/Resume-Screening-Dataset/dataset.csv")
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Role,Resume,Decision,Reason_for_decision,Job_Description
0,E-commerce Specialist,Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...
1,Game Developer,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...
2,Human Resources Specialist,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...
3,E-commerce Specialist,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...
4,E-commerce Specialist,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...


In [5]:
from pandasql import sqldf

all_roles_query = """
SELECT DISTINCT Role
FROM df 
"""
all_decision_reason_query = """
SELECT DISTINCT Reason_for_decision
FROM df 
"""
print(df.shape)
all_roles = sqldf(all_roles_query, locals())
all_dec_reason = sqldf(all_decision_reason_query, locals())
print(all_roles.count())
print(all_dec_reason.count())
df['decision_binary'] = df['Decision'].apply(lambda x: 1 if x == 'select' else 0)
print(df['decision_binary'].mean())


(10174, 5)
Role    45
dtype: int64
Reason_for_decision    539
dtype: int64
0.49734617652840574


## Observations
This dataset includes recruiting decisions for 45 different roles and consists of over 10,000 decisions made. Each candidate entry hase a role they applied for, their resume in a text format, the decision made by the recruiter, the reason why they made that decision, and the job description.
This dataset also has a balanced set of outcomes with about 50% of each outcome.

## Data Extraction

- Personal Info
  - Every resume starts with ... resume for {Name}
  - Look for keywords email,phone,LinkedIn,address
- Data Splitting
  - Split resume into different chuncks (Summary, education, skills, experience, achievements, certifications, projects, references, professional memberships)


### 1. Add tailored columns for each section of the resume

In [None]:
import re

def extract_sections_by_headers(text):
    header_map = {
        # Summary variants
        "summary": "summary",
        "objective": "summary",
        "about me": "summary",
        "career focus": "summary",
        "career goals": "summary",
        "professional summary": "summary",

        # Experience variants
        "experience": "experience",
        "professional experience": "experience",
        "employment history": "experience",
        "work history": "experience",

        # Skills variants
        "skills": "skills",
        "technical skills": "skills",
        "core competencies": "skills",

        # Education variants
        "education": "education",
        "academic background": "education",

        # Certifications variants
        "certifications": "certifications",
        "licenses": "certifications",

        # Projects variants
        "projects": "projects",
        "personal projects": "projects",
        "portfolio": "projects",

        # References variants
        "references": "references",
        "referees": "references",
        "recommendations": "references",
        "professional references": "references",
    }

    # Compile regex to find headers exactly matching the keys
    # allow colon/dash and surrounding whitespace
    pattern = re.compile(
        r'^\s*(%s)\s*[:\-]?\s*$' % '|'.join(re.escape(h) for h in header_map.keys()),
        flags=re.IGNORECASE | re.MULTILINE
    )

    matches = list(pattern.finditer(text))
    sections = {}

    for i, match in enumerate(matches):
        raw_header = match.group(1).lower()
        canonical_header = header_map[raw_header]

        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)

        section_text = text[start:end].strip()

        # Normalize whitespace inside section text (optional)
        section_text = re.sub(r'\n{3,}', '\n\n', section_text)  # collapse 3+ newlines to 2
        section_text = re.sub(r'[ \t]+', ' ', section_text)      # collapse spaces/tabs to one space

        if canonical_header in sections:
            # Separate multiple blocks with two newlines for readability
            sections[canonical_header] += "\n\n" + section_text
        else:
            sections[canonical_header] = section_text

    # Ensure all expected keys are present even if empty
    for key in set(header_map.values()):
        sections.setdefault(key, "")

    return sections


In [7]:
section_keys = [
    "summary",
    "experience",
    "skills",
    "education",
    "certifications",
    "projects",
    "references"
]

# Remove existing extracted columns if they exist
for col in section_keys:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

# Extract and build new DataFrame of sections
extracted_data = []
for index, row in df.iterrows():
    sections = extract_sections_by_headers(row['Resume'])
    row_data = {key: sections.get(key, "") for key in section_keys}
    extracted_data.append(row_data)

extracted_df = pd.DataFrame(extracted_data)

# Concatenate with original DataFrame
df = pd.concat([df.reset_index(drop=True), extracted_df], axis=1)


### 2. Data Cleaning
- Remove bullet points
- lowercase letters

In [8]:

# Function to clean bullet points from text
def clean_text(text):
    # Common bullet point regex
    bullet_chars = [
        r'\*',  # asterisk
        r'-',   # dash
        r'\+',  # plus
        r'\u2022',  # bullet •
        r'\u2023',  # triangular bullet ‣
        r'\u25E6',  # white bullet ◦
        r'\u2043',  # hyphen bullet ⁃
        r'\u2219',  # bullet operator ∙
        r'\u00B7'   # middle dot ·
    ]
    bullet_pattern = '[' + ''.join(bullet_chars) + ']' + r'\s*'
    if not isinstance(text, str):
        return text  # skip non-string values
    cleaned_text = re.sub(bullet_pattern, '', text)
    cleaned_text.lower()
    return cleaned_text
columns_to_clean = ['summary', 'experience', 'skills', 'education', 'certifications', 'projects', 'references']

for col in columns_to_clean:
    df[col] = df[col].apply(clean_text)


In [9]:
print(len(df))
first_row = df.iloc[155]

for col, val in first_row.items():
    if col != "Resume":
      print(f"{col}:\n{val}\n{'-'*100}")

10174
Role:
Data Engineer
----------------------------------------------------------------------------------------------------
Decision:
select
----------------------------------------------------------------------------------------------------
Reason_for_decision:
Solid experience in machine learning and AI.
----------------------------------------------------------------------------------------------------
Job_Description:
Join our fast-growing team and help us scale our product offerings as a Data Engineer with expertise in Airflow, Data Warehousing, MLOps.
----------------------------------------------------------------------------------------------------
decision_binary:
1
----------------------------------------------------------------------------------------------------
summary:
Highly skilled Data Engineer with expertise in MLOps, Airflow, Big Data, Cloud Platforms, and Spark. Proven track record of designing, building, and deploying scalable data pipelines and machine learning

In [10]:
df['combined_text'] = (
    df['summary'].fillna('') + ' ' +
    df['experience'].fillna('') + ' ' +
    df['skills'].fillna('') + ' ' +
    df['education'].fillna('') + ' ' +
    df['certifications'].fillna('') + ' ' +
    df['projects'].fillna('') + ' ' +
    df['Reason_for_decision'].fillna('') + ' ' +
    df['Job_Description'].fillna('')
)


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def train_and_evaluate(df, 
                       max_features=30000, 
                       ngram_range=(3,4), 
                       k=1500, 
                       alpha=0.01,
                       random_state=42,
                       test_size=0.2,
                       return_probs=False):
    """
    Train and evaluate ComplementNB with given parameters.
    If return_probs=True, also return test DataFrame with predicted probabilities per sample.
    
    Returns:
        metrics (dict): Performance metrics
        model: trained ComplementNB model
        vectorizer: fitted TfidfVectorizer
        chi2_selector: fitted SelectKBest
        (optional) X_test_with_probs (DataFrame): test samples with predicted probabilities
        (optional) y_proba (np.array): predicted probabilities for test set
    """

    # Vectorize text
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features, ngram_range=ngram_range)
    X_text = (
        df["combined_text"].fillna("") + " " +
        df["Job_Description"].fillna("") + " " +
        df["Role"].fillna("")
    )
    X = vectorizer.fit_transform(X_text)
    y = df['decision_binary']

    # Feature selection with fixed k
    current_k = min(k, X.shape[1])
    chi2_selector = SelectKBest(chi2, k=current_k)
    X_selected = chi2_selector.fit_transform(X, y)

    # Train/test split - split original df to keep roles for later
    df_train, df_test, X_train, X_test, y_train, y_test = train_test_split(
        df, X_selected, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Train model
    model = ComplementNB(alpha=alpha)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

    metrics = {
        'max_features': max_features,
        'ngram_range': ngram_range,
        'k': current_k,
        'alpha': alpha,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

    if return_probs:
        y_proba = model.predict_proba(X_test)[:, 1]  # prob of positive class
        df_test = df_test.copy()
        df_test['predicted_proba'] = y_proba
        return metrics, model, vectorizer, chi2_selector, df_test, y_proba
    else:
        return metrics, model, vectorizer, chi2_selector


In [12]:
import matplotlib.pyplot as plt

def plot_metric_by_param(results, param_name, param_values=None):
    """
    Plot all metrics vs param_name.
    results: list of dicts with metrics and params
    param_name: str, which param to group by and plot on x-axis
    param_values: optional, list of param values to order the plot
    """

    metrics = ['accuracy', 'precision', 'recall', 'f1']

    if param_values is None:
        param_values = sorted(set(res[param_name] for res in results))

    metric_data = {metric: [] for metric in metrics}

    for val in param_values:
        matching = [res for res in results if res[param_name] == val]
        if matching:
            for metric in metrics:
                avg_metric = sum(res[metric] for res in matching) / len(matching)
                metric_data[metric].append(avg_metric)
        else:
            for metric in metrics:
                metric_data[metric].append(None)

    # Plot all metrics vs param on same plot
    plt.figure(figsize=(10,6))
    for metric in metrics:
        plt.plot(param_values, metric_data[metric], marker='o', label=metric.capitalize())

    plt.title(f'Model Performance Metrics vs {param_name}')
    plt.xlabel(param_name)
    plt.ylabel('Score')
    plt.ylim(0, 1)
    plt.grid(True, axis='y')
    plt.legend()
    plt.tight_layout()
    plt.show()

# Plot different metrics for models
# # Parameter grids
# max_features_list = [1000, 5000, 10000, 15000, 20000, 25000, 30000]
# ngram_ranges = [(1,2), (1,3), (1,4), (2,3), (3,4)]
# k_values = [100, 500, 1000, 1500, 2000]
# alphas = [0.01]

# results = []

# for max_feat in max_features_list:
#     for ngram in ngram_ranges:
#         for k in k_values:
#             for alpha in alphas:
#                 print(f"Training with max_features={max_feat}, ngram_range={ngram}, k={k}, alpha={alpha}")
#                 res = train_and_evaluate(
#                     df,
#                     max_features=max_feat,
#                     ngram_range=ngram,
#                     k=k,
#                     alpha=alpha,
#                     random_state=42,
#                     test_size=0.2
#                 )
#                 results.append(res)

# # Convert ngram_range tuples to string for nicer plotting labels
# # Convert ngram_range tuples to string for nicer plotting labels
# for r in results:
#     r['ngram_range_str'] = str(r['ngram_range'])

# # Print parameter ranges tested
# print("\nParameter ranges tested:")
# for param in ['max_features', 'k', 'alpha']:
#     values = sorted(set(res[param] for res in results))
#     print(f"{param}: min={values[0]}, max={values[-1]}")

# ngram_vals = sorted(set(res['ngram_range_str'] for res in results))
# print(f"ngram_range options: {', '.join(ngram_vals)}")

# # Print best combination by accuracy
# best_result = max(results, key=lambda x: x['accuracy'])
# print("\nBest combination found:")
# for key, value in best_result.items():
#     if key != 'ngram_range_str':  # optional, skip to reduce clutter
#         print(f"{key}: {value}")

# # Plot grouped comparisons
# plot_metric_by_param(results, 'max_features')
# plot_metric_by_param(results, 'ngram_range_str')
# plot_metric_by_param(results, 'k')
# plot_metric_by_param(results, 'alpha')



In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import ComplementNB

def cross_val_evaluate_best(df, max_features, ngram_range, k, alpha, cv=5, random_state=42):
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)

    # NEW → combine resume + job role + job desc
    X_text = (
        df["combined_text"].fillna("") + " " +
        df["Job_Description"].fillna("") + " " +
        df["Role"].fillna("")
    )
    y = df["decision_binary"]

    accuracies, precisions, recalls, f1s = [], [], [], []

    for train_idx, val_idx in skf.split(X_text, y):

        X_train_text = X_text.iloc[train_idx]
        X_val_text = X_text.iloc[val_idx]
        y_train = y.iloc[train_idx]
        y_val = y.iloc[val_idx]

        # Vectorizer
        vectorizer = TfidfVectorizer(
            stop_words="english",
            max_features=max_features,
            ngram_range=ngram_range
        )
        X_train_tfidf = vectorizer.fit_transform(X_train_text)

        # Feature Selection
        selector = SelectKBest(chi2, k=min(k, X_train_tfidf.shape[1]))
        X_train_sel = selector.fit_transform(X_train_tfidf, y_train)

        # Model
        clf = ComplementNB(alpha=alpha)
        clf.fit(X_train_sel, y_train)

        # Validation transform
        X_val_tfidf = vectorizer.transform(X_val_text)
        X_val_sel = selector.transform(X_val_tfidf)

        y_pred = clf.predict(X_val_sel)

        # Metrics
        accuracies.append(accuracy_score(y_val, y_pred))
        precisions.append(precision_score(y_val, y_pred, average="weighted"))
        recalls.append(recall_score(y_val, y_pred, average="weighted"))
        f1s.append(f1_score(y_val, y_pred, average="weighted"))

    print(f"Cross-validated metrics (cv={cv}):")
    print(f"Accuracy:  {sum(accuracies)/cv:.4f}")
    print(f"Precision: {sum(precisions)/cv:.4f}")
    print(f"Recall:    {sum(recalls)/cv:.4f}")
    print(f"F1 Score:  {sum(f1s)/cv:.4f}")

cross_val_evaluate_best(
    df,
    max_features=30000,
    ngram_range=(3,4),
    k=1500,
    alpha=0.01,
    cv=5
)


Cross-validated metrics (cv=5):
Accuracy:  0.9456
Precision: 0.9458
Recall:    0.9456
F1 Score:  0.9456


In [14]:
# Extract data from resume
from resumeReviewer import extract_text_from_pdf
resume_path = "resume.pdf"
new_resume_text = extract_text_from_pdf(resume_path)

# Format resume, job desc, job role 
resume_sections = extract_sections_by_headers(new_resume_text)
job_title = "Cloud Engineer"
job_desc = "We're seeking a talented Cloud Engineer to work on AI model development and bring new ideas to life."
cleaned_sections = {}
for section, text in resume_sections.items():
    cleaned_sections[section] = clean_text(text)

# Format 
combined_text = (
    cleaned_sections.get("summary", "") + " " +
    cleaned_sections.get("experience", "") + " " +
    cleaned_sections.get("skills", "") + " " +
    cleaned_sections.get("education", "") + " " +
    cleaned_sections.get("certifications", "") + " " +
    cleaned_sections.get("projects", "") + " " +
    job_desc + " " +
    job_title
).strip()

# Make prediction of outcome
def predict_resume_text(model, vectorizer, chi2_selector, combined_text):
    # Convert to TF-IDF
    X_tfidf = vectorizer.transform([combined_text])
    # Apply feature selection
    X_selected = chi2_selector.transform(X_tfidf)

    # Predict label
    label = model.predict(X_selected)[0]

    # Probabilities
    probs = model.predict_proba(X_selected)[0]

    return label, probs
    
def assign_verdict_and_reason(label, probs):
    verdict = "Hired" if label == 1 else "Not hired"
    confidence = probs[label]  # probability of predicted class
    reason = f"Predicted as '{verdict}' with confidence {confidence:.2%}."
    return verdict, reason

metrics, model, vectorizer, chi2_selector = train_and_evaluate(df, ngram_range=(1,3))
label, probs = predict_resume_text(model, vectorizer, chi2_selector, combined_text)
verdict, reason_for_decision = assign_verdict_and_reason(label, probs)

from resumeReviewer import extract_text_from_pdf
label, probs = predict_resume_text(model, vectorizer, chi2_selector, new_resume_text)
print(f"Predicted label: {label}")
print(f"Probabilities: Not hired = {probs[0]:.3f}, Hired = {probs[1]:.3f}")



Predicted label: 0
Probabilities: Not hired = 0.518, Hired = 0.482


In [15]:
import numpy as np
feature_names = vectorizer.get_feature_names_out()

# Get indices of top features with highest difference in log-prob between classes
class0_log_prob = model.feature_log_prob_[0]
class1_log_prob = model.feature_log_prob_[1]
diff = class1_log_prob - class0_log_prob  # positive = favors class 1 (hired)

# Sort features by difference
top_pos_indices = np.argsort(diff)[-10:]  # top 10 features favoring class 1
top_neg_indices = np.argsort(diff)[:10]   # top 10 features favoring class 0
print("Top features favoring Hired:")
for idx in reversed(top_pos_indices):
    print(f"{feature_names[idx]}: {diff[idx]:.4f}")

print("\nTop features favoring Not Hired:")
for idx in top_neg_indices:
    print(f"{feature_names[idx]}: {diff[idx]:.4f}")


Top features favoring Hired:
2018 def: 8.0722
achievements winner ux: 8.0722
ability communicate complex: 8.0722
achievements best paper: 8.0722
2018 data science: 8.0704
20xx: 8.0146
2022: 8.0146
2017 professional: 8.0146
2017 professional memberships: 8.0146
2016 insufficient design: 8.0146

Top features favoring Not Hired:
25 reduction customer: -8.2234
30 increase efficiency: -8.2124
2017 lacks: -8.0932
achieving 20 improvement: -8.0932
achieving 25: -8.0932
achieving 25 increase: -8.0932
2018 published article: -8.0932
30 increase employee: -8.0904
actionable insights drive: -8.0904
30 designed: -8.0904


In [16]:
def predict_hire_probability(candidate_text, job_description, role, vectorizer, chi2_selector, model):
    # Combine input text exactly as training
    combined_text = f"{candidate_text} {job_description} {role}"

    # Vectorize
    X_vect = vectorizer.transform([combined_text])

    # Feature select
    X_selected = chi2_selector.transform(X_vect)

    # Predict probability of hired (class 1)
    proba = model.predict_proba(X_selected)[:, 1][0]

    return proba
roles_list = all_roles['Role'].tolist()

for role in roles_list:
    probability = predict_hire_probability(combined_text, job_desc, role, vectorizer, chi2_selector, model)
    print(f"Predicted hiring probability for {role}: {probability:.3f}")


Predicted hiring probability for E-commerce Specialist: 0.488
Predicted hiring probability for Game Developer: 0.487
Predicted hiring probability for Human Resources Specialist: 0.488
Predicted hiring probability for Mobile App Developer: 0.487
Predicted hiring probability for UX Designer: 0.492
Predicted hiring probability for Cloud Engineer: 0.486
Predicted hiring probability for Digital Marketing Specialist: 0.489
Predicted hiring probability for AI Researcher: 0.490
Predicted hiring probability for UI Engineer: 0.487
Predicted hiring probability for AR/VR Developer: 0.487
Predicted hiring probability for Machine Learning Engineer: 0.483
Predicted hiring probability for Database Administrator: 0.486
Predicted hiring probability for Data Engineer: 0.486
Predicted hiring probability for Cybersecurity Analyst: 0.487
Predicted hiring probability for Robotics Engineer: 0.488
Predicted hiring probability for Business Analyst: 0.487
Predicted hiring probability for Data Analyst: 0.487
Pred

In [17]:
import pickle

# Save trained model
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save vectorizer
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# Save chi2 selector
with open("chi2_selector.pkl", "wb") as f:
    pickle.dump(chi2_selector, f)

print("Model, vectorizer, and selector successfully exported!")


Model, vectorizer, and selector successfully exported!
