# Imports

In [33]:
!pip3 install spacy




In [41]:
"""
End-to-end ATS-score pipeline
–––––––––––––––––––––––––––––
1. Cleans + lemmatises Resume & Job-Description text
2. Creates sentence-BERT embeddings
3. Computes weighted (70-30) ATS score:
      70 % similarity(Resume, JD)  +
      30 % similarity(Resume, Role)
4. Prints and optionally saves results
"""

# ───────────────────────────── Imports
import re
import pathlib
import pandas as pd
import numpy as np

# NLP pre-processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy

# Embeddings & maths
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

import nltk

# Download all necessary corpora.
# These four calls run unconditionally every time the cell executes;
# ensure_nltk_resources() further down re-checks 'punkt' and 'stopwords' and
# downloads them only when they are missing locally.
# NOTE(review): 'wordnet' and 'averaged_perceptron_tagger' do not appear to be
# used by the code in this cell (lemmatisation goes through spaCy) — confirm
# before removing.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


# ───────────────────────────── House-keeping

DATA_FPATH = "dataset.csv"          # ← change if needed
MODEL_NAME = "all-MiniLM-L6-v2"     # fast, 384-d vector model
SPACY_MODEL = "en_core_web_sm"      # lemmatisation model
DEBOUNCE_DELAY = 600                # ms for UI typing – kept here for reference

# ---  Make sure required NLTK artefacts exist only once
def ensure_nltk_resources() -> None:
    """
    Make sure the NLTK artefacts this pipeline relies on are installed.

    Every resource is looked up locally first and is downloaded only when the
    lookup fails.  ``punkt_tab`` is listed next to ``punkt`` because the
    ``word_tokenize`` call in this notebook failed with
    ``Resource punkt_tab not found`` on the installed NLTK release.

    A download that does not succeed (``nltk.download`` returns ``False``,
    e.g. on an SSL certificate error) is reported with a ``RuntimeWarning``
    rather than being silently ignored, so the later ``LookupError`` is easier
    to trace back to its cause.
    """
    import warnings

    resources = {
        "punkt": "tokenizers/punkt",
        "punkt_tab": "tokenizers/punkt_tab",
        "stopwords": "corpora/stopwords",
    }
    for key, target in resources.items():
        try:
            nltk.data.find(target)
        except LookupError:
            # nltk.download() signals failure through its return value.
            if not nltk.download(key):
                warnings.warn(
                    f"NLTK resource '{key}' is missing and could not be "
                    "downloaded; tokenisation or stop-word removal may fail.",
                    RuntimeWarning,
                    stacklevel=2,
                )

ensure_nltk_resources()
STOP_WORDS = set(stopwords.words("english"))

# ---  Make sure spaCy model is available
# Load the spaCy pipeline once, with the components we do not need switched
# off (only tagger + lemmatizer are used).  If the model package is missing,
# spacy.load raises OSError: download it and retry the same load.
_SPACY_DISABLED = ["ner", "parser"]
try:
    nlp = spacy.load(SPACY_MODEL, disable=_SPACY_DISABLED)
except OSError:  # model missing
    from spacy.cli import download
    download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL, disable=_SPACY_DISABLED)

# ─────────────────────────────  Text helpers
_re_non_word = re.compile(r"\W+")
_re_digit    = re.compile(r"\d+")

def clean_text(text: str) -> str:
    """
    Normalise raw text for embedding.

    The text is lower-cased, every run of non-word characters and every run
    of digits is replaced by a single space, and the result is tokenised.
    Stop-words and one-character tokens are discarded; the surviving tokens
    are returned joined by single spaces.
    """
    lowered = text.lower()
    without_punct = _re_non_word.sub(" ", lowered)
    without_digits = _re_digit.sub(" ", without_punct)

    kept = []
    for token in word_tokenize(without_digits):
        # Skip stop-words and tokens of length 0 or 1.
        if len(token) <= 1 or token in STOP_WORDS:
            continue
        kept.append(token)
    return " ".join(kept)


def lemmatise(text: str) -> str:
    """
    Run *text* through the module-level spaCy pipeline ``nlp`` and return the
    lemma of every token, joined by single spaces.
    """
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)


def preprocess_series(ser: pd.Series) -> pd.Series:
    """
    Apply the full text pipeline to every element of *ser*.

    Missing values become empty strings and everything is cast to ``str``
    before ``clean_text`` and then ``lemmatise`` are mapped over the series.
    """
    as_text = ser.fillna("").astype(str)
    cleaned = as_text.map(clean_text)
    return cleaned.map(lemmatise)


# ─────────────────────────────  Main
def _rowwise_cosine(left: np.ndarray, right: np.ndarray) -> np.ndarray:
    """
    Cosine similarity between row i of *left* and row i of *right*.

    Gives the same values as ``cosine_similarity(left, right).diagonal()``
    without materialising the full N×N matrix.  Pairs in which either row has
    zero norm get a similarity of 0.
    """
    dots = np.einsum("ij,ij->i", left, right)
    denom = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    return np.divide(dots, denom, out=np.zeros_like(dots), where=denom != 0)


def main() -> None:
    """
    Run the ATS scoring pipeline on ``DATA_FPATH`` and print ranked results.

    Steps: load the CSV, clean + lemmatise ``Resume`` and ``Job_Description``,
    embed Resume / JD / Role with Sentence-BERT, compute per-row cosine
    similarities, min-max scale each similarity set to 0–1 and combine them as
    70 % Resume↔JD + 30 % Resume↔Role (scaled to 0–100).

    Because of the min-max step the scores are relative to the rows in this
    file, not absolute similarities.

    Raises:
        FileNotFoundError: ``DATA_FPATH`` does not exist.
        KeyError: a required column (Resume, Job_Description, Role) is missing.
        ValueError: the CSV contains no data rows.
    """
    # 1. Load data
    if not pathlib.Path(DATA_FPATH).exists():
        raise FileNotFoundError(f"{DATA_FPATH} not found – check path.")
    df = pd.read_csv(DATA_FPATH)

    required_cols = {"Resume", "Job_Description", "Role"}
    missing = required_cols - set(df.columns)
    if missing:
        # Sorted so the message is deterministic.
        raise KeyError(f"CSV missing columns: {', '.join(sorted(missing))}")
    if df.empty:
        raise ValueError(f"{DATA_FPATH} contains no rows – nothing to score.")

    # 2. Clean + lemmatise
    print("🔄  Pre-processing text …")
    df["Resume_Clean"] = preprocess_series(df["Resume"])
    df["JD_Clean"]     = preprocess_series(df["Job_Description"])

    # 3. Sentence-BERT embeddings
    print("🔄  Encoding with Sentence-BERT …")
    model = SentenceTransformer(MODEL_NAME)

    # Role is cast to str so non-string cells cannot break the encoder.
    role_texts = df["Role"].fillna("").astype(str).tolist()
    resume_vecs = model.encode(df["Resume_Clean"].tolist(), convert_to_numpy=True, show_progress_bar=True)
    jd_vecs     = model.encode(df["JD_Clean"].tolist(),     convert_to_numpy=True, show_progress_bar=True)
    role_vecs   = model.encode(role_texts,                  convert_to_numpy=True, show_progress_bar=True)

    # 4. Similarities (row i against row i only)
    sim_res_jd   = _rowwise_cosine(resume_vecs, jd_vecs)
    sim_res_role = _rowwise_cosine(resume_vecs, role_vecs)

    # 5. Normalise both sets 0–1, then weighted score
    scaler = MinMaxScaler()
    sim_res_jd   = scaler.fit_transform(sim_res_jd.reshape(-1, 1)).ravel()
    sim_res_role = scaler.fit_transform(sim_res_role.reshape(-1, 1)).ravel()

    df["ATS_Score"] = (0.7 * sim_res_jd + 0.3 * sim_res_role) * 100

    # 6. Output – "Name" and "decision" are optional, so show them only if present.
    cols_to_show = [
        col for col in ("Name", "Role", "decision", "ATS_Score")
        if col in df.columns
    ]

    print("\n🎯  Results:")
    print(df[cols_to_show].sort_values("ATS_Score", ascending=False).to_string(index=False))

    # Uncomment if you want to save
    # df.to_csv("ats_scores_with_role.csv", index=False)


if __name__ == "__main__":
    main()


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1028)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/Ahmad/nltk_data'
    - '/Users/Ahmad/Desktop/MyProjects/ats-score-model/.venv/nltk_data'
    - '/Users/Ahmad/Desktop/MyProjects/ats-score-model/.venv/share/nltk_data'
    - '/Users/Ahmad/Desktop/MyProjects/ats-score-model/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [44]:
import pandas as pd
import re #to remove punctuations(regex)
import nltk # natural language tool kit
import spacy
from nltk.corpus import stopwords # text preprocessing
from nltk.tokenize import word_tokenize # text preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer # text -> vector
from sklearn.metrics.pairwise import cosine_similarity # compare two vectors how closely they are related




In [45]:
df = pd.read_csv("dataset.csv")

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10174 entries, 0 to 10173
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10174 non-null  object
 1   Name                 10174 non-null  object
 2   Role                 10174 non-null  object
 3   Transcript           10174 non-null  object
 4   Resume               10174 non-null  object
 5   decision             10174 non-null  object
 6   Reason_for_decision  10174 non-null  object
 7   Job_Description      10174 non-null  object
dtypes: object(8)
memory usage: 636.0+ KB


In [47]:
df.head()

Unnamed: 0,ID,Name,Role,Transcript,Resume,decision,Reason_for_decision,Job_Description
0,jasojo159,Jason Jones,E-commerce Specialist,"Interviewer: Good morning, Jason. It's great t...",Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...
1,annma759,Ann Marshall,Game Developer,Interview Scene\n\nA conference room with a ta...,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...
2,patrmc729,Patrick Mcclain,Human Resources Specialist,Interview Setting: A conference room in a medi...,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...
3,patrgr422,Patricia Gray,E-commerce Specialist,Here's a simulated professional interview for ...,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...
4,amangr696,Amanda Gross,E-commerce Specialist,Here's the simulated interview:\n\nInterviewer...,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...


In [48]:

# Tokeniser data: word_tokenize on the installed NLTK release looks up
# 'punkt_tab' (see the LookupError raised by the preprocessing cell), while
# older releases use 'punkt' — fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')  # For lemmatization
nltk.download('averaged_perceptron_tagger')  # For POS tagging

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1028)>


False

In [49]:
# Function to clean text

def clean_text(text):
    """
    Lower-case *text*, strip special characters and digits, drop stop-words.

    Runs of non-word characters become a single space; digits are removed
    outright (no space inserted).  The remaining text is tokenised with
    ``word_tokenize`` and English stop-words are filtered out.

    Returns the surviving tokens joined by single spaces.
    """
    # Build the stop-word set once per call.  The original evaluated
    # stopwords.words('english') for every token and searched it as a list.
    stop_words = set(stopwords.words('english'))

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return ' '.join(
        word for word in word_tokenize(text) if word not in stop_words
    )

In [50]:
import spacy

spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [51]:
# Function for lemmatization
nlp = spacy.load('en_core_web_sm')
def lemmatize_text(text):
    """Return the spaCy lemmas of *text*, joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

In [52]:
# Clean, then lemmatise, the raw Resume and Job_Description columns,
# storing each result in its "*_Clean" counterpart.
for raw_col, clean_col in (
    ("Resume", "Resume_Clean"),
    ("Job_Description", "Job_Desc_Clean"),
):
    df[clean_col] = df[raw_col].astype(str).apply(clean_text).apply(lemmatize_text)


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/Ahmad/nltk_data'
    - '/Users/Ahmad/Desktop/MyProjects/ats-score-model/.venv/nltk_data'
    - '/Users/Ahmad/Desktop/MyProjects/ats-score-model/.venv/share/nltk_data'
    - '/Users/Ahmad/Desktop/MyProjects/ats-score-model/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
df.head()

Unnamed: 0,ID,Name,Role,Transcript,Resume,decision,Reason_for_decision,Job_Description,Resume_Clean,Job_Desc_Clean
0,jasojo159,Jason Jones,E-commerce Specialist,"Interviewer: Good morning, Jason. It's great t...",Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...,professional resume jason jones jason jones e ...,part passionate team forefront machine learn e...
1,annma759,Ann Marshall,Game Developer,Interview Scene\n\nA conference room with a ta...,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...,professional resume ann marshall ann marshall ...,help we build next generation product game dev...
2,patrmc729,Patrick Mcclain,Human Resources Specialist,Interview Setting: A conference room in a medi...,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...,professional resume patrick mcclain patrick mc...,need human resource specialist enhance team te...
3,patrgr422,Patricia Gray,E-commerce Specialist,Here's a simulated professional interview for ...,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...,professional resume patricia gray patricia gra...,part passionate team forefront cloud computing...
4,amangr696,Amanda Gross,E-commerce Specialist,Here's the simulated interview:\n\nInterviewer...,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...,professional resume amanda gross amanda gross ...,look experience e commerce specialist join tea...


In [None]:
# import numpy as np
# import pandas as pd
# import re
# import spacy
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity


# # Initialize TF-IDF Vectorizer
# tfidf_vectorizer = TfidfVectorizer()

# # Fit & transform Resume and Job Description
# tfidf_resume = tfidf_vectorizer.fit_transform(df["Resume_Clean"])
# tfidf_job_desc = tfidf_vectorizer.transform(df["Job_Desc_Clean"])

# # Compute Cosine Similarity
# similarity_scores = cosine_similarity(tfidf_resume, tfidf_job_desc)

# # Extract diagonal values as ATS scores
# df["ATS_Score"] = similarity_scores.diagonal() * 100  # Convert to percentage

# # Display results
# print(df[["Name", "Role", "decision", "ATS_Score"]])

# # # Save results to CSV (optional)
# # df.to_csv("ats_scores.csv", index=False)


                  Name                        Role decision  ATS_Score
0          Jason Jones       E-commerce Specialist   reject   9.226798
1         Ann Marshall              Game Developer   select   5.624799
2      Patrick Mcclain  Human Resources Specialist   reject  16.415510
3        Patricia Gray       E-commerce Specialist   select   7.105350
4         Amanda Gross       E-commerce Specialist   reject   3.949370
...                ...                         ...      ...        ...
10169     Diana Miller             Product Manager   reject  59.163743
10170     Grace Taylor                 UI Engineer   reject  45.243484
10171       Hank Brown                 UI Engineer   select  41.368585
10172     Diana Wilson               Data Engineer   reject  44.630917
10173   Charlie Miller             Product Manager   select  60.052097

[10174 rows x 4 columns]


In [None]:
# import numpy as np
# import pandas as pd
# import re
# import spacy
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity

# # Load the BERT-based sentence transformer model
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Convert Resume & Job Descriptions to embeddings
# resume_embeddings = model.encode(df["Resume_Clean"], convert_to_numpy=True)
# job_desc_embeddings = model.encode(df["Job_Desc_Clean"], convert_to_numpy=True)

# # Compute Cosine Similarity
# similarity_scores = cosine_similarity(resume_embeddings, job_desc_embeddings)

# # Extract diagonal values as ATS scores
# df["ATS_Score"] = similarity_scores.diagonal() * 100  # Convert to percentage

# # Display results
# print(df[["Name", "Role", "decision", "ATS_Score"]])

# # # Save results to CSV (optional)
# # df.to_csv("ats_scores_improved.csv", index=False)


                  Name                        Role decision  ATS_Score
0          Jason Jones       E-commerce Specialist   reject  37.113407
1         Ann Marshall              Game Developer   select  32.107269
2      Patrick Mcclain  Human Resources Specialist   reject  48.492916
3        Patricia Gray       E-commerce Specialist   select  37.091381
4         Amanda Gross       E-commerce Specialist   reject  41.472687
...                ...                         ...      ...        ...
10169     Diana Miller             Product Manager   reject  83.335228
10170     Grace Taylor                 UI Engineer   reject  87.547577
10171       Hank Brown                 UI Engineer   select  81.156395
10172     Diana Wilson               Data Engineer   reject  78.613815
10173   Charlie Miller             Product Manager   select  83.494476

[10174 rows x 4 columns]


In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


# Load the BERT-based sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert Resume, Job Descriptions, and Role to embeddings.
# Plain lists are passed so encoding does not depend on the DataFrame index.
resume_embeddings = model.encode(df["Resume_Clean"].tolist(), convert_to_numpy=True)
job_desc_embeddings = model.encode(df["Job_Desc_Clean"].tolist(), convert_to_numpy=True)
role_embeddings = model.encode(df["Role"].tolist(), convert_to_numpy=True)  # New step for Role


def _rowwise_cosine(left, right):
    """
    Cosine similarity between row i of *left* and row i of *right*.

    Same values as ``cosine_similarity(left, right).diagonal()`` but without
    building the N×N matrix (about 10k × 10k entries for this dataset).
    Pairs containing a zero-norm row get similarity 0.
    """
    dots = np.einsum("ij,ij->i", left, right)
    denom = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    return np.divide(dots, denom, out=np.zeros_like(dots), where=denom != 0)


# Compute Cosine Similarity (each resume against its own JD / role only)
similarity_resume_jd = _rowwise_cosine(resume_embeddings, job_desc_embeddings)
similarity_resume_role = _rowwise_cosine(resume_embeddings, role_embeddings)  # New similarity

# Normalize both similarity scores to 0–1 (relative to this dataset)
scaler = MinMaxScaler()
similarity_resume_jd = scaler.fit_transform(similarity_resume_jd.reshape(-1, 1)).flatten()
similarity_resume_role = scaler.fit_transform(similarity_resume_role.reshape(-1, 1)).flatten()

# Compute the final ATS score (weighted sum)
df["ATS_Score"] = (0.7 * similarity_resume_jd + 0.3 * similarity_resume_role) * 100  # 70-30 weight split

# Display results
print(df[["Name", "Role", "decision", "ATS_Score"]])

# # Save results to CSV (optional)
# df.to_csv("ats_scores_with_role.csv", index=False)


                  Name                        Role decision  ATS_Score
0          Jason Jones       E-commerce Specialist   reject  47.348263
1         Ann Marshall              Game Developer   select  43.961040
2      Patrick Mcclain  Human Resources Specialist   reject  57.735325
3        Patricia Gray       E-commerce Specialist   select  46.574200
4         Amanda Gross       E-commerce Specialist   reject  48.630505
...                ...                         ...      ...        ...
10169     Diana Miller             Product Manager   reject  77.520844
10170     Grace Taylor                 UI Engineer   reject  83.561493
10171       Hank Brown                 UI Engineer   select  77.151260
10172     Diana Wilson               Data Engineer   reject  78.636681
10173   Charlie Miller             Product Manager   select  82.010712

[10174 rows x 4 columns]


In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the BERT-based sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert Resume, Job Descriptions, and Role to embeddings.
# Plain lists are passed so encoding does not depend on the DataFrame index.
resume_embeddings = model.encode(df["Resume_Clean"].tolist(), convert_to_numpy=True)
job_desc_embeddings = model.encode(df["Job_Desc_Clean"].tolist(), convert_to_numpy=True)
role_embeddings = model.encode(df["Role"].tolist(), convert_to_numpy=True)


def _rowwise_cosine(left, right):
    """
    Cosine similarity between row i of *left* and row i of *right*.

    Same values as ``cosine_similarity(left, right).diagonal()`` but without
    building the N×N matrix.  Pairs containing a zero-norm row get 0.
    """
    dots = np.einsum("ij,ij->i", left, right)
    denom = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    return np.divide(dots, denom, out=np.zeros_like(dots), where=denom != 0)


# Compute Cosine Similarity (each resume against its own JD / role only)
similarity_resume_jd = _rowwise_cosine(resume_embeddings, job_desc_embeddings)
similarity_resume_role = _rowwise_cosine(resume_embeddings, role_embeddings)

# Normalize scores to 0–1 (relative to this dataset)
scaler = MinMaxScaler()
similarity_resume_jd = scaler.fit_transform(similarity_resume_jd.reshape(-1, 1)).flatten()
similarity_resume_role = scaler.fit_transform(similarity_resume_role.reshape(-1, 1)).flatten()

# Compute final ATS score (weighted sum, 80 % JD match + 20 % role match)
df["ATS_Score"] = (0.8 * similarity_resume_jd + 0.2 * similarity_resume_role) * 100

# 🔹 Extract Important Keywords using TF-IDF
# The vectorizer is fitted on the cleaned job descriptions only; the cleaned
# resumes are then projected onto that same (at most 10-term) vocabulary.
# NOTE(review): in the visible notebook, job_desc_keywords / resume_keywords
# are not read again — only the commented-out generate_suggestions() below
# uses the fitted vectorizer. Confirm whether this step is still needed.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10)  # Top 10 keywords
job_desc_keywords = vectorizer.fit_transform(df["Job_Desc_Clean"])
resume_keywords = vectorizer.transform(df["Resume_Clean"])

# # Generate suggestions
# def generate_suggestions(index):
#     missing_keywords = set(vectorizer.get_feature_names_out()) - set(df["Resume_Clean"][index].split())
#     suggestions = []
    
#     if similarity_resume_jd[index] < 0.6:  # If job description match is low
#         suggestions.append("Include more relevant experience matching the job description.")
    
#     if similarity_resume_role[index] < 0.5:  # If role match is low
#         suggestions.append(f"Tailor your resume for the role '{df['Role'][index]}'.")

#     if missing_keywords:
#         suggestions.append(f"Consider adding these skills/keywords: {', '.join(missing_keywords)}.")

#     return " ".join(suggestions) if suggestions else "Your resume is well-matched!"

# df["Suggestions"] = df.index.map(generate_suggestions)

# Display results
# print(df[["Name", "Role", "ATS_Score", "Suggestions"]])
print(df[["Name", "Role", "ATS_Score"]])

# # Save results to CSV (optional)
# df.to_csv("ats_scores_with_suggestions.csv", index=False)


                  Name                        Role  ATS_Score
0          Jason Jones       E-commerce Specialist  44.386967
1         Ann Marshall              Game Developer  40.343868
2      Patrick Mcclain  Human Resources Specialist  55.369049
3        Patricia Gray       E-commerce Specialist  43.863071
4         Amanda Gross       E-commerce Specialist  46.796097
...                ...                         ...        ...
10169     Diana Miller             Product Manager  80.982475
10170     Grace Taylor                 UI Engineer  86.511490
10171       Hank Brown                 UI Engineer  79.959213
10172     Diana Wilson               Data Engineer  80.042938
10173   Charlie Miller             Product Manager  84.032494

[10174 rows x 3 columns]


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10174 entries, 0 to 10173
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   10174 non-null  object 
 1   Name                 10174 non-null  object 
 2   Role                 10174 non-null  object 
 3   Transcript           10174 non-null  object 
 4   Resume               10174 non-null  object 
 5   decision             10174 non-null  object 
 6   Reason_for_decision  10174 non-null  object 
 7   Job_Description      10174 non-null  object 
 8   Resume_Clean         10174 non-null  object 
 9   Job_Desc_Clean       10174 non-null  object 
 10  ATS_Score            10174 non-null  float32
dtypes: float32(1), object(10)
memory usage: 834.7+ KB


In [None]:
df = df.drop(columns=["ID", "Name", "Transcript", "Resume", "decision", "Reason_for_decision", "Job_Description"])


In [None]:
df.head()

Unnamed: 0,Role,Resume_Clean,Job_Desc_Clean,ATS_Score
0,E-commerce Specialist,professional resume jason jones jason jones e ...,part passionate team forefront machine learn e...,44.386967
1,Game Developer,professional resume ann marshall ann marshall ...,help we build next generation product game dev...,40.343868
2,Human Resources Specialist,professional resume patrick mcclain patrick mc...,need human resource specialist enhance team te...,55.369049
3,E-commerce Specialist,professional resume patricia gray patricia gra...,part passionate team forefront cloud computing...,43.863071
4,E-commerce Specialist,professional resume amanda gross amanda gross ...,look experience e commerce specialist join tea...,46.796097


In [None]:
# Summarise the Role column: distinct values, absolute frequencies and
# percentage shares.
# NOTE(review): the printed output lists e.g. "Data Engineer" and
# "data engineer" as separate roles, so the counts are case-sensitive.
roles = df['Role']

unique_roles = roles.unique()          # array of distinct role labels
num_unique_roles = roles.nunique()     # how many distinct labels there are
role_counts = roles.value_counts()     # frequency of each label, descending
role_percentage = roles.value_counts(normalize=True) * 100  # share in percent

# Report
print(f"Number of unique roles: {num_unique_roles}")
print("\nTop 10 most common roles:")
print(role_counts.head(10))

print("\nPercentage distribution (top 10):")
print(role_percentage.head(10))

Number of unique roles: 45

Top 10 most common roles:
Role
Data Scientist       538
Software Engineer    480
Product Manager      458
Data Engineer        447
UI Engineer          375
Data Analyst         329
data engineer        307
software engineer    307
product manager      303
data scientist       287
Name: count, dtype: int64

Percentage distribution (top 10):
Role
Data Scientist       5.287989
Software Engineer    4.717908
Product Manager      4.501671
Data Engineer        4.393552
UI Engineer          3.685866
Data Analyst         3.233733
data engineer        3.017496
software engineer    3.017496
product manager      2.978180
data scientist       2.820916
Name: proportion, dtype: float64


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Load pre-trained BERT model
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert Resume & Job Description to BERT embeddings.
# Each column is encoded in ONE batched call; the original invoked
# bert_model.encode() once per row through Series.apply, which forfeits
# batching for every one of the ~10k rows.
X_resumes = bert_model.encode(df["Resume_Clean"].tolist(), convert_to_numpy=True)
X_job_descs = bert_model.encode(df["Job_Desc_Clean"].tolist(), convert_to_numpy=True)

# Keep the per-row embedding columns on df, as the original cell did.
df["Resume_Embeddings"] = pd.Series(list(X_resumes), index=df.index)
df["Job_Desc_Embeddings"] = pd.Series(list(X_job_descs), index=df.index)

# Encode "Role" column
# NOTE(review): the role listing printed earlier shows case variants
# ("Data Engineer" / "data engineer") as distinct labels, so they receive
# different codes here. Normalising would change the saved encoder's classes,
# so it is left as-is — confirm the intended behaviour.
role_encoder = LabelEncoder()
df["Role_Encoded"] = role_encoder.fit_transform(df["Role"])

# Stack Resume, Job Description, and Role together
X = np.hstack((X_resumes, X_job_descs, df["Role_Encoded"].values.reshape(-1, 1)))
y = df["ATS_Score"].values  # Target variable
# NOTE(review): ATS_Score was itself computed from embeddings of these same
# text columns, so the R² below measures how well the regressor reproduces
# that formula rather than agreement with an independent label.

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost Model
xgb_model = xgb.XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6)
xgb_model.fit(X_train, y_train)

# Predictions
y_pred = xgb_model.predict(X_test)

# Model Performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("✅ Model Trained Successfully!")
print(f"📊 MSE: {mse:.4f}")
print(f"📈 R² Score: {r2:.4f}")


✅ Model Trained Successfully!
📊 MSE: 9.8282
📈 R² Score: 0.9379


# Save The Model

In [None]:
import joblib
# Persist the trained XGBoost regressor to ats_model.pkl in the working directory.
joblib.dump(xgb_model, "ats_model.pkl")  
# Persist the fitted LabelEncoder too; presumably inference code must reuse it
# to map Role strings to the same integer codes — TODO confirm against the loader.
joblib.dump(role_encoder, "role_encoder.pkl")


['role_encoder.pkl']

In [None]:
# 👉 Higher similarity scores mean a resume is a better match for the job.

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# # Compute similarity between Resume and Job Description
# similarity_scores = cosine_similarity(resume_tfidf, job_desc_tfidf)

# # Convert similarity matrix to DataFrame
# df["Resume_Job_Similarity"] = [similarity_scores[i, i] for i in range(len(df))]

# # View similarity scores
# print(df[["Name", "Role", "Resume_Job_Similarity"]].head())


              Name                        Role  Resume_Job_Similarity
0      Jason Jones       E-commerce Specialist               0.122216
1     Ann Marshall              Game Developer               0.061060
2  Patrick Mcclain  Human Resources Specialist               0.164754
3    Patricia Gray       E-commerce Specialist               0.093921
4     Amanda Gross       E-commerce Specialist               0.056634


In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report

# # Encode target variable (convert "select" and "reject" to 1 and 0)
# df["decision"] = df["decision"].map({"select": 1, "reject": 0})

# # Select features and target
# X = df[["Resume_Job_Similarity"]]  # You can add more features later
# y = df["decision"]

# # Split into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train a model (Random Forest)
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# # Make predictions
# y_pred = model.predict(X_test)

# # Evaluate model performance
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))


Accuracy: 0.48697788697788696
              precision    recall  f1-score   support

           0       0.49      0.47      0.48      1034
           1       0.48      0.51      0.49      1001

    accuracy                           0.49      2035
   macro avg       0.49      0.49      0.49      2035
weighted avg       0.49      0.49      0.49      2035



In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Initialize TF-IDF Vectorizer
# tfidf_vectorizer = TfidfVectorizer()

# # Fit & transform Resume and Job Description
# tfidf_resume = tfidf_vectorizer.fit_transform(df["Resume_Clean"])
# tfidf_job_desc = tfidf_vectorizer.transform(df["Job_Desc_Clean"])

# # Compute Cosine Similarity
# similarity_scores = cosine_similarity(tfidf_resume, tfidf_job_desc)

# # Normalize similarity score to a percentage (0-100)
# df["ATS_Score"] = similarity_scores.diagonal() * 100

# # Display results
# print(df[["Name", "Role", "decision", "ATS_Score"]].head())


              Name                        Role  decision  ATS_Score
0      Jason Jones       E-commerce Specialist         0   9.226798
1     Ann Marshall              Game Developer         1   5.624799
2  Patrick Mcclain  Human Resources Specialist         0  16.415510
3    Patricia Gray       E-commerce Specialist         1   7.105350
4     Amanda Gross       E-commerce Specialist         0   3.949370


In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Combine resume and job description text for vectorization
# all_text = df["Resume_Clean"].tolist() + df["Job_Desc_Clean"].tolist()

# # Initialize TF-IDF Vectorizer on combined text
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit features for better accuracy
# tfidf_vectorizer.fit(all_text)

# # Transform resumes and job descriptions
# tfidf_resume = tfidf_vectorizer.transform(df["Resume_Clean"])
# tfidf_job_desc = tfidf_vectorizer.transform(df["Job_Desc_Clean"])

# # Compute Cosine Similarity
# similarity_scores = cosine_similarity(tfidf_resume, tfidf_job_desc)

# # Normalize similarity score to percentage (0-100)
# df["ATS_Score"] = similarity_scores.diagonal() * 100

# # Display results
# print(df[["Name", "Role", "decision", "ATS_Score"]].head())


In [None]:
# df.head()

Unnamed: 0,ID,Name,Role,Transcript,Resume,decision,Reason_for_decision,Job_Description,Resume_Clean,Job_Desc_Clean
0,jasojo159,Jason Jones,E-commerce Specialist,"Interviewer: Good morning, Jason. It's great t...",Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...,professional resume jason jones jason jones e ...,part passionate team forefront machine learn e...
1,annma759,Ann Marshall,Game Developer,Interview Scene\n\nA conference room with a ta...,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...,professional resume ann marshall ann marshall ...,help we build next generation product game dev...
2,patrmc729,Patrick Mcclain,Human Resources Specialist,Interview Setting: A conference room in a medi...,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...,professional resume patrick mcclain patrick mc...,need human resource specialist enhance team te...
3,patrgr422,Patricia Gray,E-commerce Specialist,Here's a simulated professional interview for ...,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...,professional resume patricia gray patricia gra...,part passionate team forefront cloud computing...
4,amangr696,Amanda Gross,E-commerce Specialist,Here's the simulated interview:\n\nInterviewer...,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...,professional resume amanda gross amanda gross ...,look experience e commerce specialist join tea...


In [None]:


# ## 🔹 Step 1: Train TF-IDF on Resume + Job Description ##
# all_text = df["Resume_Clean"].tolist() + df["Job_Desc_Clean"].tolist()
# tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# tfidf_vectorizer.fit(all_text)

# # Transform resumes and job descriptions
# tfidf_resume = tfidf_vectorizer.transform(df["Resume_Clean"])
# tfidf_job_desc = tfidf_vectorizer.transform(df["Job_Desc_Clean"])

# # Compute TF-IDF Cosine Similarity
# tfidf_similarity_scores = cosine_similarity(tfidf_resume, tfidf_job_desc).diagonal()

# ## 🔹 Step 2: Compute Weighted Keyword Matching ##
# idf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))
# important_words = sorted(idf_scores, key=idf_scores.get, reverse=True)[:50]  # Top 50 keywords

# def weighted_keyword_match(resume, job_desc):
#     resume_words = set(resume.split())
#     job_words = set(job_desc.split())
#     common_words = resume_words.intersection(job_words)
#     return sum(idf_scores.get(word, 0) for word in common_words) / len(important_words)

# df["Weighted_Match_Score"] = df.apply(lambda row: weighted_keyword_match(row["Resume_Clean"], row["Job_Desc_Clean"]), axis=1)

# ## 🔹 Step 3: Use BERT for Semantic Similarity ##
# model = SentenceTransformer('all-MiniLM-L6-v2')

# resume_embeddings = model.encode(df["Resume_Clean"].tolist(), convert_to_tensor=True)
# job_desc_embeddings = model.encode(df["Job_Desc_Clean"].tolist(), convert_to_tensor=True)

# bert_similarity_scores = cosine_similarity(resume_embeddings.cpu(), job_desc_embeddings.cpu()).diagonal()

# ## 🔹 Step 4: Combine All Scores ##
# df["ATS_Score"] = (0.4 * tfidf_similarity_scores) + (0.3 * df["Weighted_Match_Score"]) + (0.3 * bert_similarity_scores)
# df["ATS_Score"] = df["ATS_Score"] * 100  # Normalize to percentage scale

# # Display Results
# print(df[["Name", "Role", "decision", "ATS_Score"]].head())



              Name                        Role decision  ATS_Score
0      Jason Jones       E-commerce Specialist   reject  23.335724
1     Ann Marshall              Game Developer   select  16.846628
2  Patrick Mcclain  Human Resources Specialist   reject  31.382445
3    Patricia Gray       E-commerce Specialist   select  20.211019
4     Amanda Gross       E-commerce Specialist   reject  23.006243


In [43]:
from sklearn.preprocessing import MinMaxScaler

# 🔹 **TF-IDF Feature Extraction with Improved Parameters**
# Imported locally so this cell does not rely on a later cell having been run.
from sklearn.feature_extraction.text import TfidfVectorizer

# Fail fast with an actionable message instead of a bare KeyError deep inside
# pandas when the cleaned-text columns have not been created yet.
_missing_cols = [c for c in ("Resume_Clean", "Job_Desc_Clean") if c not in df.columns]
if _missing_cols:
    raise KeyError(
        f"df is missing cleaned-text column(s) {_missing_cols}; "
        "create them before running this cell"
    )

# Fit one vocabulary on resumes + job descriptions so both sides are projected
# into the same feature space (uni- and bi-grams seen in at least 5 documents).
tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=5, ngram_range=(1, 2))
tfidf_vectorizer.fit(df["Resume_Clean"].tolist() + df["Job_Desc_Clean"].tolist())

tfidf_resume = tfidf_vectorizer.transform(df["Resume_Clean"])
tfidf_job_desc = tfidf_vectorizer.transform(df["Job_Desc_Clean"])

# Only the paired similarity (row i resume vs. row i job description) is used.
# TfidfVectorizer L2-normalises each row by default, so the row-wise dot
# product equals the cosine similarity and avoids building the dense N x N
# matrix that cosine_similarity(...).diagonal() would allocate.
tfidf_similarity_scores = np.asarray(
    tfidf_resume.multiply(tfidf_job_desc).sum(axis=1)
).ravel()

# 🔹 **Weighted Keyword Matching (Domain-Specific Enhancement)**
# Map every vocabulary term to its learned IDF weight.
idf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

def weighted_keyword_match(resume, job_desc, idf=None):
    """Return the mean IDF weight of the words shared by two texts.

    Both texts are split on whitespace and compared as sets of tokens, so a
    repeated word counts once and only single-word terms can match.

    Args:
        resume: Cleaned resume text. Non-string values (e.g. ``None`` or a
            NaN from an empty cell) are treated as empty text.
        job_desc: Cleaned job-description text; handled like ``resume``.
        idf: Optional mapping of term -> IDF weight. When omitted, the
            module-level ``idf_scores`` mapping is used.

    Returns:
        The sum of the IDF weights of the shared words divided by the number
        of shared words. Shared words missing from the mapping contribute 0.
        The result is 0.0 when the texts share no words.
    """
    weights = idf_scores if idf is None else idf
    resume_words = set(resume.split()) if isinstance(resume, str) else set()
    job_words = set(job_desc.split()) if isinstance(job_desc, str) else set()
    common_words = resume_words & job_words
    # max(1, ...) keeps the division defined when there is no overlap.
    return sum(weights.get(word, 0) for word in common_words) / max(1, len(common_words))

# Per-row keyword overlap score between the cleaned resume and job description.
df["Weighted_Match_Score"] = df.apply(
    lambda row: weighted_keyword_match(row["Resume_Clean"], row["Job_Desc_Clean"]),
    axis=1,
)

# 🔹 **BERT Embeddings with Normalization**
model = SentenceTransformer('all-MiniLM-L6-v2')

resume_embeddings = model.encode(df["Resume_Clean"].tolist(), normalize_embeddings=True)
job_desc_embeddings = model.encode(df["Job_Desc_Clean"].tolist(), normalize_embeddings=True)

# The embeddings are unit-length (normalize_embeddings=True), so the row-wise
# dot product IS the cosine similarity of each (resume, job description) pair.
# This avoids allocating the full N x N matrix only to read its diagonal.
bert_similarity_scores = (resume_embeddings * job_desc_embeddings).sum(axis=1)

# 🔹 **Normalize All Scores Before Combining**
# Each signal is min-max scaled independently over the rows of this df, so the
# resulting scores are relative to the current batch, not absolute.
scaler = MinMaxScaler()
df["TFIDF_Score"] = scaler.fit_transform(tfidf_similarity_scores.reshape(-1, 1)).flatten()
df["Weighted_Score"] = scaler.fit_transform(df["Weighted_Match_Score"].values.reshape(-1, 1)).flatten()
df["BERT_Score"] = scaler.fit_transform(bert_similarity_scores.reshape(-1, 1)).flatten()

# 🔹 **Final ATS Score Calculation (Adjusted Weights)**
# Weighted blend: 50 % semantic (BERT), 30 % TF-IDF, 20 % keyword overlap.
df["ATS_Score"] = (0.5 * df["BERT_Score"]) + (0.3 * df["TFIDF_Score"]) + (0.2 * df["Weighted_Score"])
df["ATS_Score"] = df["ATS_Score"] * 100  # Convert to percentage scale

# 🔹 **Display Results**
print(df[["Name", "Role", "decision", "ATS_Score"]].head())


KeyError: 'Resume_Clean'

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


# ✅ **TF-IDF With Feature Selection**
# One shared vocabulary (1- to 3-grams seen in at least 3 documents) fitted on
# resumes + job descriptions so both sides live in the same feature space.
tfidf_vectorizer = TfidfVectorizer(max_features=7000, min_df=3, ngram_range=(1, 3))
tfidf_vectorizer.fit(df["Resume_Clean"].tolist() + df["Job_Desc_Clean"].tolist())

tfidf_resume = tfidf_vectorizer.transform(df["Resume_Clean"])
tfidf_job_desc = tfidf_vectorizer.transform(df["Job_Desc_Clean"])
# Only the paired (row i vs. row i) similarity is needed. The element-wise
# product summed per row yields exactly the diagonal of
# tfidf_resume @ tfidf_job_desc.T without materialising the N x N product.
tfidf_similarity_scores = np.asarray(
    tfidf_resume.multiply(tfidf_job_desc).sum(axis=1)
).ravel()

# ✅ **BERT-Based Sentence Similarity (Cosine via Dot Product)**
model = SentenceTransformer('all-MiniLM-L6-v2')

resume_embeddings = model.encode(df["Resume_Clean"].tolist(), normalize_embeddings=True)
job_desc_embeddings = model.encode(df["Job_Desc_Clean"].tolist(), normalize_embeddings=True)

# Embeddings are unit-length, so this row-wise dot product equals the cosine
# similarity of each (resume, job description) pair.
bert_similarity_scores = (resume_embeddings * job_desc_embeddings).sum(axis=1)

# ✅ **Cross-Encoder for Improved Semantic Matching**
# Scores each (resume, job description) pair jointly rather than comparing
# independently computed embeddings.
# NOTE(review): long resumes are presumably truncated to the model's maximum
# sequence length - confirm this is acceptable for scoring.
cross_encoder = CrossEncoder('cross-encoder/stsb-TinyBERT-L-4')
cross_inputs = [[r, j] for r, j in zip(df["Resume_Clean"], df["Job_Desc_Clean"])]
cross_scores = cross_encoder.predict(cross_inputs)

# ✅ **Scaling All Scores**
# Each signal is min-max scaled independently over the rows of this df, so the
# resulting scores are relative to the current batch.
scaler = MinMaxScaler()
df["TFIDF_Score"] = scaler.fit_transform(tfidf_similarity_scores.reshape(-1, 1)).flatten()
df["BERT_Score"] = scaler.fit_transform(bert_similarity_scores.reshape(-1, 1)).flatten()
df["Cross_Score"] = scaler.fit_transform(np.array(cross_scores).reshape(-1, 1)).flatten()

# ✅ **Final ATS Score Calculation (Cross-Encoder Weighted Higher)**
# Weighted blend: 60 % cross-encoder, 30 % bi-encoder cosine, 10 % TF-IDF.
df["ATS_Score"] = (0.6 * df["Cross_Score"]) + (0.3 * df["BERT_Score"]) + (0.1 * df["TFIDF_Score"])
df["ATS_Score"] = df["ATS_Score"] * 100  # Convert to percentage

# ✅ **Display Results**
print(df[["Name", "Role", "decision", "ATS_Score"]].head())


In [42]:
import joblib

# Load the trained ATS model
# NOTE(review): the recorded run of this cell ended with
# "ModuleNotFoundError: No module named 'xgboost'" - presumably the pickled
# estimator is an XGBoost model, so xgboost must be installed before loading.
# SECURITY: joblib.load unpickles arbitrary Python objects; only load files
# from a trusted source.
ats_model = joblib.load("ats_model.pkl")

# Load the role encoder
role_encoder = joblib.load("role_encoder.pkl")


# Encode a single example role with the loaded encoder.
role_encoded = role_encoder.transform(["Software Engineer"])  # Example role encoding
# NOTE(review): resume_vectorized and job_desc_vectorized are not assigned
# anywhere in this cell - they must be produced with the same vectoriser(s)
# used at training time before this line can run.
# NOTE(review): role_encoded is the encoder's output for a one-element list;
# nesting it with the two vectors inside [[...]] looks like it yields a ragged
# row rather than a flat feature vector - confirm the feature layout the model
# was trained on.
predicted_ats_score = ats_model.predict([[role_encoded, resume_vectorized, job_desc_vectorized]])


ModuleNotFoundError: No module named 'xgboost'