In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ats-score-preprocessed-dataset/preprocessed_ats_dataset.csv


In [11]:
# ─── Imports ─────────────────────────────────────────────────
import pandas as pd, numpy as np, joblib, re, pathlib, shutil
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
import spacy

# ─── Config ─────────────────────────────────────────────────
DATA_PATH = "/kaggle/input/ats-score-preprocessed-dataset/preprocessed_ats_dataset.csv"   # update accordingly
OUT_DIR   = pathlib.Path("/kaggle/working")
MODEL_DIR = OUT_DIR / "miniLM_model"

# ─── Load spaCy ─────────────────────────────────────────────
try:
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
except:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

SPACY_STOP = nlp.Defaults.stop_words
RE_NONWORD = re.compile(r"\W+")
RE_DIGITS  = re.compile(r"\d+")

def preprocess(text):
    text = RE_DIGITS.sub(" ", RE_NONWORD.sub(" ", str(text).lower()))
    doc  = nlp(text)
    return " ".join(tok.lemma_ for tok in doc if tok.lemma_ not in SPACY_STOP and len(tok) > 1)

# ─── Load Dataset ───────────────────────────────────────────
df = pd.read_csv(DATA_PATH)
required_cols = {"Resume", "Job_Description", "Role"}
assert required_cols.issubset(df.columns), f"Missing: {required_cols - set(df.columns)}"
print(f"✅ Loaded {len(df)} rows")

# ─── Preprocess ─────────────────────────────────────────────
print("🔄 Preprocessing…")
df["Resume_Clean"] = df["Resume"].map(preprocess)
df["JD_Clean"]     = df["Job_Description"].map(preprocess)
df["Role_Clean"]   = df["Role"].map(preprocess)

# ─── Train/Test Split ───────────────────────────────────────
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"🔀 Train: {len(train_df)} | Test: {len(test_df)}")

# ─── Embedding Model ────────────────────────────────────────
print("🔄 Embedding using Sentence-BERT")
sbert = SentenceTransformer("paraphrase-MiniLM-L3-v2")

def compute_score(df_part):
    res_vec  = sbert.encode(df_part["Resume_Clean"].tolist(), convert_to_numpy=True, show_progress_bar=True)
    jd_vec   = sbert.encode(df_part["JD_Clean"].tolist(),     convert_to_numpy=True, show_progress_bar=True)
    role_vec = sbert.encode(df_part["Role_Clean"].tolist(),   convert_to_numpy=True, show_progress_bar=True)

    sim_rj = cosine_similarity(res_vec, jd_vec).diagonal()
    sim_rr = cosine_similarity(res_vec, role_vec).diagonal()

    return sim_rj, sim_rr

sim_rj_train, sim_rr_train = compute_score(train_df)
scaler_rj = MinMaxScaler()
scaler_rr = MinMaxScaler()

sim_rj_train_scaled = scaler_rj.fit_transform(sim_rj_train.reshape(-1, 1)).ravel()
sim_rr_train_scaled = scaler_rr.fit_transform(sim_rr_train.reshape(-1, 1)).ravel()
train_df["ATS_Score"] = np.round((0.7 * sim_rj_train_scaled + 0.3 * sim_rr_train_scaled) * 100, 2)

# ─── Use same scalers on test data ──────────────────────────
sim_rj_test, sim_rr_test = compute_score(test_df)
sim_rj_test_scaled = scaler_rj.transform(sim_rj_test.reshape(-1, 1)).ravel()
sim_rr_test_scaled = scaler_rr.transform(sim_rr_test.reshape(-1, 1)).ravel()
test_df["ATS_Score"] = np.round((0.7 * sim_rj_test_scaled + 0.3 * sim_rr_test_scaled) * 100, 2)

# ─── Save complete scored dataset ───────────────────────────
scored_df = pd.concat([train_df, test_df], axis=0)
scored_df.to_csv(OUT_DIR / "ats_scored.csv", index=False)

# ─── Regression Metrics ─────────────────────────────────────
# [Optional] Simulated "ground truth" score for testing purpose
# You can remove this if you plan to manually annotate or use real scores later
test_df["True_Score"] = test_df["ATS_Score"] + np.random.normal(0, 3, len(test_df))  # simulate some label noise

mae  = mean_absolute_error(test_df["True_Score"], test_df["ATS_Score"])
rmse = mean_squared_error(test_df["True_Score"], test_df["ATS_Score"], squared=False)
r2   = r2_score(test_df["True_Score"], test_df["ATS_Score"])

print(f"\n📊 Regression Metrics on Test (simulated ground truth):")
print(f"MAE : {mae:.2f}   RMSE : {rmse:.2f}   R² : {r2:.4f}")

# ─── Save Artifacts ─────────────────────────────────────────
print("💾 Saving artifacts …")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
sbert.save(str(MODEL_DIR))
joblib.dump({"rj": scaler_rj, "rr": scaler_rr}, "minmax_scaler.pkl")

shutil.make_archive(str(MODEL_DIR), "zip", str(MODEL_DIR))

print("✅ All done.")
print(scored_df[["Name", "Role", "ATS_Score"]].sample(5).to_string(index=False))


✅ Loaded 10171 rows
🔄 Preprocessing…
🔀 Train: 8136 | Test: 2035
🔄 Embedding using Sentence-BERT


Batches:   0%|          | 0/255 [00:00<?, ?it/s]

Batches:   0%|          | 0/255 [00:00<?, ?it/s]

Batches:   0%|          | 0/255 [00:00<?, ?it/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

Batches:   0%|          | 0/64 [00:00<?, ?it/s]


📊 Regression Metrics on Test (simulated ground truth):
MAE : 2.45   RMSE : 3.05   R² : 0.9481
💾 Saving artifacts …
✅ All done.
            Name                 Role  ATS_Score
     Fanish Basu      Project Manager  44.759998
    saanvi kohli         data analyst  38.060001
      niraj sahu    software engineer  39.840000
    pratiti iyer        data engineer  53.320000
Vincent Williams Mobile App Developer  42.189999


In [10]:
import joblib
import re
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ── Load required models
sbert = SentenceTransformer("/kaggle/working/miniLM_model")
scaler_rj = joblib.load("/kaggle/working/scaler_rj.pkl")
scaler_rr = joblib.load("/kaggle/working/scaler_rr.pkl")

# ── spaCy init
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
SPACY_STOP = nlp.Defaults.stop_words
RE_NONWORD = re.compile(r"\W+")
RE_DIGITS  = re.compile(r"\d+")

def preprocess(text: str) -> str:
    text = RE_DIGITS.sub(" ", RE_NONWORD.sub(" ", text.lower()))
    doc = nlp(text)
    return " ".join(tok.lemma_ for tok in doc if len(tok) > 1 and tok.lemma_ not in SPACY_STOP)

# ── YOUR INPUT
resume = """Skilled MERN Stack Developer with experience in React, Node.js, Express, MongoDB, and NLP projects. Built ATS resume checker using ML and Gemini LLM."""
job_description = """We are looking for a Full-Stack Developer proficient in React, Node.js, and modern AI/NLP tools to build web-based ML products."""
role = "Full Stack Developer"

# ── Preprocess
res_clean = preprocess(resume)
jd_clean  = preprocess(job_description)
role_clean = preprocess(role)

# ── Encode using SBERT
res_vec, jd_vec, role_vec = sbert.encode(
    [res_clean, jd_clean, role_clean], convert_to_numpy=True
)

# ── Compute similarities
sim_rj = cosine_similarity([res_vec], [jd_vec])[0][0]
sim_rr = cosine_similarity([res_vec], [role_vec])[0][0]

# ── Scale similarities
sim_rj_scaled = scaler_rj.transform([[sim_rj]])[0][0]
sim_rr_scaled = scaler_rr.transform([[sim_rr]])[0][0]

# ── Final ATS score
ats_score = round((0.7 * sim_rj_scaled + 0.3 * sim_rr_scaled) * 100, 2)

# ── Output
print("🎯 ATS Score:", ats_score)
print("Similarity (Resume ↔ JD):", round(sim_rj, 4))
print("Similarity (Resume ↔ Role):", round(sim_rr, 4))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

🎯 ATS Score: 73.9
Similarity (Resume ↔ JD): 0.6019
Similarity (Resume ↔ Role): 0.4788
