<a href="https://colab.research.google.com/github/abhishekkpatell/DEMONS/blob/main/job_role_prediction_with_serpapi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Job Role Prediction Notebook (SerpAPI + Transformer)

This Colab-ready notebook will:

1. Install dependencies
2. Optionally fetch real job postings using **SerpAPI** (you'll need a SerpAPI key)
3. Or use the included synthetic fallback dataset `jobs_dataset.csv`
4. Embed job descriptions with a sentence-transformer model and train a logistic-regression classifier on those embeddings to predict job roles/categories
5. Evaluate and save the model

**How to use:** Upload this notebook to Google Colab. If you have a SerpAPI key, paste it when prompted to fetch real job postings. Otherwise the notebook will use the included dataset.


In [1]:
# Install dependencies (run in Colab)
!pip install -q sentence-transformers scikit-learn pandas joblib serpapi tqdm
!pip install google-search-results



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
from tqdm import tqdm

import shutil

# Location the rest of this cell reads the dataset from (Colab runtime path).
DATA_PATH = '/content/jobs_dataset.csv'
# CSV shipped alongside the notebook; relative to the current working directory.
LOCAL_INCLUDED = 'jobs_dataset.csv'
print('Local included dataset path:', LOCAL_INCLUDED)

# Copy the included dataset into the runtime location when it is not already there.
# The copy is skipped when both paths point at the same file (e.g. the working
# directory is /content) and never overwrites an existing dataset.
try:
    if os.path.exists(LOCAL_INCLUDED):
        print('Found included dataset at', LOCAL_INCLUDED)
        same_file = os.path.abspath(LOCAL_INCLUDED) == os.path.abspath(DATA_PATH)
        if not same_file and not os.path.exists(DATA_PATH):
            shutil.copyfile(LOCAL_INCLUDED, DATA_PATH)
except OSError as e:
    # Typical cause: the target directory does not exist outside Colab.
    print('Copy step skipped:', e)

if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)
    print('Loaded dataset with', len(df), 'rows. Sample:')
    display(df.head())
else:
    print('No dataset found in runtime yet; the notebook will fetch via SerpAPI if you provide a key, or generate a fallback dataset later.')

Local included dataset path: jobs_dataset.csv
Found included dataset at jobs_dataset.csv
Loaded dataset with 400 rows. Sample:


Unnamed: 0,job_title,description,category
0,DevOps Engineer - Lead,"DevOps Engineer role. Required: Kubernetes, mo...",DevOps Engineer
1,ML Engineer - Senior,"ML Engineer role. Required: TensorFlow, PyTorc...",ML Engineer
2,Python Developer - II,"Python Developer role. Required: PostgreSQL, F...",Python Developer
3,SDE - Senior,"SDE role. Required: Java, REST APIs, Node.js. ...",SDE
4,Python Developer - I,"Python Developer role. Required: Django, Pytho...",Python Developer


In [3]:
import os
import pandas as pd
from tqdm import tqdm

# Central path configuration: every artifact lives under BASE_DIR, which can be
# redirected through the WORK_DIR environment variable (defaults to the current
# working directory).
BASE_DIR = os.environ.get("WORK_DIR", os.getcwd())
DATA_PATH, MODEL_PATH, LE_PATH = (
    os.path.join(BASE_DIR, filename)
    for filename in ("jobs_dataset.csv", "job_role_classifier.joblib", "label_encoder.joblib")
)

print(f"BASE_DIR: {BASE_DIR}")
print(f"DATA_PATH: {DATA_PATH}")
print(f"MODEL_PATH: {MODEL_PATH}")

# Load the cached dataset when present; later cells create it otherwise.
if not os.path.exists(DATA_PATH):
    print("No dataset found at", DATA_PATH, "— will try SerpAPI or synthetic fallback next.")
else:
    df = pd.read_csv(DATA_PATH)
    print("Loaded dataset with", len(df), "rows.")
    display(df.head())


BASE_DIR: /content
DATA_PATH: /content/jobs_dataset.csv
MODEL_PATH: /content/job_role_classifier.joblib
Loaded dataset with 400 rows.


Unnamed: 0,job_title,description,category
0,DevOps Engineer - Lead,"DevOps Engineer role. Required: Kubernetes, mo...",DevOps Engineer
1,ML Engineer - Senior,"ML Engineer role. Required: TensorFlow, PyTorc...",ML Engineer
2,Python Developer - II,"Python Developer role. Required: PostgreSQL, F...",Python Developer
3,SDE - Senior,"SDE role. Required: Java, REST APIs, Node.js. ...",SDE
4,Python Developer - I,"Python Developer role. Required: Django, Pytho...",Python Developer


In [10]:
import os
from getpass import getpass

# SECURITY: never store an API key in the notebook itself. A key was previously
# committed in this cell; treat it as compromised and revoke/rotate it in the
# SerpAPI dashboard.
# The key is read from the SERPAPI_KEY environment variable, falling back to a
# hidden prompt. Leave the prompt blank to continue without SerpAPI.
SERPAPI_KEY = os.environ.get("SERPAPI_KEY", "").strip()
if not SERPAPI_KEY:
    try:
        SERPAPI_KEY = getpass("SerpAPI key (leave blank to skip): ").strip()
    except Exception as e:
        # No interactive input available (e.g. headless execution): continue without a key.
        print("Could not prompt for a SerpAPI key:", e)
        SERPAPI_KEY = ""

if SERPAPI_KEY:
    print("SerpAPI key detected — will fetch real postings.")
else:
    print("No SerpAPI key provided — will use local CSV if present, otherwise synthetic data.")

SerpAPI key detected — will fetch real postings.


In [11]:
# Install serpapi client only if key present
if SERPAPI_KEY:
    try:
        import serpapi  # type: ignore
        from serpapi import GoogleSearch
    except Exception:
        !pip install -q google-search-results
        from serpapi import GoogleSearch

def fetch_jobs(query, location="", num=50, serpapi_key=None, max_pages=5):
    """
    Fetch basic job fields using SerpAPI Google Jobs.

    Pages are followed with the ``next_page_token`` returned under
    ``serpapi_pagination`` in each response, instead of a numeric ``start``
    offset.

    Args:
        query: Search text sent as the ``q`` parameter.
        location: Optional location filter; omitted from the request when empty.
        num: Maximum number of postings to return.
        serpapi_key: SerpAPI API key (required).
        max_pages: Upper bound on the number of result pages requested.

    Returns:
        A list of dicts with the keys ``job_title``, ``company``, ``location``
        and ``description``. Fields missing from a posting default to ``""``.
        The list may be shorter than ``num`` (or empty) when fewer results exist.

    Raises:
        ValueError: If ``serpapi_key`` is missing or empty.
    """
    import warnings

    if not serpapi_key:
        raise ValueError("SerpAPI key missing.")

    params = {
        "engine": "google_jobs",
        "q": query,
        "hl": "en",
        "api_key": serpapi_key,
    }
    if location:
        params["location"] = location

    out = []
    next_page_token = None
    for _ in range(max_pages):
        if len(out) >= num:
            break

        page_params = dict(params)
        if next_page_token:
            page_params["next_page_token"] = next_page_token

        res = GoogleSearch(page_params).get_dict()

        # Surface API-side problems (bad key, quota, no results) instead of
        # silently returning a short list; results collected so far are kept.
        if res.get("error"):
            warnings.warn(f"SerpAPI error for query {query!r}: {res['error']}")
            break

        jobs = res.get("jobs_results") or []
        if not jobs:
            break

        # Take only as many postings as are still needed to reach `num`.
        for j in jobs[: num - len(out)]:
            out.append({
                "job_title": j.get("title", ""),
                "company": j.get("company_name", ""),
                "location": j.get("location", ""),
                "description": j.get("description", ""),
            })

        next_page_token = (res.get("serpapi_pagination") or {}).get("next_page_token")
        if not next_page_token:
            break
    return out


In [12]:
import numpy as np

# Build the dataset only when no CSV exists at DATA_PATH yet:
#   1. try SerpAPI (when a key was provided),
#   2. otherwise, or when SerpAPI yields nothing, generate a synthetic dataset.
# The same role list drives both the SerpAPI queries and the synthetic generator.
categories = [
    "SDE", "Data Scientist", "ML Engineer", "Python Developer",
    "Data Analyst", "Frontend Engineer", "DevOps Engineer", "QA Engineer"
]

if os.path.exists(DATA_PATH):
    print("Using existing dataset at", DATA_PATH)
else:
    # Start from an empty frame so the fallback check below never reads an
    # undefined or stale `df` left over from an earlier cell.
    df = pd.DataFrame()

    if SERPAPI_KEY:
        print("Fetching multiple categories via SerpAPI...")
        rows = []
        for cat in categories:
            q = f"{cat} developer"
            print("Fetching:", q)
            try:
                jobs = fetch_jobs(q, location="India", num=40, serpapi_key=SERPAPI_KEY)
            except Exception as e:
                # Best effort: one failing category must not abort the others.
                print("Fetch error for", q, "->", e)
                continue
            rows.extend({**j, "category": cat} for j in jobs)
        df = pd.DataFrame(rows)
        if df.empty:
            print("SerpAPI returned no rows — falling back to synthetic.")

    if df.empty:
        print("Generating synthetic dataset...")
        skill_snippets = {
            "SDE": ["OOP", "DSA", "microservices", "REST", "MySQL"],
            "Data Scientist": ["statistics", "pandas", "sklearn", "A/B testing", "matplotlib"],
            "ML Engineer": ["TensorFlow", "PyTorch", "MLOps", "ONNX", "deployment"],
            "Python Developer": ["Django", "Flask", "FastAPI", "PostgreSQL", "APIs"],
            "Data Analyst": ["SQL", "Tableau", "PowerBI", "Excel", "ETL"],
            "Frontend Engineer": ["React", "Redux", "TypeScript", "Webpack", "CSS"],
            "DevOps Engineer": ["Docker", "Kubernetes", "CI/CD", "AWS", "monitoring"],
            "QA Engineer": ["Selenium", "pytest", "test plans", "regression", "bug tracking"],
        }
        rows = []
        rng = np.random.default_rng(1)  # fixed seed: the synthetic data is reproducible
        for _ in range(400):
            role = rng.choice(categories)
            title = f"{role} - {rng.choice(['I','II','Senior','Lead','Intern'])}"
            desc = f"{role} position. Required skills: " + ", ".join(rng.choice(skill_snippets[role], size=3, replace=False))
            rows.append({"job_title": title, "company": "Acme Inc", "location": "Remote", "description": desc, "category": role})
        df = pd.DataFrame(rows)

    df.to_csv(DATA_PATH, index=False)
    print(f"Dataset ready at {DATA_PATH} — rows: {len(df)}")

display(pd.read_csv(DATA_PATH).head())

Using existing dataset at /content/jobs_dataset.csv


Unnamed: 0,job_title,description,category
0,DevOps Engineer - Lead,"DevOps Engineer role. Required: Kubernetes, mo...",DevOps Engineer
1,ML Engineer - Senior,"ML Engineer role. Required: TensorFlow, PyTorc...",ML Engineer
2,Python Developer - II,"Python Developer role. Required: PostgreSQL, F...",Python Developer
3,SDE - Senior,"SDE role. Required: Java, REST APIs, Node.js. ...",SDE
4,Python Developer - I,"Python Developer role. Required: Django, Pytho...",Python Developer


In [13]:
# Embed with sentence-transformers, then simple LogisticRegression classifier
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    !pip install -q sentence-transformers
    from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import numpy as np
import pandas as pd

from sklearn.multiclass import OneVsRestClassifier

df = pd.read_csv(DATA_PATH)
# Drop rows missing either the text or the label; otherwise a missing category
# would be turned into a literal "nan" class by astype(str) below.
df = df.dropna(subset=["description", "category"]).reset_index(drop=True)
labels_text = df["category"].astype(str)

# Map category names to integer ids; the encoder is saved to decode predictions later.
le = LabelEncoder()
y = le.fit_transform(labels_text)

# Frozen sentence-transformer embeddings are the features; only the linear
# classifier on top is trained.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embs = embedder.encode(df["description"].tolist(), show_progress_bar=True, convert_to_numpy=True)
X = np.asarray(embs)

# Stratified split keeps the class proportions equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# One-vs-rest logistic regression. The explicit wrapper replaces the deprecated
# `multi_class="ovr"` argument of LogisticRegression.
clf = OneVsRestClassifier(LogisticRegression(max_iter=2000))
clf.fit(X_train, y_train)

# NOTE: in the synthetic fallback dataset every description starts with the role
# name, so near-perfect accuracy there does not reflect real-world performance.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print(classification_report(y_test, y_pred, target_names=le.classes_))

joblib.dump(clf, MODEL_PATH)
joblib.dump(le, LE_PATH)
print("Saved model to:", MODEL_PATH)
print("Saved label encoder to:", LE_PATH)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Accuracy: 1.0
                   precision    recall  f1-score   support

     Data Analyst       1.00      1.00      1.00        10
   Data Scientist       1.00      1.00      1.00        11
  DevOps Engineer       1.00      1.00      1.00        10
Frontend Engineer       1.00      1.00      1.00         9
      ML Engineer       1.00      1.00      1.00         9
 Python Developer       1.00      1.00      1.00        12
      QA Engineer       1.00      1.00      1.00         9
              SDE       1.00      1.00      1.00        10

         accuracy                           1.00        80
        macro avg       1.00      1.00      1.00        80
     weighted avg       1.00      1.00      1.00        80

Saved model to: /content/job_role_classifier.joblib
Saved label encoder to: /content/label_encoder.joblib




In [14]:
import joblib
from sentence_transformers import SentenceTransformer
import numpy as np

# Reload the classifier and label encoder written by the training cell.
# NOTE(review): joblib.load deserializes pickled objects, which can execute
# arbitrary code — only load artifacts produced by this notebook or another
# trusted source.
clf = joblib.load(MODEL_PATH)
le = joblib.load(LE_PATH)
# Same embedding model name as the training cell, so the features passed to
# `clf` at inference match the ones it was fitted on.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def predict_role(text: str) -> str:
    """Return the predicted job role label for a single piece of text.

    The text is embedded with the module-level ``embedder``, classified with
    ``clf``, and the resulting class id is decoded back to its label with ``le``.
    """
    embedding = embedder.encode([text], convert_to_numpy=True)
    (label_id,) = clf.predict(embedding)
    (role,) = le.inverse_transform([label_id])
    return role

# Smoke test: classify one hand-written job description.
sample = (
    "We need a developer to build REST APIs and backend services "
    "using Django and PostgreSQL."
)
print("Predicted role:", predict_role(sample))

Predicted role: Python Developer


In [15]:
# Offer the dataset and trained artifacts for download when running in Colab.
# Only a failed `google.colab` import is treated as "not in Colab"; download
# problems are reported per file instead of being mislabelled.
try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    print("Non-Colab environment — artifacts saved under:", BASE_DIR)
else:
    for artifact in (DATA_PATH, MODEL_PATH, LE_PATH):
        if not os.path.exists(artifact):
            print("Skipping missing artifact:", artifact)
            continue
        try:
            files.download(artifact)
        except Exception as e:
            # Best effort: one failed download should not block the remaining files.
            print("Download failed for", artifact, "->", e)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>