<a href="https://colab.research.google.com/github/abhishekkpatell/DEMONS/blob/main/job_role_prediction_with_serpapi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Job Role Prediction Notebook (SerpAPI + Transformer)

This Colab-ready notebook will:

1. Install dependencies
2. Optionally fetch real job postings using **SerpAPI** (you'll need a SerpAPI key)
3. Or use the included synthetic fallback dataset `jobs_dataset.csv`
4. Train a sentence-transformer based classifier to predict job roles/categories
5. Evaluate and save the model

**How to use:** Upload this notebook to Google Colab. If you have a SerpAPI key, paste it when prompted to fetch real job postings. Otherwise the notebook will use the included dataset.


In [1]:
# Install dependencies (run in Colab)
!pip install -q sentence-transformers scikit-learn pandas joblib serpapi tqdm
!pip install google-search-results



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
from tqdm import tqdm

DATA_PATH = '/content/jobs_dataset.csv'  # Colab path; we'll copy the included file to /content
LOCAL_INCLUDED = '/mnt/data/jobs_dataset.csv'  # included with this workspace; when you upload notebook to Colab, you can also upload this CSV
print('Local included dataset path:', LOCAL_INCLUDED)
# Copy included dataset into Colab runtime (if you upload it alongside notebook)
try:
    if os.path.exists(LOCAL_INCLUDED):
        print('Found included dataset at', LOCAL_INCLUDED)
        # In Colab you can upload the file; or if you manually place it, copy command below will work if files were uploaded to runtime.
        # !cp {LOCAL_INCLUDED} /content/jobs_dataset.csv
except Exception as e:
    print('Copy step skipped:', e)

if os.path.exists('/content/jobs_dataset.csv'):
    df = pd.read_csv('/content/jobs_dataset.csv')
    print('Loaded dataset with', len(df), 'rows. Sample:')
    display(df.head())
else:
    print('No dataset found in runtime yet; the notebook will fetch via SerpAPI if you provide a key, or generate a fallback dataset later.')

Local included dataset path: /mnt/data/jobs_dataset.csv
Loaded dataset with 400 rows. Sample:


Unnamed: 0,job_title,description,category
0,DevOps Engineer - Lead,"DevOps Engineer role. Required: Kubernetes, mo...",DevOps Engineer
1,ML Engineer - Senior,"ML Engineer role. Required: TensorFlow, PyTorc...",ML Engineer
2,Python Developer - II,"Python Developer role. Required: PostgreSQL, F...",Python Developer
3,SDE - Senior,"SDE role. Required: Java, REST APIs, Node.js. ...",SDE
4,Python Developer - I,"Python Developer role. Required: Django, Pytho...",Python Developer


In [3]:
# Optionally fetch real job postings using SerpAPI.
# If you have a SerpAPI key, set SERPAPI_KEY variable below and run this cell.
# If you leave SERPAPI_KEY empty, the notebook will use the included fallback dataset.
SERPAPI_KEY = ''  # <-- paste your SerpAPI key here

if SERPAPI_KEY.strip():
    print('Will fetch real job postings using SerpAPI.')
else:
    print('No SerpAPI key provided. Using included dataset or fallback.')


No SerpAPI key provided. Using included dataset or fallback.


In [4]:
# Fetch job postings using SerpAPI (Google Jobs via SerpAPI)
# You will need a SerpAPI key. This example searches by query and location and collects basic fields.
!pip install google-search-results
from serpapi import GoogleSearch

SERPAPI_KEY = "r8275JVEBYqPy5hYAguESb8L"  # Replace with your real key

def fetch_jobs(query, location='', num=50):
    params = {
        'engine': 'google_jobs',
        'q': query,
        'hl': 'en',
        'num': num,
        'location': location,
        'api_key': SERPAPI_KEY
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    jobs = results.get('jobs_results', [])
    out = []
    for j in jobs:
        out.append({
            'job_title': j.get('title', ''),
            'company': j.get('company_name', ''),
            'location': j.get('location', ''),
            'description': j.get('description', '')
        })
    return out

# Example usage
jobs_data = fetch_jobs("Python Developer", "India", num=20)
print(jobs_data)


# Example usage (left commented; uncomment and run when you have your key):
# jobs = fetch_jobs('Python developer', 'India', num=50, serpapi_key=SERPAPI_KEY)
# print(len(jobs))

[]


In [5]:
# Build dataset: either fetch via SerpAPI (if key provided) or load included CSV as fallback.
SERPAPI_KEY = SERPAPI_KEY.strip() if 'SERPAPI_KEY' in globals() else ''
if SERPAPI_KEY:
    print('Fetching multiple categories via SerpAPI...')
    categories = ['SDE','Data Scientist','ML Engineer','Python Developer','Data Analyst','Frontend Engineer','DevOps Engineer','QA Engineer']
    rows = []
    for cat in categories:
        q = cat + ' developer'
        print('Fetching for', q)
        try:
            jobs = fetch_jobs(q, num=50, serpapi_key=SERPAPI_KEY)
            for j in jobs:
                j['category'] = cat
                rows.append(j)
        except Exception as e:
            print('Error fetching for', q, e)
    df = pd.DataFrame(rows)
    df.to_csv('/content/jobs_dataset.csv', index=False)
    print('Saved fetched dataset with', len(df), 'rows to /content/jobs_dataset.csv')
else:
    # use included fallback which should have been copied earlier; if not, generate a synthetic dataset inline
    if os.path.exists('/content/jobs_dataset.csv'):
        df = pd.read_csv('/content/jobs_dataset.csv')
        print('Using included dataset with', len(df), 'rows.')
    else:
        print('No included dataset found in runtime; generating synthetic dataset (300 rows) as fallback.')
        import numpy as np
        roles = ['SDE','Data Scientist','ML Engineer','Python Developer','Data Analyst','Frontend Engineer','DevOps Engineer','QA Engineer']
        skill_snippets = {
            'SDE': ['Java','C++','algorithms','data structures','system design','REST APIs','Spring','Node.js'],
            'Data Scientist': ['Python','pandas','scikit-learn','statistics','visualization','SQL','R'],
            'ML Engineer': ['TensorFlow','PyTorch','model deployment','feature engineering','ML pipelines','Docker'],
            'Python Developer': ['Python','Django','Flask','REST','APIs','PostgreSQL'],
            'Data Analyst': ['SQL','Excel','Tableau','reporting','data cleaning','BI'],
            'Frontend Engineer': ['React','JavaScript','HTML','CSS','TypeScript','UI','UX'],
            'DevOps Engineer': ['Docker','Kubernetes','CI/CD','AWS','infrastructure','monitoring'],
            'QA Engineer': ['testing','test automation','Selenium','unit tests','bug tracking']
        }
        rows = []
        np.random.seed(1)
        for i in range(300):
            role = np.random.choice(roles)
            title = role + ' - ' + np.random.choice(['I','II','Senior','Lead','Intern'])
            desc = role + ' position. Required skills: ' + ', '.join(np.random.choice(skill_snippets[role], size=3, replace=False))
            rows.append({'job_title': title, 'company': 'Acme Inc', 'location': 'Remote', 'description': desc, 'category': role})
        df = pd.DataFrame(rows)
        df.to_csv('/content/jobs_dataset.csv', index=False)
        print('Generated synthetic dataset with', len(df), 'rows at /content/jobs_dataset.csv')

print(df.head())

Fetching multiple categories via SerpAPI...
Fetching for SDE developer
Error fetching for SDE developer fetch_jobs() got an unexpected keyword argument 'serpapi_key'
Fetching for Data Scientist developer
Error fetching for Data Scientist developer fetch_jobs() got an unexpected keyword argument 'serpapi_key'
Fetching for ML Engineer developer
Error fetching for ML Engineer developer fetch_jobs() got an unexpected keyword argument 'serpapi_key'
Fetching for Python Developer developer
Error fetching for Python Developer developer fetch_jobs() got an unexpected keyword argument 'serpapi_key'
Fetching for Data Analyst developer
Error fetching for Data Analyst developer fetch_jobs() got an unexpected keyword argument 'serpapi_key'
Fetching for Frontend Engineer developer
Error fetching for Frontend Engineer developer fetch_jobs() got an unexpected keyword argument 'serpapi_key'
Fetching for DevOps Engineer developer
Error fetching for DevOps Engineer developer fetch_jobs() got an unexpected

In [8]:
# Train a multi-class classifier using sentence-transformers embeddings
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib
import numpy as np

df = pd.read_csv('/content/jobs_dataset.csv')
print('Dataset rows:', len(df))
df = df.dropna(subset=['description']).reset_index(drop=True)

# Create labels
labels = df['category'].astype(str)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(labels)

# Use sentence-transformers to encode descriptions
model_name = 'all-MiniLM-L6-v2'
print('Loading embedding model', model_name)
embedder = SentenceTransformer(model_name)

descriptions = df['description'].tolist()
embs = embedder.encode(descriptions, show_progress_bar=True, batch_size=32)

X = np.array(embs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf = LogisticRegression(max_iter=2000, multi_class='ovr')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Save model and label encoder
joblib.dump(clf, '/content/job_role_classifier.joblib')
joblib.dump(le, '/content/label_encoder.joblib')
print('Saved classifier and label encoder to /content')

EmptyDataError: No columns to parse from file

In [7]:
# Test prediction with a sample description
import joblib
clf = joblib.load('/content/job_role_classifier.joblib')
le = joblib.load('/content/label_encoder.joblib')
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def predict_role(text):
    emb = embedder.encode([text])[0].reshape(1,-1)
    pred = clf.predict(emb)[0]
    return le.inverse_transform([pred])[0]

sample = 'We are looking for an experienced Python developer to build REST APIs and backend services using Django and PostgreSQL.'
print('Predicted role:', predict_role(sample))

FileNotFoundError: [Errno 2] No such file or directory: '/content/job_role_classifier.joblib'

In [None]:
# Save artifacts for download in Colab environment
# If running in Colab, these cells will pop up download dialogs.
try:
    from google.colab import files
    files.download('/content/jobs_dataset.csv')  # dataset
    files.download('/content/job_role_classifier.joblib')  # model
    files.download('/content/label_encoder.joblib')  # label encoder
except Exception as e:
    print('Download in non-Colab environment: files are available in the runtime at /content/')
