# **Import Library**

In [25]:
import os
import re

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# **Ensure required NLTK packages are downloaded**

In [26]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zafir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **1. Load the Dataset**

In [27]:
df = pd.read_csv("job_descriptions.csv")

In [28]:
df.shape

(1615940, 23)

In [29]:
df.head()

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


# **2. Data Cleaning and Preparation**


In [30]:
df.isnull().sum()

Job Id                 0
Experience             0
Qualifications         0
Salary Range           0
location               0
Country                0
latitude               0
longitude              0
Work Type              0
Company Size           0
Job Posting Date       0
Preference             0
Contact Person         0
Contact                0
Job Title              0
Role                   0
Job Portal             0
Job Description        0
Benefits               0
skills                 0
Responsibilities       0
Company                0
Company Profile     5478
dtype: int64

In [31]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [32]:
df.isnull().sum()

Job Id              0
Experience          0
Qualifications      0
Salary Range        0
location            0
Country             0
latitude            0
longitude           0
Work Type           0
Company Size        0
Job Posting Date    0
Preference          0
Contact Person      0
Contact             0
Job Title           0
Role                0
Job Portal          0
Job Description     0
Benefits            0
skills              0
Responsibilities    0
Company             0
Company Profile     0
dtype: int64

### **1. Handling Duplicate Data**

In [33]:
df.duplicated().sum()

0

In [34]:
# Keep only the first occurrence of each fully identical row, then report the size.
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)

Shape after removing duplicates: (1610462, 23)


### **2. Handling Missing Data**


In [35]:
# A posting is unusable without both its description and its title,
# so rows missing either of these two fields are removed.
required_columns = ["Job Description", "Job Title"]
df = df.dropna(subset=required_columns)
print("Shape after dropping rows with missing values:", df.shape)

Shape after dropping rows with missing values: (1610462, 23)


### **3. Handling Outliers (e.g., job postings with unrealistic word counts)**


In [36]:
# Number of whitespace-separated words in each job description.
desc_length = df["Job Description"].apply(lambda x: len(str(x).split()))

# IQR fences: anything further than 1.5 * IQR outside the quartiles is an outlier.
q1, q3 = np.percentile(desc_length, [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Keep rows whose word count lies within the fences (both ends inclusive).
# The explicit .copy() makes the result an independent frame, so the column
# assignments made on `df` later in the notebook do not raise
# SettingWithCopyWarning.
df = df[desc_length.between(lower_bound, upper_bound)].copy()
print("Shape after handling outliers:", df.shape)

Shape after handling outliers: (1575780, 23)


# **3. Handling Inconsistent Data & Text Cleaning**

#### **Text Cleaning with Lemmatization and Stemming**


In [37]:
# Ask spaCy to run on the GPU when one is usable, otherwise stay on the CPU.
# spacy.prefer_gpu() reports the outcome as a boolean instead of raising, so
# no bare `except:` is needed (a bare except would also have swallowed
# unrelated errors such as KeyboardInterrupt).
if spacy.prefer_gpu():
    print("Using GPU for spaCy processing.")
else:
    print("No GPU available, using CPU.")

Using GPU for spaCy processing.


In [38]:
# Shared resources for the text-cleaning helpers defined in the next cells.
stop_words = set(stopwords.words("english"))  # NLTK English stopwords, as a set for fast lookup
stemmer = PorterStemmer()                      # NLTK Porter stemmer
nlp = spacy.load("en_core_web_sm")             # spaCy English pipeline

In [39]:
def clean_text_preprocess(text, stopword_set=None):
    """
    Normalise one raw text string before it is handed to spaCy.

    Steps, in order:
      - Convert to lowercase.
      - Replace HTML tags with a space.
      - Replace every character that is not a-z or whitespace with a space.
      - Collapse runs of whitespace and trim the ends.
      - Drop stopwords.

    Args:
        text: Raw text. ``None`` and float NaN (pandas' missing-value marker)
            are treated as empty text; any other non-string is converted
            with ``str()``.
        stopword_set: Optional collection of lowercase words to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        str: The cleaned text with tokens separated by single spaces
        ("" when nothing is left).
    """
    # Missing values would otherwise crash on .lower(); NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    text = re.sub(r"<[^>]+>", " ", text)  # Remove HTML tags
    text = re.sub(r"[^a-z\s]", " ", text)  # Keep only alphabets
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no separate regex pass is needed.
    return " ".join(word for word in text.split() if word not in stopword_set)

In [40]:
# batch_size is the number of texts spaCy processes per batch (a document count, not GB of RAM); lower it if memory is tight, raise it if you have memory to spare.
def clean_text_batch(texts, batch_size=32):
    """
    Clean a collection of texts with spaCy's nlp.pipe, processing in batches.

    Each text is first normalised with clean_text_preprocess, then every
    alphabetic token is lemmatized by spaCy and the lemma is stemmed with
    the Porter stemmer.

    Args:
        texts (iterable of str): Raw text strings.
        batch_size (int): Number of texts spaCy buffers per batch.

    Returns:
        list: Cleaned text strings, one per input text, in input order.
    """
    # Feed spaCy from a generator so the pre-processed strings are produced
    # on demand instead of being held in a second full-size list.
    preprocessed_texts = (clean_text_preprocess(text) for text in texts)

    cleaned_texts = []
    # Named-entity output is never read below, so the "ner" component is
    # skipped to save time. The remaining components are left enabled so the
    # lemmas are produced exactly as before.
    for doc in nlp.pipe(preprocessed_texts, batch_size=batch_size, disable=["ner"]):
        processed_tokens = [stemmer.stem(token.lemma_) for token in doc if token.is_alpha]
        cleaned_texts.append(" ".join(processed_tokens))
    return cleaned_texts

In [41]:
# Clean every job description and store the result in a new column.
# batch_size is the number of descriptions handed to spaCy per batch
# (a document count, not an amount of RAM).
df["Cleaned Job Description"] = clean_text_batch(
    texts=df["Job Description"].tolist(),
    batch_size=32,
)

In [42]:
# Normalise job titles: lowercase them and trim leading/trailing whitespace.
df["Cleaned Job Title"] = df["Job Title"].str.lower().str.strip()

In [43]:
print("Preview of cleaned job descriptions and titles:")
df[["Cleaned Job Description", "Cleaned Job Title"]].head()

Preview of cleaned job descriptions and titles:


Unnamed: 0,Cleaned Job Description,Cleaned Job Title
0,social medium manag overse organ social medium...,digital marketing specialist
1,frontend web develop design implement user int...,web developer
2,qualiti control manag establish enforc qualiti...,operations manager
3,wireless network engin design implement mainta...,network engineer
4,confer manag coordin manag confer meet event p...,event manager


In [44]:
# Analyze job title distribution (show top 10)
print("Job title distribution (top 10):")
df["Cleaned Job Title"].value_counts().head(10)

Job title distribution (top 10):


Cleaned Job Title
ux/ui designer                  48376
digital marketing specialist    27881
software engineer               27537
network engineer                24307
software tester                 20871
executive assistant             20710
procurement manager             20664
financial advisor               20605
sales representative            17612
social media manager            17549
Name: count, dtype: int64

# **4. Feature Scaling (If Numerical Features Exist)**

#### **Define a helper function to parse salary range strings like "$59K-$99K"**

In [45]:
def parse_salary_range(salary_str):
    """
    Convert a salary range string (e.g., "$59K-$99K") into its numeric midpoint.

    Dollar signs, commas and the thousands marker "K" (upper or lower case)
    are stripped; the remainder must be exactly two numbers separated by "-".
    The numbers are taken as written (i.e. in thousands when the input uses
    "K"), so "$59K-$99K" yields 79.0.

    Args:
        salary_str: The raw salary range. Non-string values (e.g. None or
            NaN from a missing cell) are treated as unparseable.

    Returns:
        float: The average of the minimum and maximum, or np.nan when the
        input cannot be parsed as a two-part numeric range.
    """
    # Guard first: .replace/.split on a non-string would raise AttributeError.
    if not isinstance(salary_str, str):
        return np.nan

    # Remove dollar signs, 'K'/'k' and commas; float() tolerates stray spaces.
    cleaned = re.sub(r"[$,kK]", "", salary_str)
    parts = cleaned.split("-")
    if len(parts) != 2:
        return np.nan

    try:
        salary_min = float(parts[0])
        salary_max = float(parts[1])
    except ValueError:
        # Either side was empty or not a number.
        return np.nan
    return (salary_min + salary_max) / 2  # Average of min and max

#### **Check if "Salary Range" exists in the dataframe**

In [46]:
if "Salary Range" in df.columns:
    # Midpoint of each salary range as a number (NaN when the string cannot be parsed)
    df["Salary Avg"] = df["Salary Range"].map(parse_salary_range)

    # Rows without a usable salary are removed (optional step)
    df.dropna(subset=["Salary Avg"], inplace=True)

    # Rescale the midpoints with MinMaxScaler (StandardScaler is an alternative)
    scaler = MinMaxScaler()
    salary_matrix = df[["Salary Avg"]]
    df["Scaled Salary"] = scaler.fit_transform(salary_matrix)

    print("Scaled Salary (first 5 rows):")
    print(df["Scaled Salary"].head())

Scaled Salary (first 5 rows):
0    0.383333
1    0.616667
2    0.500000
3    0.350000
4    0.266667
Name: Scaled Salary, dtype: float64


# **5. Feature Extraction using TF-IDF**

In [47]:
# Turn the cleaned descriptions into a sparse TF-IDF matrix.
# binary=True makes the term-frequency part 0/1, and at most 5000 terms are kept.
# NOTE(review): the vectorizer is fitted on the whole frame, i.e. before the
# train/test split made later in the notebook, so IDF weights also see the
# rows that end up in the test set — confirm this is acceptable.
tfidf = TfidfVectorizer(
    binary=True,
    max_features=5000,
    stop_words="english",
)
X = tfidf.fit_transform(df["Cleaned Job Description"])
print("TF-IDF feature matrix shape:", X.shape)

TF-IDF feature matrix shape: (1575780, 931)


In [48]:
X[0].sum()

3.8927433225415435

In [49]:
tfidf.get_feature_names_out()

array(['abus', 'academ', 'accept', 'access', 'accordingli', 'account',
       'accur', 'accuraci', 'achiev', 'acquir', 'acquisit', 'action',
       'activ', 'acut', 'ad', 'adapt', 'addict', 'address', 'adher',
       'adjust', 'administ', 'administr', 'adolesc', 'adopt', 'adult',
       'advanc', 'advers', 'advertis', 'advic', 'advis', 'advisor',
       'advoc', 'aerodynam', 'aesthet', 'age', 'agenc', 'agent', 'agil',
       'agreement', 'aid', 'aim', 'air', 'aircraft', 'algorithm', 'align',
       'allianc', 'alloc', 'analysi', 'analyst', 'analyt', 'analyz',
       'anim', 'answer', 'anthropolog', 'api', 'app', 'appeal', 'applic',
       'appreci', 'architect', 'architectur', 'area', 'arrang', 'art',
       'articl', 'artist', 'artwork', 'aspect', 'assess', 'asset',
       'assist', 'associ', 'assur', 'attend', 'attende', 'attorney',
       'attract', 'audienc', 'audit', 'autom', 'avail', 'avion', 'awar',
       'backend', 'backlog', 'base', 'beauti', 'behavior', 'belong',
       'ben

# **7. Find Nearest Neighbors by Cosine Distance**

In [53]:
# Brute-force nearest-neighbour index over the TF-IDF rows, using cosine
# distance. NearestNeighbors is imported in the import cell at the top of
# the notebook.
nbrs = NearestNeighbors(n_neighbors=5, metric="cosine", algorithm="brute").fit(X)

# Look up the 5 closest rows for the first 5 postings and show their row positions.
distances, indices = nbrs.kneighbors(X[:5])
print(indices)

[[ 507051  714756 1301140  557657  664269]
 [ 757229  459574  757173 1269186  502485]
 [ 488362  660281 1095811  488254  881514]
 [ 840509 1171424  145086  517553 1356708]
 [ 626322  451355  212906   26789 1243801]]


# **8. Encode Target Variable (Job Titles)**

In [54]:
# Map each cleaned job title to an integer class id.
le = LabelEncoder().fit(df["Cleaned Job Title"])
y = le.transform(df["Cleaned Job Title"])
print("Number of unique job titles:", le.classes_.size)
print("Sample job title classes:", le.classes_[:10])

Number of unique job titles: 143
Sample job title classes: ['account director' 'account executive' 'account manager' 'accountant'
 'administrative assistant' 'aerospace engineer' 'architect'
 'architectural designer' 'art director' 'art teacher']


# **9. Train-Test Split**

In [55]:
# Hold out 20% of the rows for testing; stratify=y keeps the job-title
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [56]:
print("Training set shape:", X_train.shape)

Training set shape: (1260624, 931)


In [57]:
print("Test set shape:", X_test.shape)

Test set shape: (315156, 931)


# **10. Model Training and Analysis**

### Train a Decision Tree Classifier

In [58]:
# Fit a decision tree on the training split and score it on the held-out split.
# NOTE(review): the recorded run reports an accuracy of 1.0; worth confirming
# that near-identical descriptions are not shared between train and test.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)

# Per-class precision / recall / F1, labelled with the original job titles.
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt, target_names=le.classes_))

Decision Tree Accuracy: 1.0
Decision Tree Classification Report:
                                     precision    recall  f1-score   support

                   account director       1.00      1.00      1.00      1379
                  account executive       1.00      1.00      1.00      2096
                    account manager       1.00      1.00      1.00      2768
                         accountant       1.00      1.00      1.00      2097
           administrative assistant       1.00      1.00      1.00      3486
                 aerospace engineer       1.00      1.00      1.00      2081
                          architect       1.00      1.00      1.00      2779
             architectural designer       1.00      1.00      1.00      1357
                       art director       1.00      1.00      1.00      2073
                        art teacher       1.00      1.00      1.00      2116
                 back-end developer       1.00      1.00      1.00      1390
          

### Train a Random Forest Classifier

In [59]:
# Fit a 100-tree random forest on the training split and score it on the
# held-out split.
# n_jobs=-1 builds (and later queries) the trees on all available CPU cores;
# with random_state fixed, the fitted trees do not depend on n_jobs.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

# Per-class precision / recall / F1, labelled with the original job titles.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))

Random Forest Accuracy: 1.0
Random Forest Classification Report:
                                     precision    recall  f1-score   support

                   account director       1.00      1.00      1.00      1379
                  account executive       1.00      1.00      1.00      2096
                    account manager       1.00      1.00      1.00      2768
                         accountant       1.00      1.00      1.00      2097
           administrative assistant       1.00      1.00      1.00      3486
                 aerospace engineer       1.00      1.00      1.00      2081
                          architect       1.00      1.00      1.00      2779
             architectural designer       1.00      1.00      1.00      1357
                       art director       1.00      1.00      1.00      2073
                        art teacher       1.00      1.00      1.00      2116
                 back-end developer       1.00      1.00      1.00      1390
          

### Plot and save confusion matrix for Random Forest model


In [60]:
# Confusion matrix of the random-forest predictions, drawn as an annotated
# heatmap with job titles on both axes and written to a PNG file.
cm = confusion_matrix(y_test, y_pred_rf)

fig, ax = plt.subplots(figsize=(35, 25))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title("Random Forest Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
fig.tight_layout()
fig.savefig("random_forest_confusion_matrix.png")
plt.close(fig)  # release the figure; it is only needed on disk
print("Confusion matrix saved as 'random_forest_confusion_matrix.png'")

Confusion matrix saved as 'random_forest_confusion_matrix.png'


# **11. Save Artifacts (Models, Vectorizer, Label Encoder)**

In [61]:
# Persist the fitted vectorizer, both classifiers and the label encoder
# with joblib so they can be reloaded without retraining.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

artifacts = {
    "tfidf_vectorizer.pkl": tfidf,
    "decision_tree_model.pkl": dt_model,
    "random_forest_model.pkl": rf_model,
    "label_encoder.pkl": le,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(model_dir, file_name))

print("Artifacts saved successfully in the 'models' directory.")

Artifacts saved successfully in the 'models' directory.
