# Import Libraries

In [1]:
import os
import re
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load the Dataset

In [None]:
# Location of the job-postings CSV; a relative path, so it is resolved against
# the directory the notebook kernel is running in.
data_path = "job_descriptions.csv"

In [5]:
# Load the entire CSV into one in-memory DataFrame (all columns, all rows).
df = pd.read_csv(data_path)

In [6]:
# (rows, columns) of the frame as loaded.
df.shape

(1615940, 23)

In [9]:
# Report the raw frame's dimensions, then a plain-text dump of its first rows.
print("Initial dataset shape:", df.shape)
print("Dataset preview:", df.head(), sep="\n")

Initial dataset shape: (1615940, 23)
Dataset preview:
             Job Id     Experience Qualifications Salary Range    location  \
0  1089843540111562  5 to 15 Years         M.Tech    $59K-$99K     Douglas   
1   398454096642776  2 to 12 Years            BCA   $56K-$116K    Ashgabat   
2   481640072963533  0 to 12 Years            PhD   $61K-$104K       Macao   
3   688192671473044  4 to 11 Years            PhD    $65K-$91K  Porto-Novo   
4   117057806156508  1 to 12 Years            MBA    $64K-$87K    Santiago   

            Country  latitude  longitude  Work Type  Company Size  ...  \
0       Isle of Man   54.2361    -4.5481     Intern         26801  ...   
1      Turkmenistan   38.9697    59.5563     Intern        100340  ...   
2  Macao SAR, China   22.1987   113.5439  Temporary         84525  ...   
3             Benin    9.3077     2.3158  Full-Time        129896  ...   
4             Chile  -35.6751   -71.5429     Intern         53944  ...   

                 Contact        

In [7]:
# Rich-display preview of the first rows of the raw frame.
df.head()

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


# Data Cleaning and Preparation


In [10]:
# Remove rows that are exact duplicates across every column.
# The result is reassigned rather than applied with inplace=True: in-place
# mutation across cells gives no speed benefit and makes it harder to tell
# which cells have already modified the frame when cells are re-run.
df = df.drop_duplicates()
print("Shape after removing duplicates:", df.shape)


Shape after removing duplicates: (1615940, 23)


In [12]:
# List the column labels; later cells select columns by these exact names.
df.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [14]:
# Drop rows missing either of the two fields used downstream:
# "Job Description" (the model input text) or "Job Title" (the label).
# Reassign instead of using inplace=True so the cell does not silently mutate
# a frame that earlier cells have already displayed.
df = df.dropna(subset=["Job Description", "Job Title"])
print("Shape after dropping rows with missing values:", df.shape)

Shape after dropping rows with missing values: (1615940, 23)


In [15]:
# Define a function to clean text data
def clean_text(text):
    """Normalize free text into lowercase, space-separated alphabetic tokens.

    Steps, in order: lowercase the text; replace HTML-like tags (``<...>``)
    with a space; replace every character that is not ``a``-``z`` or
    whitespace with a space; collapse whitespace runs and trim both ends.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (the pandas missing-value marker)
        are treated as empty; any other non-string value is converted with
        ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing or contains no
        ASCII letters.
    """
    # Missing values would otherwise fail on ``.lower()`` with AttributeError.
    # ``text != text`` is the dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'<[^>]+>', ' ', text)  # Remove HTML tags if any
    text = re.sub(r'[^a-z\s]', ' ', text)  # Remove punctuation, numbers, special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text


In [16]:
# Run every raw job description through clean_text and store the result in a new column.
df["Cleaned Job Description"] = df["Job Description"].map(clean_text)

In [17]:
# Normalize titles (used later as the classification target): lowercase, then
# trim surrounding whitespace. The vectorized .str accessor replaces a per-row
# lambda; it also passes missing values through as NaN instead of raising
# AttributeError on them.
df['Cleaned Job Title'] = df['Job Title'].str.lower().str.strip()

In [22]:
# Show the first rows of the two cleaned columns side by side.
print("Preview of cleaned job descriptions and titles:")
df[["Cleaned Job Description", "Cleaned Job Title"]].head()

Preview of cleaned lob descriptions and titles:


Unnamed: 0,Cleaned Job Description,Cleaned Job Title
0,social media managers oversee an organizations...,digital marketing specialist
1,frontend web developers design and implement u...,web developer
2,quality control managers establish and enforce...,operations manager
3,wireless network engineers design implement an...,network engineer
4,a conference manager coordinates and manages c...,event manager


In [23]:
# Count how often each cleaned title occurs (most frequent first) and show the ten largest.
print("Job title distribution (top 10):")
title_counts = df["Cleaned Job Title"].value_counts()
title_counts.head(10)

Job title distribution (top 10):


Cleaned Job Title
ux/ui designer                  48551
digital marketing specialist    27975
software engineer               27630
network engineer                24393
software tester                 20945
executive assistant             20776
procurement manager             20734
financial advisor               20687
sales representative            17664
social media manager            17613
Name: count, dtype: int64

# Feature Extraction using TF-IDF

In [24]:
# Turn the cleaned descriptions into a sparse TF-IDF matrix. English stop
# words are removed and the vocabulary is capped at 5000 terms.
# NOTE(review): the vectorizer is fit on every row, before any train/test
# split, so the vocabulary and IDF weights are learned from rows that later
# land in the test set. Consider fitting on the training rows only.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X = tfidf.fit_transform(df["Cleaned Job Description"])
print("TF-IDF feature matrix shape:", X.shape)

TF-IDF feature matrix shape: (1615940, 1421)


# Prepare the Target

In [26]:
# Map each distinct cleaned title to an integer class id; `le` keeps the
# id -> title lookup in `classes_`.
le = LabelEncoder()
title_labels = df["Cleaned Job Title"]
y = le.fit_transform(title_labels)
print("Number of unique job titles:", le.classes_.shape[0])
print("Sample job title classes:", le.classes_[:10])

Number of unique job titles: 147
Sample job title classes: ['account director' 'account executive' 'account manager' 'accountant'
 'administrative assistant' 'aerospace engineer' 'architect'
 'architectural designer' 'art director' 'art teacher']


# Train-Test Split

In [28]:
# Split the cleaned *text* rather than the pre-computed matrix X, then refit
# the vectorizer on the training rows only. Fitting TF-IDF on the full corpus
# lets test documents shape the vocabulary and IDF weights, which leaks test
# information into the features. The split uses the same test_size,
# random_state and stratification labels as before; only what gets split
# changes.
desc_train, desc_test, y_train, y_test = train_test_split(
    df["Cleaned Job Description"], y,
    test_size=0.20, random_state=54, stratify=y,
)
X_train = tfidf.fit_transform(desc_train)  # learn vocabulary and IDF from training rows only
X_test = tfidf.transform(desc_test)        # project test rows onto the train-fitted vocabulary

In [31]:
# Dimensions of the training feature matrix.
print("Training set shape:", X_train.shape)

Training set shape: (1292752, 1421)


In [32]:
# Dimensions of the held-out feature matrix.
print("Test set shape:", X_test.shape)

Test set shape: (323188, 1421)


# Model Training and Analysis

### Train a Decision Tree Classifier

In [33]:
# Fit a single decision tree (fixed seed) on the training split, then score it
# on the held-out split: overall accuracy plus per-class precision/recall/F1.
# NOTE(review): the recorded run reports 1.00 for every class. That is
# unusually clean; worth checking whether identical descriptions occur in both
# the training and test rows.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
dt_report = classification_report(y_test, y_pred_dt, target_names=le.classes_)
print(dt_report)

Decision Tree Accuracy: 1.0
Decision Tree Classification Report:
                                     precision    recall  f1-score   support

                   account director       1.00      1.00      1.00      1385
                  account executive       1.00      1.00      1.00      2102
                    account manager       1.00      1.00      1.00      2778
                         accountant       1.00      1.00      1.00      2103
           administrative assistant       1.00      1.00      1.00      3497
                 aerospace engineer       1.00      1.00      1.00      2088
                          architect       1.00      1.00      1.00      2791
             architectural designer       1.00      1.00      1.00      1361
                       art director       1.00      1.00      1.00      2077
                        art teacher       1.00      1.00      1.00      2122
                 back-end developer       1.00      1.00      1.00      1396
          

### Train a Random Forest Classifier

In [34]:
# Random forest: 100 trees with a fixed seed, evaluated the same way as the
# decision tree (accuracy plus per-class report on the held-out split).
# n_jobs=-1 builds (and later predicts with) the trees on all available
# cores; on a training set this large, single-threaded fitting is needlessly
# slow. n_jobs only controls parallelism, and the seed still comes from
# random_state.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))

Random Forest Accuracy: 1.0
Random Forest Classification Report:
                                     precision    recall  f1-score   support

                   account director       1.00      1.00      1.00      1385
                  account executive       1.00      1.00      1.00      2102
                    account manager       1.00      1.00      1.00      2778
                         accountant       1.00      1.00      1.00      2103
           administrative assistant       1.00      1.00      1.00      3497
                 aerospace engineer       1.00      1.00      1.00      2088
                          architect       1.00      1.00      1.00      2791
             architectural designer       1.00      1.00      1.00      1361
                       art director       1.00      1.00      1.00      2077
                        art teacher       1.00      1.00      1.00      2122
                 back-end developer       1.00      1.00      1.00      1396
          

### Plot and save confusion matrix for Random Forest model


In [35]:
# Build, draw and save the Random Forest confusion matrix.
# `labels` is passed explicitly so the matrix has exactly one row/column per
# encoder class, in encoder order; this keeps it aligned with the tick labels
# taken from le.classes_ even if some class is absent from y_test/y_pred_rf.
n_classes = len(le.classes_)
cm = confusion_matrix(y_test, y_pred_rf, labels=list(range(n_classes)))

# Grow the canvas with the number of classes so tick labels stay legible
# (small problems still get the original 12x10 figure), and only write the
# per-cell counts when the grid is small enough for them to be readable.
side = max(10, 0.25 * n_classes)
fig, ax = plt.subplots(figsize=(side + 2, side))
sns.heatmap(cm, annot=n_classes <= 30, fmt="d", cmap="Blues",
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("Random Forest Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
fig.tight_layout()
fig.savefig("random_forest_confusion_matrix.png")
plt.close(fig)  # the figure is only written to disk, not shown inline
print("Confusion matrix saved as 'random_forest_confusion_matrix.png'")

Confusion matrix saved as 'random_forest_confusion_matrix.png'


# Save Artifacts (Models, Vectorizer, Label Encoder)

In [36]:
# Persist everything needed to reproduce predictions later: the fitted
# vectorizer, both classifiers and the label encoder, one joblib file each.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

artifacts = {
    "tfidf_vectorizer.pkl": tfidf,
    "decision_tree_model.pkl": dt_model,
    "random_forest_model.pkl": rf_model,
    "label_encoder.pkl": le,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, os.path.join(model_dir, filename))

print("Artifacts saved successfully in the 'models' directory.")

Artifacts saved successfully in the 'models' directory.
