# **Import Libraries and Setup**

In [1]:
import os
import re
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import nltk
import spacy
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Ensure required NLTK packages are downloaded
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zafir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **1. Load the Dataset**

In [2]:
df = pd.read_csv("job_description_subset_first.csv")

In [3]:
df

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,294966963537218,3 to 9 Years,B.Tech,$65K-$96K,City of Port Louis,Mauritius,-20.3484,57.5522,Temporary,31536,...,824-652-9619x801,Web Designer,E-commerce Web Designer,Stack Overflow Jobs,E-commerce Web Designers specialize in designi...,"{'Flexible Spending Accounts (FSAs), Relocatio...",E-commerce web design UX/UI design Shopping ca...,Focus on designing user interfaces for e-comme...,Valero Energy,"{""Sector"":""Energy"",""Industry"":""Petroleum Refin..."
14996,1364760048987040,3 to 11 Years,M.Com,$55K-$116K,Willemstad,Curacao,12.1696,-68.9900,Full-Time,134005,...,844-896-0429x1384,Customer Service Representative,Live Chat Support Agent,SimplyHired,A Live Chat Support Agent provides customer su...,"{'Employee Assistance Programs (EAP), Tuition ...",Live chat support Online customer communicatio...,Offer real-time customer support through live ...,HDFC Ltd,"{""Sector"":""Financial Services"",""Industry"":""Ban..."
14997,1828052769874683,0 to 8 Years,M.Com,$62K-$104K,Prague,Czech Republic,49.8175,15.4730,Part-Time,117548,...,+1-635-614-8200,Software Tester,Quality Assurance Analyst,Stack Overflow Jobs,A Quality Assurance Analyst tests software and...,"{'Transportation Benefits, Professional Develo...",Quality assurance processes Testing methodolog...,Test software applications and systems to iden...,Alibaba Group,"{""Sector"":""E-commerce"",""Industry"":""E-commerce ..."
14998,597132853070793,5 to 8 Years,B.Com,$56K-$106K,Baku,Azerbaijan,40.1431,47.5769,Intern,115336,...,953.736.8405,Systems Administrator,Database Administrator,FlexJobs,"Database Administrators manage databases, ensu...","{'Casual Dress Code, Social and Recreational A...","Database management systems (e.g., MySQL, Orac...","Administer and optimize databases, ensuring da...",Glenmark Pharmaceuticals,"{""Sector"":""Pharmaceuticals"",""Industry"":""Pharma..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Job Id            15000 non-null  int64  
 1   Experience        15000 non-null  object 
 2   Qualifications    15000 non-null  object 
 3   Salary Range      15000 non-null  object 
 4   location          15000 non-null  object 
 5   Country           15000 non-null  object 
 6   latitude          15000 non-null  float64
 7   longitude         15000 non-null  float64
 8   Work Type         15000 non-null  object 
 9   Company Size      15000 non-null  int64  
 10  Job Posting Date  15000 non-null  object 
 11  Preference        15000 non-null  object 
 12  Contact Person    15000 non-null  object 
 13  Contact           15000 non-null  object 
 14  Job Title         15000 non-null  object 
 15  Role              15000 non-null  object 
 16  Job Portal        15000 non-null  object

In [5]:
df.describe()

Unnamed: 0,Job Id,latitude,longitude,Company Size
count,15000.0,15000.0,15000.0,15000.0
mean,1552358000000000.0,18.913126,15.935685,73682.708267
std,894354500000000.0,23.452494,71.48284,35480.644565
min,201817500000.0,-40.9006,-175.1982,12675.0
25%,772491000000000.0,4.5709,-15.3101,42591.0
50%,1558861000000000.0,17.6078,19.3744,73961.0
75%,2323712000000000.0,38.9697,48.5164,104234.0
max,3099079000000000.0,71.7069,178.065,134831.0


In [6]:
df.shape

(15000, 23)

In [7]:
df.size

345000

In [8]:
df.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

# **2. Data Cleaning and Preparation**

### **1. Handling Duplicate Data**

In [9]:
df.isnull().sum()

Job Id               0
Experience           0
Qualifications       0
Salary Range         0
location             0
Country              0
latitude             0
longitude            0
Work Type            0
Company Size         0
Job Posting Date     0
Preference           0
Contact Person       0
Contact              0
Job Title            0
Role                 0
Job Portal           0
Job Description      0
Benefits             0
skills               0
Responsibilities     0
Company              0
Company Profile     53
dtype: int64

In [10]:
df.duplicated().sum()

0

In [11]:
df.dtypes

Job Id                int64
Experience           object
Qualifications       object
Salary Range         object
location             object
Country              object
latitude            float64
longitude           float64
Work Type            object
Company Size          int64
Job Posting Date     object
Preference           object
Contact Person       object
Contact              object
Job Title            object
Role                 object
Job Portal           object
Job Description      object
Benefits             object
skills               object
Responsibilities     object
Company              object
Company Profile      object
dtype: object

In [12]:
df.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [13]:
# Filling null values with mode
for column in df.select_dtypes(include=['object']).columns:  # Select only categorical columns
    if df[column].isnull().any():  # Check if the column has any missing values
        df[column] = df[column].fillna(df[column].mode()[0])  # Fill with mode

In [14]:
df.isnull().sum()

Job Id              0
Experience          0
Qualifications      0
Salary Range        0
location            0
Country             0
latitude            0
longitude           0
Work Type           0
Company Size        0
Job Posting Date    0
Preference          0
Contact Person      0
Contact             0
Job Title           0
Role                0
Job Portal          0
Job Description     0
Benefits            0
skills              0
Responsibilities    0
Company             0
Company Profile     0
dtype: int64

In [15]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)
print("Shape after removing duplicates:", df.shape)

Shape after removing duplicates: (15000, 23)


### **2. Handling Missing Data**

In [16]:
# Drop rows with missing job_description or job_title
df.dropna(subset=["Job Description", "Job Title"], inplace=True) # Drop rows where critical fields are missing
print("Shape after dropping rows with missing values:", df.shape)

Shape after dropping rows with missing values: (15000, 23)


# **3. Handling Outliers Based on Job Description Word Count**


In [17]:
desc_length = df["Job Description"].apply(lambda x: len(str(x).split()))
q1, q3 = np.percentile(desc_length, [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
df = df[(desc_length >= lower_bound) & (desc_length <= upper_bound)]
print("Shape after handling outliers:", df.shape)

Shape after handling outliers: (14659, 23)


# **4. Text Cleaning with spaCy & NLTK (Lemmatization and Stemming)**

In [18]:
# Load spaCy model (Enable GPU if available)
# Enable GPU for spaCy (raises error if no GPU is available)
try:
    spacy.require_gpu()
    print("Using GPU for spaCy processing.")
except:
    print("No GPU available, using CPU.")  

Using GPU for spaCy processing.


In [19]:
# Initialize spaCy model and NLTK's PorterStemmer
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

In [20]:
def clean_text_preprocess(text):
    """
    Preprocess the input text by:
      - Converting to lowercase.
      - Removing HTML tags.
      - Removing non-alphabetical characters.
      - Removing extra whitespace.
      - Removing stopwords.
    """
    text = text.lower()
    text = re.sub(r"<[^>]+>", " ", text)  # Remove HTML tags
    text = re.sub(r"[^a-z\s]", " ", text)  # Keep only alphabets
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

In [21]:
# If you have any RAM requirement—8 GB, 16 GB, 32 GB, or 64 GB—you can replace batch_size = any GB accordingly
def clean_text_batch(texts, batch_size=32):
    """
    Process a list of texts using spaCy's nlp.pipe in batches.
    Applies pre-processing, lemmatization, and stemming.
    
    Args:
        texts (list): List of raw text strings.
        batch_size (int): Batch size for spaCy processing.
        
    Returns:
        list: List of cleaned text strings.
    """
    preprocessed_texts = [clean_text_preprocess(text) for text in texts]
    cleaned_texts = []
    for doc in nlp.pipe(preprocessed_texts, batch_size=batch_size):
        processed_tokens = [stemmer.stem(token.lemma_) for token in doc if token.is_alpha]
        cleaned_texts.append(" ".join(processed_tokens))
    return cleaned_texts

In [22]:
# If you have any RAM requirement—8 GB, 16 GB, 32 GB, or 64 GB—you can replace batch_size = any GB accordingly
# Apply cleaning in batches or If 
df["Cleaned Job Description"] = clean_text_batch(df["Job Description"].astype(str).tolist(), batch_size=32)

In [23]:
# Standardize job titles (lowercasing and removing extra spaces)
df["Cleaned Job Title"] = df["Job Title"].apply(lambda x: x.lower().strip())

In [24]:
print("Preview of cleaned job descriptions and titles:")
df[["Cleaned Job Description", "Cleaned Job Title"]].head()

Preview of cleaned job descriptions and titles:


Unnamed: 0,Cleaned Job Description,Cleaned Job Title
0,social medium manag overse organ social medium...,digital marketing specialist
1,frontend web develop design implement user int...,web developer
2,qualiti control manag establish enforc qualiti...,operations manager
3,wireless network engin design implement mainta...,network engineer
4,confer manag coordin manag confer meet event p...,event manager


In [25]:
# Analyze job title distribution (show top 10)
print("Job title distribution (top 10):")
df["Cleaned Job Title"].value_counts().head(10)

Job title distribution (top 10):


Cleaned Job Title
ux/ui designer                  493
software engineer               260
digital marketing specialist    244
network engineer                214
executive assistant             200
software tester                 188
sales representative            183
procurement manager             178
financial advisor               176
customer support specialist     170
Name: count, dtype: int64

# **5. Feature Scaling for Salary Range**

In [26]:
def parse_salary_range(salary_str):
    """
    Converts a salary range string (e.g., "$59K-$99K") into a numeric average.
    Assumes salary values are in 'K' (thousands) and returns the average salary in thousands.
    """
    # Remove dollar signs, 'K', commas, and extra whitespace
    salary_str = salary_str.replace("$", "").replace("K", "").replace(",", "").strip()
    try:
        # Split the range into minimum and maximum parts
        parts = salary_str.split("-")
        if len(parts) == 2:
            salary_min = float(parts[0])
            salary_max = float(parts[1])
            return (salary_min + salary_max) / 2 # Average of min and max
        else:
            return np.nan
    except Exception as e:
        return np.nan

if "Salary Range" in df.columns:
    # Apply the parsing function to create a numeric average salary column
    df["Salary Avg"] = df["Salary Range"].apply(parse_salary_range)

    # Drop rows where the salary couldn't be parsed (optional)
    df.dropna(subset=["Salary Avg"], inplace=True)

    # Scale the numeric salary values using MinMaxScaler
    scaler = MinMaxScaler() # Alternative: StandardScaler()
    df["Scaled Salary"] = scaler.fit_transform(df[["Salary Avg"]])
    
    print("Scaled Salary (first 5 rows):")
    print(df["Scaled Salary"].head()) 

Scaled Salary (first 5 rows):
0    0.383333
1    0.616667
2    0.500000
3    0.350000
4    0.266667
Name: Scaled Salary, dtype: float64


# **6. ENCODING THE COLUMNS FOR BETTER PREDICTION**

### Assume df is your DataFrame

In [27]:
# Identify categorical columns and remove the column we don't want to encode.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('Job Title')

In [28]:
# (Optional) Check the number of unique values in each categorical column.
print("Unique values per categorical column:")
for col in categorical_cols:
    print(f"{col}: {df[col].nunique()}")

Unique values per categorical column:
Experience: 48
Qualifications: 10
Salary Range: 561
location: 214
Country: 216
Work Type: 5
Job Posting Date: 731
Preference: 3
Contact Person: 13320
Contact: 14659
Role: 366
Job Portal: 16
Job Description: 366
Benefits: 11
skills: 366
Responsibilities: 365
Company: 888
Company Profile: 884
Cleaned Job Description: 366
Cleaned Job Title: 143


In [29]:
# Use sparse output to prevent memory explosion
encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=True)
encoded_data = encoder.fit_transform(df[categorical_cols])

In [30]:
# Convert sparse matrix to DataFrame
encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

In [31]:
# (Optional) If you want to merge the encoded features back with the original DataFrame,
# you can drop the original categorical columns and join the new ones.
df_encoded = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

print("One-hot encoding completed. New DataFrame shape:", df.shape)

One-hot encoding completed. New DataFrame shape: (14659, 27)


In [32]:
# Drop any remaining NaN values
df.dropna(inplace=True)

In [37]:
df.isnull().sum()

Job Id                     0
Experience                 0
Qualifications             0
Salary Range               0
location                   0
Country                    0
latitude                   0
longitude                  0
Work Type                  0
Company Size               0
Job Posting Date           0
Preference                 0
Contact Person             0
Contact                    0
Job Title                  0
Role                       0
Job Portal                 0
Job Description            0
Benefits                   0
skills                     0
Responsibilities           0
Company                    0
Company Profile            0
Cleaned Job Description    0
Cleaned Job Title          0
Salary Avg                 0
Scaled Salary              0
dtype: int64

In [38]:
df.isnull()

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Job Description,Benefits,skills,Responsibilities,Company,Company Profile,Cleaned Job Description,Cleaned Job Title,Salary Avg,Scaled Salary
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14996,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14997,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14998,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# **7. Feature Extraction using TF-IDF for Resume Matching**

In [39]:
tfidf = TfidfVectorizer(stop_words="english", max_features=5000, binary=True)
X = tfidf.fit_transform(df["Cleaned Job Description"])
print("\nTF-IDF feature matrix shape for job descriptions:", X.shape)


TF-IDF feature matrix shape for job descriptions: (14659, 931)


In [40]:
X[0].sum()

3.8934708757719436

In [41]:
tfidf.get_feature_names_out()

array(['abus', 'academ', 'accept', 'access', 'accordingli', 'account',
       'accur', 'accuraci', 'achiev', 'acquir', 'acquisit', 'action',
       'activ', 'acut', 'ad', 'adapt', 'addict', 'address', 'adher',
       'adjust', 'administ', 'administr', 'adolesc', 'adopt', 'adult',
       'advanc', 'advers', 'advertis', 'advic', 'advis', 'advisor',
       'advoc', 'aerodynam', 'aesthet', 'age', 'agenc', 'agent', 'agil',
       'agreement', 'aid', 'aim', 'air', 'aircraft', 'algorithm', 'align',
       'allianc', 'alloc', 'analysi', 'analyst', 'analyt', 'analyz',
       'anim', 'answer', 'anthropolog', 'api', 'app', 'appeal', 'applic',
       'appreci', 'architect', 'architectur', 'area', 'arrang', 'art',
       'articl', 'artist', 'artwork', 'aspect', 'assess', 'asset',
       'assist', 'associ', 'assur', 'attend', 'attende', 'attorney',
       'attract', 'audienc', 'audit', 'autom', 'avail', 'avion', 'awar',
       'backend', 'backlog', 'base', 'beauti', 'behavior', 'belong',
       'ben

In [42]:
def match_resume_to_jobs(resume_text, top_n=5):
    """
    Match a candidate's resume text to job descriptions based on cosine similarity.
    """
    resume_text_clean = resume_text.lower()
    resume_text_clean = re.sub(r"[^a-z\s]", " ", resume_text_clean)
    resume_text_clean = re.sub(r"\s+", " ", resume_text_clean).strip()
    resume_vector = tfidf.transform([resume_text_clean])
    # cosine_sim = cosine_similarity(resume_vector, tfidf_matrix).flatten()
    cosine_sim = cosine_similarity(resume_vector, X).flatten()
    top_indices = cosine_sim.argsort()[-top_n:][::-1]

    print("\nTop matching job descriptions:")
    for idx in top_indices:
        print(f"Job ID: {df['Job Id'].iloc[idx]}, Similarity: {cosine_sim[idx]:.4f}")
        print("Job Title:", df["Job Title"].iloc[idx])
        print("Job Description Preview:", df["Job Description"].iloc[idx][:200], "...\n")

# Test the resume matching function with an example resume
example_resume = """
Experienced digital marketer with expertise in social media strategy,
content creation, and data analytics. Proven track record of boosting engagement
and driving conversions through innovative campaigns.
"""
match_resume_to_jobs(example_resume, top_n=3) 


Top matching job descriptions:
Job ID: 531124294656644, Similarity: 0.3064
Job Title: Marketing Specialist
Job Description Preview: Content Marketing Managers oversee the creation and distribution of content to attract and engage customers. They develop content strategies, manage content teams, and track performance metrics to opt ...

Job ID: 2172401881813554, Similarity: 0.3064
Job Title: Marketing Specialist
Job Description Preview: Content Marketing Managers oversee the creation and distribution of content to attract and engage customers. They develop content strategies, manage content teams, and track performance metrics to opt ...

Job ID: 2657422898340473, Similarity: 0.3064
Job Title: Marketing Specialist
Job Description Preview: Content Marketing Managers oversee the creation and distribution of content to attract and engage customers. They develop content strategies, manage content teams, and track performance metrics to opt ...



# **8. Encode Target Variable (Job Titles)**

In [43]:
le = LabelEncoder()
y = le.fit_transform(df["Job Title"])
print("Number of unique job titles:", len(le.classes_))
print("Sample job title classes:", le.classes_[:10])

Number of unique job titles: 143
Sample job title classes: ['Account Director' 'Account Executive' 'Account Manager' 'Accountant'
 'Administrative Assistant' 'Aerospace Engineer' 'Architect'
 'Architectural Designer' 'Art Director' 'Art Teacher']


# **9. Train-Test Split**

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [45]:
print("Training set shape:", X_train.shape)

Training set shape: (11727, 931)


In [46]:
print("Test set shape:", X_test.shape)

Test set shape: (2932, 931)


# **10. Model Training and Analysis**

### Train a Decision Tree Classifier

In [47]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt, target_names=le.classes_))

Decision Tree Accuracy: 1.0
Decision Tree Classification Report:
                                     precision    recall  f1-score   support

                   Account Director       1.00      1.00      1.00        12
                  Account Executive       1.00      1.00      1.00        21
                    Account Manager       1.00      1.00      1.00        26
                         Accountant       1.00      1.00      1.00        17
           Administrative Assistant       1.00      1.00      1.00        33
                 Aerospace Engineer       1.00      1.00      1.00        20
                          Architect       1.00      1.00      1.00        27
             Architectural Designer       1.00      1.00      1.00        13
                       Art Director       1.00      1.00      1.00        20
                        Art Teacher       1.00      1.00      1.00        22
                 Back-End Developer       1.00      1.00      1.00        13
          

### Train a Random Forest Classifier

In [48]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))

Random Forest Accuracy: 1.0
Random Forest Classification Report:
                                     precision    recall  f1-score   support

                   Account Director       1.00      1.00      1.00        12
                  Account Executive       1.00      1.00      1.00        21
                    Account Manager       1.00      1.00      1.00        26
                         Accountant       1.00      1.00      1.00        17
           Administrative Assistant       1.00      1.00      1.00        33
                 Aerospace Engineer       1.00      1.00      1.00        20
                          Architect       1.00      1.00      1.00        27
             Architectural Designer       1.00      1.00      1.00        13
                       Art Director       1.00      1.00      1.00        20
                        Art Teacher       1.00      1.00      1.00        22
                 Back-End Developer       1.00      1.00      1.00        13
          

### Plot and save confusion matrix for Random Forest model

In [49]:
cm = confusion_matrix(y_test, y_pred_rf)
plt.figure(figsize=(35, 25))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title("Random Forest Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.savefig("random_forest_confusion_matrix.png")
plt.close()
print("Confusion matrix saved as 'random_forest_confusion_matrix.png'")

Confusion matrix saved as 'random_forest_confusion_matrix.png'


# **11. Save Artifacts (Models, Vectorizer, Label Encoder)**

In [50]:
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(tfidf, os.path.join(model_dir, "tfidf_vectorizer.pkl"))
joblib.dump(dt_model, os.path.join(model_dir, "decision_tree_model.pkl"))
joblib.dump(rf_model, os.path.join(model_dir, "random_forest_model.pkl"))
joblib.dump(le, os.path.join(model_dir, "label_encoder.pkl"))

print("Artifacts saved successfully in the 'models' directory.")

Artifacts saved successfully in the 'models' directory.
