In [1]:
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import pickle

In [2]:
df = pd.read_csv("project_data.csv")

In [3]:
df.head()

Unnamed: 0,Experience,UG,Location,PG,Salary,Department,Role Category
0,0 - 1 years,Graduation Not Required,Hiring office located in Pune,,2.25-2.75 Lacs P.A.,"BFSI, Investments & Trading",Lending
1,0 - 1 years,Any Graduate,Bengaluru,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",Banking Operations
2,0 - 1 years,Any Graduate,Bengaluru,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",Lending
3,1 - 4 years,Any Graduate,Hyderabad,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",Banking Operations
4,0 - 4 years,Any Graduate,"Kolkata, Mumbai, Hyderabad, Bengaluru",Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",Lending


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56685 entries, 0 to 56684
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Experience     56685 non-null  object
 1   UG             52703 non-null  object
 2   Location       56685 non-null  object
 3   PG             35645 non-null  object
 4   Salary         56685 non-null  object
 5   Department     54701 non-null  object
 6   Role Category  54687 non-null  object
dtypes: object(7)
memory usage: 3.0+ MB


In [5]:
pd.set_option('display.max_rows', None)

# Pre-Processing

## Experience

In [6]:
# Function to process the Experience column
def process_experience(exp):
    exp = exp.lower().strip()  # Standardize the string
    if "no fixed duration" in exp:  # Handle "No fixed duration"
        return 0
    elif "months" in exp:  # Convert months to years
        months = [int(s) for s in exp.split() if s.isdigit()]
        if months:
            return months[0] / 12
        else:  # Handle ambiguous "few months"
            return 0.25  # Default to 0.25 years (~3 months)
    elif "years" in exp and "-" in exp:  # Handle ranges of years
        years = [float(x) for x in exp.split("years")[0].split("-")]
        mean_years = sum(years) / len(years)
        # Cap at a maximum threshold (e.g., 30 years)
        return min(mean_years, 30)
    elif "years" in exp:  # Handle single years (e.g., "15 years")
        return min(float(exp.split()[0]), 30)
    elif "year" in exp:  # Handle ambiguous "a year" or "1 year"
        return 1
    elif "month" in exp: # Handle ambiguous "a month" or "1 month"
        return 0

In [7]:
# Apply the function to the Experience column
df['Processed_Experience'] = df['Experience'].apply(process_experience)

In [8]:
# Round the processed experience values to 2 decimal places
df['Processed_Experience'] = df['Processed_Experience'].round(2)

In [9]:
# Define bins and labels for the ranges
bins = [0, 1, 3, 5, 10, 15, float('inf')]
labels = ['Entry-level', 'Junior', 'Mid-level', 'Experienced', 'Senior', 'Expert']

# Create a new column for categorized experience
df['Experience_Category'] = pd.cut(df['Processed_Experience'], bins=bins, labels=labels, right=False)

In [10]:
df['Experience_Category'].value_counts()

Experience_Category
Experienced    22165
Mid-level      16052
Junior         11372
Senior          3564
Entry-level     2013
Expert          1519
Name: count, dtype: int64

## UG PG

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56685 entries, 0 to 56684
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Experience            56685 non-null  object  
 1   UG                    52703 non-null  object  
 2   Location              56685 non-null  object  
 3   PG                    35645 non-null  object  
 4   Salary                56685 non-null  object  
 5   Department            54701 non-null  object  
 6   Role Category         54687 non-null  object  
 7   Processed_Experience  56685 non-null  float64 
 8   Experience_Category   56685 non-null  category
dtypes: category(1), float64(1), object(7)
memory usage: 3.5+ MB


In [12]:
df['UG'] = df['UG'].fillna('NA')
df['PG'] = df['PG'].fillna('NA')

In [13]:
# Convert the 'UG' column to binary encoding based on degree requirements
def encode_degree_requirement(ug_value):
    if pd.isna(ug_value) or ug_value in ['Graduation Not Required', 'NA']:
        return 0  # No degree required
    else:
        return 1  # Degree required

# Apply the function to encode the UG column
df['UG_Encoded'] = df['UG'].apply(encode_degree_requirement)

In [14]:
df['UG_Encoded'].value_counts()

UG_Encoded
1    50223
0     6462
Name: count, dtype: int64

In [15]:
# Function to encode the 'PG' column based on multiple conditions
def encode_pg_degree_requirement(pg_value):
    if pd.isna(pg_value):  # Check for missing or NaN values
        return 0  # No degree required
    pg_value = pg_value.lower()  # Convert to lowercase for case-insensitivity
    # Conditions for "No degree required"
    no_degree_conditions = ['post graduation not required', 'not applicable', 'na', 'none', 'no post graduation']
    if any(condition in pg_value for condition in no_degree_conditions):
        return 0  # No degree required
    else:
        return 1  # Degree required

In [16]:
df['PG_Encoded'] = df['PG'].apply(encode_pg_degree_requirement)

In [17]:
df['PG_Encoded'].value_counts()

PG_Encoded
1    34755
0    21930
Name: count, dtype: int64

## Location

In [18]:
# df.drop('Location', axis=1, inplace=True)

In [19]:
def rmhirefromvalue(value):
    if 'Hiring office located in' in value:
        value = value.split('located in')[1].strip()
        return value
    else:
        
        return value


In [20]:
df['Location']=df['Location'].apply(rmhirefromvalue)

In [21]:
def extract_city(location):
    # Rule 1: If comma-separated, take the first part
    if ',' in location:
        return location.split(',')[0].strip()
    # Rule 2: For addresses, use a regex pattern to find potential city-like words
    # Example: Capitalize words with more than 2 characters
    match = re.search(r'\b[A-Z][a-zA-Z]+\b', location)
    if match:
        return match.group(0)
    # Rule 3: Return as is if no pattern matches
    return location

In [22]:
df['City'] = df['Location'].apply(extract_city)

In [23]:
cities_list = ['Bengaluru', 'Mumbai', 'Delhi', 'Hyderabad', 'Chennai', 'Pune', 'Kolkata', 
               'Ahmedabad', 'Jaipur', 'Coimbatore', 'Indore', 'Surat', 'Vadodara', 'Lucknow', 
               'Nagpur', 'Bhopal', 'Chandigarh', 'Thiruvananthapuram', 'Visakhapatnam', 'Patna','Gurugram']


In [24]:
def func_city(location):
    values=[]
    if ',' in location:
        values.extend(location.split(','))
    else:
        values.append(location)
    for v in values:
        for c in cities_list:
            if v in c :
                return c
    return location

In [25]:
def func_city1(location):
    # values=[]
    # if ',' in location:
    #     values.extend(location.split(','))
    # else:
    #     values.append(location)
    # for v in values:
    #     for c in cities_list:
    #         if v in c :
    #             return c
    # return location
    for city in cities_list:
        if city in location:
            return city
    return location

In [26]:
df['City'] = df['City'].replace("New Delhi","Delhi")
df['City'] = df['City'].replace("Navi","Mumbai")
df['City'] = df['City'].replace("Bengalore","Bengaluru")


In [27]:
def value_conversion(location):
    for city in cities_list:
        if city.lower() in  location.lower():
            return city
    return location

In [28]:
df['City'] = df['City'].apply(value_conversion)

In [29]:
mumbai_city = [
        "Colaba", "Fort", "Churchgate", "Marine Lines", 
        "Nariman Point", "Cuffe Parade", "Malabar Hill", 
        "Girgaon (Girgaum)", "Dadar", "Tardeo",
        "Byculla", "Parel", "Lower Parel", 
        "Worli", "Mahalaxmi",
        "Bandra", "Khar", "Santacruz", "Vile Parle", 
        "Andheri", "Jogeshwari", "Goregaon", "Malad", 
        "Kandivali", "Borivali", "Dahisar",
        "Kurla", "Chembur", "Ghatkopar", 
        "Vikhroli", "Bhandup", "Mulund",
        "Dockyard Road", "Sewri", "Wadala", 
        "Mankhurd", "Govandi",
        "Powai", "Chandivali",
        "Vashi", "Nerul", "Kharghar", "Panvel",
        "Thane", "Kalyan", "Dombivli", 
        "Ambernath", "Ulhasnagar", "Vasai-Virar"
    ]

In [30]:
delhi_city = [
        "Connaught Place", "Karol Bagh", "Daryaganj", 
        "Paharganj", "Pragati Maidan",
        "Kashmere Gate", "Civil Lines", "Model Town", 
        "GTB Nagar", "Shakti Nagar",
        "Saket", "Hauz Khas", "Greater Kailash", 
        "Lajpat Nagar", "Defence Colony", "Vasant Kunj", 
        "Mehrauli", "Malviya Nagar",
        "Rajouri Garden", "Punjabi Bagh", "Janakpuri", 
        "Tilak Nagar", "Uttam Nagar", "Paschim Vihar",
        "Preet Vihar", "Laxmi Nagar", "Mayur Vihar", 
        "Patparganj", "Anand Vihar", "Shahdara",
        "Chanakyapuri", "Sarojini Nagar", "RK Puram", 
        "Lodhi Colony", "Jor Bagh", "Diplomatic Enclave",
        "Rohini", "Pitampura", "Ashok Vihar", 
        "Shalimar Bagh", "Narela",
        "Dwarka", "Najafgarh", "Palam", 
        "Vasant Vihar", "Mahipalpur",
        "Seelampur", "Yamuna Vihar", "Shastri Park", 
        "Bhajanpura", "Khajuri Khas",
        "Bawana", "Alipur", "Najafgarh Rural", 
        "Narela Rural"
    ]

In [31]:
def mumbai_cities_filter(location):
    for city in mumbai_city:
        if city.lower() in  location.lower():
            return 'Mumbai'
    return location

In [32]:
def delhi_cities_filter(location):
    for city in delhi_city:
        if city.lower() in  location.lower():
            return 'Delhi'
    return location

In [33]:
df['City'] = df['City'].apply(delhi_cities_filter)
df['City'] = df['City'].apply(mumbai_cities_filter)


In [34]:
df['City'].value_counts().head(20)

City
Bengaluru     10959
Mumbai         6652
Hyderabad      6173
Pune           5453
Chennai        3800
Gurugram       3112
Noida          3044
Kolkata        2552
Ahmedabad      1313
Coimbatore      757
Jaipur          584
Kochi           550
Mohali          393
Vadodara        334
Nagpur          315
Delhi           273
Surat           266
Lucknow         257
Chandigarh      243
Faridabad       217
Name: count, dtype: int64

In [35]:
top_cities=['Bengaluru','Mumbai',
'Hyderabad','Noida',
'Pune','Chennai',
'Gurugram',
'Kolkata','Ahmedabad',
'Mohali','Kochi',
'Jaipur','Coimbatore',
'Delhi','Vadodara',
'Lucknow','Madurai',
'Surat','Faridabad',
'Chandigarh']

In [36]:
def otherCategory(value):
    for city in top_cities:
        if value == city:
            return city
    return 'other'

In [37]:
df['City']= df['City'].apply(otherCategory)

## Department & Role Category

In [38]:
df = df.dropna(subset=['Department'])

In [39]:
df['Role Category'] = df['Role Category'].fillna('Others')

In [40]:
# Define the threshold for grouping departments
dept_threshold = 180

# Identify rare departments
rare_departments = df['Department'].value_counts()[df['Department'].value_counts() < dept_threshold].index

# Replace rare departments with "Others"
df['Department'] = df['Department'].replace(rare_departments, 'Others')

In [41]:
# Define the threshold for grouping role categories
role_threshold = 100

# Identify rare role categories
rare_roles = df['Role Category'].value_counts()[df['Role Category'].value_counts() < role_threshold].index

# Replace rare role categories with "Others"
df['Role Category'] = df['Role Category'].replace(rare_roles, 'Others')

In [42]:
# Ensure at least one unique role category per department
for dept in df['Department'].unique():
    # Subset data for the current department
    dept_roles = df[df['Department'] == dept]['Role Category']
    
    # Check if all roles for this department are "Others"
    if dept_roles.nunique() == 1 and dept_roles.iloc[0] == 'Others':
        # Find the most frequent rare role for this department
        original_roles = df[(df['Department'] == dept) & (df['Role Category'].isin(rare_roles))]
        if not original_roles.empty:
            most_frequent_role = original_roles['Role Category'].mode()[0]

            # Replace "Others" with the most frequent role
            df.loc[
                (df['Department'] == dept) & (df['Role Category'] == 'Others'),
                'Role Category'
            ] = most_frequent_role

In [43]:
# Correct "Others - Finance" anomaly
df.loc[(df['Department'] == 'Others') & (df['Role Category'] == 'Finance'), 'Department'] = 'Finance & Accounting'

In [44]:
# Append department name to "Others" role for better specificity
df.loc[df['Role Category'] == 'Others', 'Role Category'] = df['Department'] + " - Others"

In [45]:
# Define the threshold for grouping role categories
role_threshold = 80

# Identify rare role categories
rare_roles = df['Role Category'].value_counts()[df['Role Category'].value_counts() < role_threshold].index

# Replace rare role categories with "Others"
df['Role Category'] = df['Role Category'].replace(rare_roles, 'Others')

In [46]:
# Create a combined feature for Department and Role Category
df['Department_Role'] = df['Department'] + " - " + df['Role Category']

In [47]:
df.head()

Unnamed: 0,Experience,UG,Location,PG,Salary,Department,Role Category,Processed_Experience,Experience_Category,UG_Encoded,PG_Encoded,City,Department_Role
0,0 - 1 years,Graduation Not Required,Pune,,2.25-2.75 Lacs P.A.,"BFSI, Investments & Trading",Lending,0.5,Entry-level,0,0,Pune,"BFSI, Investments & Trading - Lending"
1,0 - 1 years,Any Graduate,Bengaluru,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",Banking Operations,0.5,Entry-level,1,1,Bengaluru,"BFSI, Investments & Trading - Banking Operations"
2,0 - 1 years,Any Graduate,Bengaluru,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",Lending,0.5,Entry-level,1,1,Bengaluru,"BFSI, Investments & Trading - Lending"
3,1 - 4 years,Any Graduate,Hyderabad,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",Banking Operations,2.5,Junior,1,1,Hyderabad,"BFSI, Investments & Trading - Banking Operations"
4,0 - 4 years,Any Graduate,"Kolkata, Mumbai, Hyderabad, Bengaluru",Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",Lending,2.0,Junior,1,1,Kolkata,"BFSI, Investments & Trading - Lending"


## Salary

In [48]:
df = df.loc[df['Salary'] != 'Not Disclosed']

In [49]:
def remove_parentheses_content(salary):
    if pd.isna(salary):
        return salary
    return re.sub(r'\(.*?\)', '', salary).strip()
df['Salary'] = df['Salary'].apply(remove_parentheses_content)

In [50]:
df['Salary'].value_counts()

Salary
3-6 Lacs P.A.            378
3-5 Lacs P.A.            322
2.5-4 Lacs P.A.          293
3-4 Lacs P.A.            266
2-3 Lacs P.A.            254
2.25-4 Lacs P.A.         245
2-4 Lacs P.A.            226
2-5 Lacs P.A.            204
3-4.5 Lacs P.A.          188
2-3.5 Lacs P.A.          185
50,000-3 Lacs P.A.       185
4-6 Lacs P.A.            180
2.5-3.5 Lacs P.A.        177
15-25 Lacs P.A.          146
10-20 Lacs P.A.          140
Unpaid                   136
5-10 Lacs P.A.           133
2.5-5 Lacs P.A.          132
2.5-4.25 Lacs P.A.       129
3-8 Lacs P.A.            127
4-9 Lacs P.A.            126
1-3 Lacs P.A.            124
3-7 Lacs P.A.            121
2.5-3 Lacs P.A.          119
15-30 Lacs P.A.          117
3-5.5 Lacs P.A.          116
2.75-4 Lacs P.A.         115
4-8 Lacs P.A.            110
4-7 Lacs P.A.            109
2.5-5.5 Lacs P.A.        107
5-8 Lacs P.A.            107
5-7 Lacs P.A.            106
2.5-4.5 Lacs P.A.        104
15-20 Lacs P.A.          103
3-3.5 L

In [51]:
def value_lacsPA(value):
    if '-' in value:
        lower, upper = value.split("-")
        if "Lacs P.A." in  upper:
            upper= upper.replace("Lacs P.A.","")
            if ',' in lower:
                lower = lower.replace(",","")
            lower = float(lower)
            upper = float(upper)
            if lower<1000 and upper<1000:
                return f"{lower*100000}-{upper*100000}"
            elif lower>1000 and upper<1000:
                return f"{lower}-{upper*100000}"
            elif lower<1000 and upper>1000:
                return f"{lower*100000}-{upper}"
            elif lower >1000 and upper >1000:
                return f"{lower}-{upper}"
    return value

In [52]:
df['Salary'] = df['Salary'].apply(value_lacsPA)

In [53]:
def convert_monthly_to_annual(salary):
    if '/month' in salary: 
        monthly_salary = float(salary.replace('/month', '').replace(',', ''))
        annual_salary = monthly_salary * 12
        return f"{annual_salary}"
    return salary

In [54]:
df['Salary']  = df['Salary'].apply(convert_monthly_to_annual)

In [55]:
def existslacspa(value):
    if 'Lacs P.A.' in value:
        value = value.replace('Lacs P.A.',"")
        return f"{float(value)*100000}"
    return value

In [56]:
df['Salary']  = df['Salary'].apply(existslacspa)

In [57]:
df['Salary']=df['Salary'].str.replace("P.A.","",regex=False)

In [58]:
def funcforCrLac(value):
    if 'and' in value:
        value1,value2 = value.split("and")
        value1 = value1.replace("Cr", "")
        
        value1 = float(value1)*10000000
        return f"{value1}"
        
    if "Cr" in value:
        if '-' in value:
            lower, upper = value.split("-")
            if "Lacs" in lower:
                lower = lower.replace("Lacs","")
                upper = upper.replace("Cr","")
                lower = float(lower)*100000
                upper = float(upper)*10000000
                return f"{lower}-{upper}"
    
            upper = upper.replace("Cr","")
            lower = float(lower)*10000000
            upper = float(upper)*10000000
            return  f"{lower}-{upper}"
        else:
            value = value.replace("Cr","")
            value = float(value)*10000000
            return f"{value}"
    return f"{value}"

In [59]:
df['Salary'] = df['Salary'].apply(funcforCrLac)

In [60]:
def removecommas(value):
    if '-' in value:
        lower, upper = value.split("-")
        if ',' in lower :
            lower = lower.replace(",","")
        if ',' in upper:
            upper = upper.replace(",","")
        return f"{lower}-{upper}"
    elif ',' in value:
        return f"{value.replace(",","")}"
    else:
        return value

In [61]:
df['Salary'] = df['Salary'].apply(removecommas)

In [62]:
def functForMeanSal(value):
    if '-' in value:
        lower, upper = value.split("-")
        lower = float(lower)
        upper = float(upper)
        mean = (upper+lower)/2
        return mean
    elif value == 'Unpaid':
        return 0
    else:
        value = float(value)
        return value

In [63]:
df['Salary'] = df['Salary'].apply(functForMeanSal)

In [64]:
def salCategories(value):
    if 0 <= value <350000:
        return '[0,350000]'
    elif 350000 <= value <600000:
        return '[350000,600000]'
    elif 600000 <= value <1000000:
        return '[600000,1000000]'
    elif 1000000 <= value <=1500000:
        return '[1000000,1500000]'
    elif value>1500000:
        return 'above 1500000'

In [65]:
df['labelSal'] = df['Salary'].apply(salCategories)

In [66]:
df['labelSal'].value_counts()

labelSal
[0,350000]           6629
[350000,600000]      5102
above 1500000        2110
[600000,1000000]     2052
[1000000,1500000]    1505
Name: count, dtype: int64

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17398 entries, 0 to 56684
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Experience            17398 non-null  object  
 1   UG                    17398 non-null  object  
 2   Location              17398 non-null  object  
 3   PG                    17398 non-null  object  
 4   Salary                17398 non-null  float64 
 5   Department            17398 non-null  object  
 6   Role Category         17398 non-null  object  
 7   Processed_Experience  17398 non-null  float64 
 8   Experience_Category   17398 non-null  category
 9   UG_Encoded            17398 non-null  int64   
 10  PG_Encoded            17398 non-null  int64   
 11  City                  17398 non-null  object  
 12  Department_Role       17398 non-null  object  
 13  labelSal              17398 non-null  object  
dtypes: category(1), float64(2), int64(2), object(9)
memory usag

## convert all qualitative columns to quantitative ones

In [68]:
# convert all qualitative columns to quantitative ones
from sklearn.preprocessing import LabelEncoder

# create encoders
df['Experience_Category'] = LabelEncoder().fit_transform(df['Experience_Category'])
df['UG_Encoded'] = LabelEncoder().fit_transform(df['UG_Encoded'])
df['PG_Encoded'] = LabelEncoder().fit_transform(df['PG_Encoded'])
df['Department_Role'] = LabelEncoder().fit_transform(df['Department_Role'])
df['Department'] = LabelEncoder().fit_transform(df['Department'])
df['Role Category'] = LabelEncoder().fit_transform(df['Role Category'])
df['City'] = LabelEncoder().fit_transform(df['City'])
df['labelSal'] = LabelEncoder().fit_transform(df['labelSal'])

In [69]:
df=df[['Experience_Category', 'City', 'UG_Encoded', 'PG_Encoded', 'Department_Role', 'Department', 'Role Category', 'labelSal']]

In [70]:
df.corr()

Unnamed: 0,Experience_Category,City,UG_Encoded,PG_Encoded,Department_Role,Department,Role Category,labelSal
Experience_Category,1.0,0.087804,-0.009905,-0.091795,0.078836,0.088754,-0.118868,-0.08001
City,0.087804,1.0,0.038118,0.026197,0.093587,0.100035,-0.138595,-0.076445
UG_Encoded,-0.009905,0.038118,1.0,-0.10902,0.128832,0.130548,-0.084083,0.072249
PG_Encoded,-0.091795,0.026197,-0.10902,1.0,0.034582,0.028939,-0.067891,0.171511
Department_Role,0.078836,0.093587,0.128832,0.034582,1.0,0.995408,-0.236367,-0.031627
Department,0.088754,0.100035,0.130548,0.028939,0.995408,1.0,-0.292895,-0.034042
Role Category,-0.118868,-0.138595,-0.084083,-0.067891,-0.236367,-0.292895,1.0,0.109505
labelSal,-0.08001,-0.076445,0.072249,0.171511,-0.031627,-0.034042,0.109505,1.0


In [71]:
# decide the x and y
x = df.drop(['Department_Role', 'labelSal'], axis=1)
y = df['labelSal']

In [72]:
# split the data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=123456)

In [73]:
from sklearn.linear_model import LogisticRegression

def create_model_logistic_regression():
    # create the model
    model = LogisticRegression(max_iter=1000)
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [74]:
from sklearn.naive_bayes import GaussianNB

def create_model_naive_bayes():
    # create the model
    model = GaussianNB()
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [75]:
from sklearn.neighbors import KNeighborsClassifier

def create_model_knn():
    # create the model
    model = KNeighborsClassifier(n_neighbors=5)
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [76]:
from sklearn.svm import SVC

def create_model_svm():
    # create the model
    model = SVC()
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [77]:
from sklearn.tree import DecisionTreeClassifier

def create_model_decision_tree():
    # create the model
    model = DecisionTreeClassifier()
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [78]:
from sklearn.ensemble import RandomForestClassifier

def create_model_random_forest():
    # create a model
    model = RandomForestClassifier(n_estimators=500)

    # train the model
    model.fit(x_train, y_train)

    return model

In [79]:
from catboost import CatBoostClassifier

def create_model_catboost():
    model = CatBoostClassifier(logging_level='Silent', depth= 8, iterations= 200, l2_leaf_reg= 1, learning_rate= 0.1)
    model.fit(x_train, y_train)
    return model

In [80]:
from sklearn.ensemble import AdaBoostClassifier

def create_model_ada_boost():
    model = AdaBoostClassifier()
    model.fit(x_train, y_train)
    return model

In [81]:
from sklearn.ensemble import GradientBoostingClassifier

def create_model_grandient_boost():
    model = GradientBoostingClassifier(learning_rate= 0.1, max_depth= 5, n_estimators= 100, subsample= 0.8)
    model.fit(x_train, y_train)
    return model

In [82]:
from xgboost import XGBClassifier

def create_model_xgboost():
    model = XGBClassifier(colsample_bytree= 0.8, learning_rate= 0.1, max_depth= 7, n_estimators= 100, subsample= 0.8)
    model.fit(x_train, y_train)
    return model

In [83]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model):
    # training accuracry
    y_pred = model.predict(x_train)
    y_true = y_train

    # calculate training accuracy
    training_accuracy = accuracy_score(y_true, y_pred)

    # testing accuracy
    y_pred = model.predict(x_test)
    y_true = y_test

    # get the performance metrics for testing
    testing_accuracy = accuracy_score(y_true, y_pred)
    testing_precision = precision_score(y_true, y_pred,average='macro')
    testing_recall = recall_score(y_true, y_pred,average='macro')
    testing_f1 = f1_score(y_true, y_pred,average='macro')

    return training_accuracy, testing_accuracy, testing_precision, testing_recall, testing_f1

In [84]:
models = [
    ("Logistic Regression", create_model_logistic_regression(), (0, 0)),
    ("Naive Bayes", create_model_naive_bayes(), (0, 1)),
    ("KNN", create_model_knn(), (1, 0)),
    ("SVM", create_model_svm(), (1, 1)),
    ("Decision Tree", create_model_decision_tree(), (2, 0)),
    ("Random Forest", create_model_random_forest(), (2, 1)),
    ("CAT Boost", create_model_catboost(), (3, 0)),
    ("Ada Boost", create_model_ada_boost(), (3, 1)),
    ("Grandient Boost", create_model_grandient_boost(), (4, 0)),
    ("XGBoost", create_model_xgboost(), (4, 1))
]



In [85]:
performance_data = []
for model_name, model, position in models:
    train_accuracy, test_accuracy, precision, recall, f1 = evaluate_model(model)
    performance_data.append([
        model_name, f"{train_accuracy * 100:0.2f}%", f"{test_accuracy * 100:0.2f}%", precision, recall, f1
    ])
    
performance_chart = pd.DataFrame(performance_data, 
            columns=["Model", "Train Accuracy", "Test Accuracy", "Precision", "Recall", "F1"])
performance_chart

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1
0,Logistic Regression,38.23%,38.59%,0.209937,0.22715,0.1904
1,Naive Bayes,37.99%,38.39%,0.231618,0.247428,0.213845
2,KNN,61.73%,50.95%,0.463965,0.437467,0.447841
3,SVM,38.54%,39.68%,0.145688,0.208705,0.146617
4,Decision Tree,75.08%,57.96%,0.514376,0.504303,0.507146
5,Random Forest,75.08%,59.14%,0.529826,0.522991,0.524359
6,CAT Boost,67.60%,59.83%,0.537701,0.522205,0.521695
7,Ada Boost,52.14%,50.72%,0.453736,0.442466,0.442789
8,Grandient Boost,66.81%,59.91%,0.533377,0.522583,0.521949
9,XGBoost,67.32%,60.49%,0.541076,0.527482,0.527426


# Grid-Seacrh

## Cat-Boost

In [86]:
catboost_model = CatBoostClassifier(logging_level='Silent')
catboost_model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x28b90a5a4b0>

In [87]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define parameter grid
param_grid = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# Wrap the model for GridSearchCV
grid_search = GridSearchCV(
    estimator=catboost_model, 
    param_grid=param_grid, 
    cv=3, 
    scoring='accuracy', 
    n_jobs=-1, 
    verbose=2
)

# Perform grid search
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [88]:
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

Best Parameters: {'depth': 8, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.1}
Best CV Score: 0.5922542945492117


## Gradient Boost

In [89]:
gb_model = GradientBoostingClassifier()
gb_model.fit(x_train, y_train)

In [90]:
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
}
grid_search_gbc = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search_gbc.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [91]:
print("Best Parameters:", grid_search_gbc.best_params_)
print("Best CV Score:", grid_search_gbc.best_score_)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best CV Score: 0.5962779954062633


## XG Boost

In [92]:
xg_model = XGBClassifier()
xg_model.fit(x_train, y_train)

In [93]:
# Parameter grid for Grid Search
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# GridSearchCV setup
grid_search_xgb = GridSearchCV(estimator=xg_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the model
grid_search_xgb.fit(x_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [94]:
print("Best Parameters:", grid_search_xgb.best_params_)
print("Best CV Score:", grid_search_xgb.best_score_)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}
Best CV Score: 0.600085869303977
