In [81]:
import pandas as pd
import numpy as np
import re

In [82]:
df = pd.read_csv("project_data.csv")

In [83]:
df.head()

Unnamed: 0,Experience,UG,Location,PG,Salary,Department,Doctorate:,Skills,Role Category
0,0 - 1 years,Graduation Not Required,Hiring office located in Pune,,2.25-2.75 Lacs P.A.,"BFSI, Investments & Trading",,"Field Collections, Legal Documentation, Field ...",Lending
1,0 - 1 years,Any Graduate,Bengaluru,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",,"standard operating procedures, anti money laun...",Banking Operations
2,0 - 1 years,Any Graduate,Bengaluru,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",,"aml, anti money laundering, client onboarding,...",Lending
3,1 - 4 years,Any Graduate,Hyderabad,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",,"Data analysis, Change management, Corporate ac...",Banking Operations
4,0 - 4 years,Any Graduate,"Kolkata, Mumbai, Hyderabad, Bengaluru",Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",,"Retail, Data analysis, Project management, Pro...",Lending


In [84]:
pd.set_option('display.max_rows', None)

# Pre-Processing

## Experience

In [85]:
# Function to process the Experience column
def process_experience(exp):
    exp = exp.lower().strip()  # Standardize the string
    if "no fixed duration" in exp:  # Handle "No fixed duration"
        return 0
    elif "months" in exp:  # Convert months to years
        months = [int(s) for s in exp.split() if s.isdigit()]
        if months:
            return months[0] / 12
        else:  # Handle ambiguous "few months"
            return 0.25  # Default to 0.25 years (~3 months)
    elif "years" in exp and "-" in exp:  # Handle ranges of years
        years = [float(x) for x in exp.split("years")[0].split("-")]
        mean_years = sum(years) / len(years)
        # Cap at a maximum threshold (e.g., 30 years)
        return min(mean_years, 30)
    elif "years" in exp:  # Handle single years (e.g., "15 years")
        return min(float(exp.split()[0]), 30)
    elif "year" in exp:  # Handle ambiguous "a year" or "1 year"
        return 1
    elif "month" in exp: # Handle ambiguous "a month" or "1 month"
        return 0

In [86]:
# Apply the function to the Experience column
df['Processed_Experience'] = df['Experience'].apply(process_experience)

In [87]:
# Round the processed experience values to 2 decimal places
df['Processed_Experience'] = df['Processed_Experience'].round(2)

In [88]:
# Define bins and labels for the ranges
bins = [0, 1, 3, 5, 10, 15, float('inf')]
labels = ['Entry-level', 'Junior', 'Mid-level', 'Experienced', 'Senior', 'Expert']

# Create a new column for categorized experience
df['Experience_Category'] = pd.cut(df['Processed_Experience'], bins=bins, labels=labels, right=False)

In [89]:
df['Experience_Category'].value_counts()

Experience_Category
Experienced    22165
Mid-level      16052
Junior         11372
Senior          3564
Entry-level     2013
Expert          1519
Name: count, dtype: int64

## UG PG

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56685 entries, 0 to 56684
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Experience            56685 non-null  object  
 1   UG                    52703 non-null  object  
 2   Location              56685 non-null  object  
 3   PG                    35645 non-null  object  
 4   Salary                56685 non-null  object  
 5   Department            54701 non-null  object  
 6   Doctorate:            1927 non-null   object  
 7   Skills                56685 non-null  object  
 8   Role Category         54687 non-null  object  
 9   Processed_Experience  56685 non-null  float64 
 10  Experience_Category   56685 non-null  category
dtypes: category(1), float64(1), object(9)
memory usage: 4.4+ MB


In [91]:
df['UG'] = df['UG'].fillna('NA')
df['PG'] = df['PG'].fillna('NA')

In [92]:
# Convert the 'UG' column to binary encoding based on degree requirements
def encode_degree_requirement(ug_value):
    if pd.isna(ug_value) or ug_value in ['Graduation Not Required', 'NA']:
        return 0  # No degree required
    else:
        return 1  # Degree required

# Apply the function to encode the UG column
df['UG_Encoded'] = df['UG'].apply(encode_degree_requirement)

In [93]:
df['UG_Encoded'].value_counts()

UG_Encoded
1    50223
0     6462
Name: count, dtype: int64

In [94]:
# Function to encode the 'PG' column based on multiple conditions
def encode_pg_degree_requirement(pg_value):
    if pd.isna(pg_value):  # Check for missing or NaN values
        return 0  # No degree required
    pg_value = pg_value.lower()  # Convert to lowercase for case-insensitivity
    # Conditions for "No degree required"
    no_degree_conditions = ['post graduation not required', 'not applicable', 'na', 'none', 'no post graduation']
    if any(condition in pg_value for condition in no_degree_conditions):
        return 0  # No degree required
    else:
        return 1  # Degree required

In [95]:
df['PG_Encoded'] = df['PG'].apply(encode_pg_degree_requirement)

In [96]:
df['PG_Encoded'].value_counts()

PG_Encoded
1    34755
0    21930
Name: count, dtype: int64

## Location

In [97]:
df.head(3)

Unnamed: 0,Experience,UG,Location,PG,Salary,Department,Doctorate:,Skills,Role Category,Processed_Experience,Experience_Category,UG_Encoded,PG_Encoded
0,0 - 1 years,Graduation Not Required,Hiring office located in Pune,,2.25-2.75 Lacs P.A.,"BFSI, Investments & Trading",,"Field Collections, Legal Documentation, Field ...",Lending,0.5,Entry-level,0,0
1,0 - 1 years,Any Graduate,Bengaluru,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",,"standard operating procedures, anti money laun...",Banking Operations,0.5,Entry-level,1,1
2,0 - 1 years,Any Graduate,Bengaluru,Any Postgraduate,Not Disclosed,"BFSI, Investments & Trading",,"aml, anti money laundering, client onboarding,...",Lending,0.5,Entry-level,1,1


## Role Category and Department 

In [98]:
df['Department'] = df['Department'].fillna('Others')

In [99]:
# Define the threshold for grouping departments
dept_threshold = 500

# Identify rare departments
rare_departments = df['Department'].value_counts()[df['Department'].value_counts() < dept_threshold].index

# Replace rare departments with "Others"
df['Department'] = df['Department'].replace(rare_departments, 'Others')

In [100]:
df['Department'].value_counts().count()

16

In [101]:
df['Role Category'] = df['Role Category'].fillna('Others')

In [102]:
# Define the threshold for grouping role categories
role_threshold = 300

# Identify rare role categories
rare_roles = df['Role Category'].value_counts()[df['Role Category'].value_counts() < role_threshold].index

# Replace rare role categories with "Others"
df['Role Category'] = df['Role Category'].replace(rare_roles, 'Others')

In [103]:
df['Role Category'].value_counts().count()

35

In [104]:
# Standardize "Other" role categories within the same department
df.loc[df['Role Category'].str.contains('Other', case=False, na=False), 'Role Category'] = 'Others'

In [105]:
# Ensure at least one unique role category per department
for dept in df['Department'].unique():
    # Subset data for the current department
    dept_roles = df[df['Department'] == dept]['Role Category']
    
    # Check if all roles for this department are "Others"
    if dept_roles.nunique() == 1 and dept_roles.iloc[0] == 'Others':
        # Find the most frequent rare role for this department
        original_roles = df[(df['Department'] == dept) & (df['Role Category'].isin(rare_roles))]
        if not original_roles.empty:
            most_frequent_role = original_roles['Role Category'].mode()[0]

            # Replace "Others" with the most frequent role
            df.loc[
                (df['Department'] == dept) & (df['Role Category'] == 'Others'),
                'Role Category'
            ] = most_frequent_role


In [106]:
df['Role Category'].value_counts().count()

32

In [107]:
# Append department name to "Others" role for better specificity
df.loc[df['Role Category'] == 'Others', 'Role Category'] = df['Department'] + " - Others"

In [108]:
# Define the threshold for grouping role categories
role_threshold = 200

# Identify rare role categories
rare_roles = df['Role Category'].value_counts()[df['Role Category'].value_counts() < role_threshold].index

# Replace rare role categories with "Others"
df['Role Category'] = df['Role Category'].replace(rare_roles, 'Others')

In [109]:
# Correct "Others - Finance" anomaly
df.loc[(df['Department'] == 'Others') & (df['Role Category'] == 'Finance'), 'Department'] = 'Finance & Accounting'

In [110]:
df['Role Category'].value_counts().count()

43

In [111]:
df['Department'].value_counts().count()

16

In [112]:
df['Department'].value_counts()

Department
Engineering - Software & QA                16449
Sales & Business Development                8101
Others                                      4860
Customer Success, Service & Operations      4805
Finance & Accounting                        4205
BFSI, Investments & Trading                 2622
Human Resources                             2582
Data Science & Analytics                    2495
Healthcare & Life Sciences                  2056
Marketing & Communication                   1951
Production, Manufacturing & Engineering     1887
Consulting                                  1548
Project & Program Management                1070
Research & Development                       847
Administration & Facilities                  621
Food, Beverage & Hospitality                 586
Name: count, dtype: int64

In [113]:
# Create a combined feature for Department and Role Category
df['Department_Role'] = df['Department'] + " - " + df['Role Category']
df['Department_Role'].value_counts()

Department_Role
Engineering - Software & QA - Software Development                                            12080
Others - Others - Others                                                                       4860
Sales & Business Development - BD / Pre Sales                                                  4344
Sales & Business Development - Enterprise & B2B Sales                                          2785
Finance & Accounting - Accounting & Taxation                                                   1979
Engineering - Software & QA - Quality Assurance and Testing                                    1969
Customer Success, Service & Operations - Voice / Blended                                       1844
Engineering - Software & QA - DBA / Data warehousing                                           1747
Customer Success, Service & Operations - Customer Success, Service & Operations - Others       1677
Data Science & Analytics - Business Intelligence & Analytics                        

In [114]:
df['Department_Role'].value_counts()

Department_Role
Engineering - Software & QA - Software Development                                            12080
Others - Others - Others                                                                       4860
Sales & Business Development - BD / Pre Sales                                                  4344
Sales & Business Development - Enterprise & B2B Sales                                          2785
Finance & Accounting - Accounting & Taxation                                                   1979
Engineering - Software & QA - Quality Assurance and Testing                                    1969
Customer Success, Service & Operations - Voice / Blended                                       1844
Engineering - Software & QA - DBA / Data warehousing                                           1747
Customer Success, Service & Operations - Customer Success, Service & Operations - Others       1677
Data Science & Analytics - Business Intelligence & Analytics                        

## Salary

In [115]:
def remove_variables(salary):
    if '(Including Variable: 30%)':
        salary = salary.split('(Including Variable')[0].strip()
        return salary
    return salary

In [116]:
df['Salary'] = df['Salary'].apply(remove_variables)

In [117]:
def value_lacsPA(value):
    if '-' in value:
        lower, upper = value.split("-")
        if "Lacs P.A." in  upper:
            upper= upper.replace("Lacs P.A.","")
            if ',' in lower:
                lower = lower.replace(",","")
            lower = float(lower)
            upper = float(upper)
            if lower<1000 and upper<1000:
                return f"{lower*100000}-{upper*100000}"
            elif lower>1000 and upper<1000:
                return f"{lower}-{upper*100000}"
            elif lower<1000 and upper>1000:
                return f"{lower*100000}-{upper}"
            elif lower >1000 and upper >1000:
                return f"{lower}-{upper}"
    return value

In [118]:
df['Salary'] = df['Salary'].apply(value_lacsPA)


In [119]:
def convert_monthly_to_annual(salary):
    if '/month' in salary: 
        monthly_salary = float(salary.replace('/month', '').replace(',', ''))
        annual_salary = monthly_salary * 12
        return f"{annual_salary}"
    return salary

In [120]:
df['Salary']  = df['Salary'].apply(convert_monthly_to_annual)

In [121]:
def existslacspa(value):
    if 'Lacs P.A.' in value:
        value = value.replace('Lacs P.A.',"")
        return f"{float(value)*100000}"
    return value

In [122]:
df['Salary']  = df['Salary'].apply(existslacspa)

In [123]:
df['Salary']=df['Salary'].str.replace("P.A.","",regex=False)

In [124]:
def funcforCrLac(value):
    if 'and' in value:
        value1,value2 = value.split("and")
        value1 = value1.replace("Cr", "")
        
        value1 = float(value1)*10000000
        return f"{value1}"
        
    if "Cr" in value:
        if '-' in value:
            lower, upper = value.split("-")
            if "Lacs" in lower:
                lower = lower.replace("Lacs","")
                upper = upper.replace("Cr","")
                lower = float(lower)*100000
                upper = float(upper)*10000000
                return f"{lower}-{upper}"
    
            upper = upper.replace("Cr","")
            lower = float(lower)*10000000
            upper = float(upper)*10000000
            return  f"{lower}-{upper}"
        else:
            value = value.replace("Cr","")
            value = float(value)*10000000
            return f"{value}"
    return f"{value}"

In [125]:
df['Salary'] = df['Salary'].apply(funcforCrLac)

In [126]:
def removecommas(value):
    if '-' in value:
        lower, upper = value.split("-")
        if ',' in lower :
            lower = lower.replace(",","")
        if ',' in upper:
            upper = upper.replace(",","")
        return f"{lower}-{upper}"
    elif ',' in value:
        return f"{value.replace(",","")}"
    else:
        return value

In [127]:
df['Salary'] = df['Salary'].apply(removecommas)

In [128]:
df = df.loc[df['Salary'] != 'Not Disclosed']

In [129]:
def functForMeanSal(value):
    if '-' in value:
        lower, upper = value.split("-")
        lower = float(lower)
        upper = float(upper)
        mean = (upper+lower)/2
        return mean
    elif value == 'Unpaid':
        return 0
    else:
        value = float(value)
        return value

In [130]:
df['newSal'] = df['Salary'].apply(functForMeanSal)

In [131]:
Q1 = df['newSal'].quantile(0.25)
Q1

287500.0

In [132]:
Q3 = df['newSal'].quantile(0.75)
Q3

800000.0

In [133]:
IQR = Q3-Q1
IQR

512500.0

In [134]:
lower_bound = max(0,Q1 - (1.5*IQR))
upper_bound = Q3 + (5*IQR)
upper_bound

3362500.0

df_no_outlier = df[(df['newSal']>=lower_bound) & (df['newSal'] <= upper_bound )]

df_no_outlier.info()

df_no_outlier['newSal'].max()

In [135]:
def salCategories(value):
    if 0 <= value <200000:
        return '[0,200000]'
    elif 200000 <= value <250000:
        return '[200000,250000]'
    elif 250000 <= value <300000:
        return '[250000,300000]'
    elif 300000 <= value <350000:
        return '[300000,350000]'
    elif 350000 <= value <400000:
        return '[350000,400000]'
    elif 400000 <= value <450000:
        return '[400000,450000]'
    elif 450000 <= value <500000:
        return '[450000,500000]'
    elif 500000 <= value <700000:
        return '[500000,700000]'
    elif 700000 <= value <1100000:
        return '[700000,1100000]'
    elif 1100000 <= value <=1550000:
        return '[1100000,1550000]'
    elif value>1550000:
        return 'above 1550000'

df_no_outlier.info()

In [136]:
df['labelSal'] = df['newSal'].apply(salCategories)

# Location

In [137]:
def rmhirefromvalue(value):
    if 'Hiring office located in' in value:
        value = value.split('located in')[1].strip()
        return value
    else:
        
        return value


In [138]:
df['Location']=df['Location'].apply(rmhirefromvalue)

In [140]:
def extract_city(location):
    # Rule 1: If comma-separated, take the first part
    if ',' in location:
        return location.split(',')[0].strip()
    # Rule 2: For addresses, use a regex pattern to find potential city-like words
    # Example: Capitalize words with more than 2 characters
    match = re.search(r'\b[A-Z][a-zA-Z]+\b', location)
    if match:
        return match.group(0)
    # Rule 3: Return as is if no pattern matches
    return location

In [141]:
df['City'] = df['Location'].apply(extract_city)

In [142]:
cities_list = ['Bengaluru', 'Mumbai', 'Delhi', 'Hyderabad', 'Chennai', 'Pune', 'Kolkata', 
               'Ahmedabad', 'Jaipur', 'Coimbatore', 'Indore', 'Surat', 'Vadodara', 'Lucknow', 
               'Nagpur', 'Bhopal', 'Chandigarh', 'Thiruvananthapuram', 'Visakhapatnam', 'Patna','Gurugram']


In [143]:
def func_city(location):
    values=[]
    if ',' in location:
        values.extend(location.split(','))
    else:
        values.append(location)
    for v in values:
        for c in cities_list:
            if v in c :
                return c
    return location

In [144]:
def func_city1(location):
    # values=[]
    # if ',' in location:
    #     values.extend(location.split(','))
    # else:
    #     values.append(location)
    # for v in values:
    #     for c in cities_list:
    #         if v in c :
    #             return c
    # return location
    for city in cities_list:
        if city in location:
            return city
    return location

In [145]:
df['City'] = df['City'].replace("New Delhi","Delhi")
df['City'] = df['City'].replace("Navi","Mumbai")
df['City'] = df['City'].replace("Bengalore","Bengaluru")


In [146]:
def value_conversion(location):
    for city in cities_list:
        if city.lower() in  location.lower():
            return city
    return location

In [147]:
df['City'] = df['City'].apply(value_conversion)

In [148]:
mumbai_city = [
        "Colaba", "Fort", "Churchgate", "Marine Lines", 
        "Nariman Point", "Cuffe Parade", "Malabar Hill", 
        "Girgaon (Girgaum)", "Dadar", "Tardeo",
        "Byculla", "Parel", "Lower Parel", 
        "Worli", "Mahalaxmi",
        "Bandra", "Khar", "Santacruz", "Vile Parle", 
        "Andheri", "Jogeshwari", "Goregaon", "Malad", 
        "Kandivali", "Borivali", "Dahisar",
        "Kurla", "Chembur", "Ghatkopar", 
        "Vikhroli", "Bhandup", "Mulund",
        "Dockyard Road", "Sewri", "Wadala", 
        "Mankhurd", "Govandi",
        "Powai", "Chandivali",
        "Vashi", "Nerul", "Kharghar", "Panvel",
        "Thane", "Kalyan", "Dombivli", 
        "Ambernath", "Ulhasnagar", "Vasai-Virar"
    ]

In [149]:
delhi_city = [
        "Connaught Place", "Karol Bagh", "Daryaganj", 
        "Paharganj", "Pragati Maidan",
        "Kashmere Gate", "Civil Lines", "Model Town", 
        "GTB Nagar", "Shakti Nagar",
        "Saket", "Hauz Khas", "Greater Kailash", 
        "Lajpat Nagar", "Defence Colony", "Vasant Kunj", 
        "Mehrauli", "Malviya Nagar",
        "Rajouri Garden", "Punjabi Bagh", "Janakpuri", 
        "Tilak Nagar", "Uttam Nagar", "Paschim Vihar",
        "Preet Vihar", "Laxmi Nagar", "Mayur Vihar", 
        "Patparganj", "Anand Vihar", "Shahdara",
        "Chanakyapuri", "Sarojini Nagar", "RK Puram", 
        "Lodhi Colony", "Jor Bagh", "Diplomatic Enclave",
        "Rohini", "Pitampura", "Ashok Vihar", 
        "Shalimar Bagh", "Narela",
        "Dwarka", "Najafgarh", "Palam", 
        "Vasant Vihar", "Mahipalpur",
        "Seelampur", "Yamuna Vihar", "Shastri Park", 
        "Bhajanpura", "Khajuri Khas",
        "Bawana", "Alipur", "Najafgarh Rural", 
        "Narela Rural"
    ]

In [150]:
def mumbai_cities_filter(location):
    for city in mumbai_city:
        if city.lower() in  location.lower():
            return 'Mumbai'
    return location

In [151]:
def delhi_cities_filter(location):
    for city in delhi_city:
        if city.lower() in  location.lower():
            return 'Delhi'
    return location

In [152]:
df['City'] = df['City'].apply(delhi_cities_filter)
df['City'] = df['City'].apply(mumbai_cities_filter)


In [156]:
df['City'].value_counts().head(20)

City
Bengaluru     2147
Mumbai        1856
Hyderabad     1671
Noida         1344
Pune          1292
Chennai       1156
Gurugram      1011
Kolkata        712
Ahmedabad      457
Mohali         233
Kochi          219
Jaipur         213
Coimbatore     203
Delhi          180
Vadodara       142
Lucknow        141
Madurai        136
Surat          121
Faridabad      120
Chandigarh     111
Name: count, dtype: int64

In [158]:
top_cities=['Bengaluru','Mumbai',
'Hyderabad','Noida',
'Pune','Chennai',
'Gurugram',
'Kolkata','Ahmedabad',
'Mohali','Kochi',
'Jaipur','Coimbatore',
'Delhi','Vadodara',
'Lucknow','Madurai',
'Surat','Faridabad',
'Chandigarh']

In [160]:
def otherCategory(value):
    for city in top_cities:
        if value == city:
            return city
    return 'other'

In [161]:
df['City']= df['City'].apply(otherCategory)

In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18098 entries, 0 to 56684
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Experience            18098 non-null  object  
 1   UG                    18098 non-null  object  
 2   Location              18098 non-null  object  
 3   PG                    18098 non-null  object  
 4   Salary                18098 non-null  object  
 5   Department            18098 non-null  object  
 6   Doctorate:            384 non-null    object  
 7   Skills                18098 non-null  object  
 8   Role Category         18098 non-null  object  
 9   Processed_Experience  18098 non-null  float64 
 10  Experience_Category   18098 non-null  category
 11  UG_Encoded            18098 non-null  int64   
 12  PG_Encoded            18098 non-null  int64   
 13  Department_Role       18098 non-null  object  
 14  newSal                18098 non-null  float64 
 15  labelSa

In [164]:
df.drop(['Doctorate:','Processed_Experience','newSal','Role Category','Location','PG','UG','Experience','Salary','Skills'],axis=1,inplace=True)

In [165]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18098 entries, 0 to 56684
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Department           18098 non-null  object  
 1   Experience_Category  18098 non-null  category
 2   UG_Encoded           18098 non-null  int64   
 3   PG_Encoded           18098 non-null  int64   
 4   Department_Role      18098 non-null  object  
 5   labelSal             18098 non-null  object  
 6   City                 18098 non-null  object  
dtypes: category(1), int64(2), object(4)
memory usage: 1007.6+ KB


In [166]:
df['Experience_Category'].value_counts()

Experience_Category
Mid-level      5907
Junior         5090
Experienced    4542
Entry-level    1150
Senior          864
Expert          545
Name: count, dtype: int64

In [167]:
df['Department_Role'].value_counts()

Department_Role
Sales & Business Development - BD / Pre Sales                                                 2841
Others - Others - Others                                                                      1902
Engineering - Software & QA - Software Development                                            1511
Customer Success, Service & Operations - Voice / Blended                                      1406
Sales & Business Development - Enterprise & B2B Sales                                         1311
Customer Success, Service & Operations - Customer Success, Service & Operations - Others       902
Finance & Accounting - Accounting & Taxation                                                   872
Human Resources - Recruitment & Talent Acquisition                                             476
Healthcare & Life Sciences - Doctor                                                            474
Sales & Business Development - Sales Support & Operations                                    

In [168]:
df['labelSal'].value_counts()

labelSal
[300000,350000]      2202
above 1550000        2178
[500000,700000]      2026
[350000,400000]      1782
[0,200000]           1745
[700000,1100000]     1708
[250000,300000]      1636
[400000,450000]      1291
[1100000,1550000]    1254
[200000,250000]      1249
[450000,500000]      1027
Name: count, dtype: int64

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18098 entries, 0 to 56684
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Department           18098 non-null  object  
 1   Experience_Category  18098 non-null  category
 2   UG_Encoded           18098 non-null  int64   
 3   PG_Encoded           18098 non-null  int64   
 4   Department_Role      18098 non-null  object  
 5   labelSal             18098 non-null  object  
 6   City                 18098 non-null  object  
dtypes: category(1), int64(2), object(4)
memory usage: 1007.6+ KB


# convert all qualitative columns to quantitative ones
from sklearn.preprocessing import LabelEncoder

# create encoders
df_no_outlier['Experience_Category'] = LabelEncoder().fit_transform(df_no_outlier['Experience_Category'])
df_no_outlier['UG_Encoded'] = LabelEncoder().fit_transform(df_no_outlier['UG_Encoded'])
df_no_outlier['PG_Encoded'] = LabelEncoder().fit_transform(df_no_outlier['PG_Encoded'])
df_no_outlier['Department_Role'] = LabelEncoder().fit_transform(df_no_outlier['Department_Role'])
df_no_outlier['City'] = LabelEncoder().fit_transform(df_no_outlier['City'])
df_no_outlier['labelSal'] = LabelEncoder().fit_transform(df_no_outlier['labelSal'])

In [171]:
# convert all qualitative columns to quantitative ones
from sklearn.preprocessing import LabelEncoder

# create encoders
df['Experience_Category'] = LabelEncoder().fit_transform(df['Experience_Category'])
df['UG_Encoded'] = LabelEncoder().fit_transform(df['UG_Encoded'])
df['PG_Encoded'] = LabelEncoder().fit_transform(df['PG_Encoded'])
df['Department_Role'] = LabelEncoder().fit_transform(df['Department_Role'])
df['Department'] = LabelEncoder().fit_transform(df['Department'])
df['City'] = LabelEncoder().fit_transform(df['City'])
df['labelSal'] = LabelEncoder().fit_transform(df['labelSal'])

In [173]:
df.corr()

Unnamed: 0,Department,Experience_Category,UG_Encoded,PG_Encoded,Department_Role,labelSal,City
Department,1.0,0.078224,0.098637,0.02818,0.99269,0.016387,0.09592
Experience_Category,0.078224,1.0,-0.007258,-0.088903,0.061433,0.01206,0.084857
UG_Encoded,0.098637,-0.007258,1.0,-0.061636,0.094502,0.058559,0.051674
PG_Encoded,0.02818,-0.088903,-0.061636,1.0,0.036319,0.119098,0.030116
Department_Role,0.99269,0.061433,0.094502,0.036319,1.0,0.026872,0.07935
labelSal,0.016387,0.01206,0.058559,0.119098,0.026872,1.0,-0.033539
City,0.09592,0.084857,0.051674,0.030116,0.07935,-0.033539,1.0


In [175]:
# decide the x and y
x = df.drop('labelSal', axis=1)
y = df['labelSal']

In [176]:
# split the data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=123456)

In [177]:
from sklearn.linear_model import LogisticRegression

def create_model_logistic_regression():
    # create the model
    model = LogisticRegression(max_iter=1000)
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [178]:
from sklearn.naive_bayes import GaussianNB

def create_model_naive_bayes():
    # create the model
    model = GaussianNB()
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [179]:
from sklearn.neighbors import KNeighborsClassifier

def create_model_knn():
    # create the model
    model = KNeighborsClassifier(n_neighbors=5)
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [180]:
from sklearn.tree import DecisionTreeClassifier

def create_model_decision_tree():
    # create the model
    model = DecisionTreeClassifier()
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [181]:
from sklearn.svm import SVC

def create_model_svm():
    # create the model
    model = SVC()
    
    # train the model
    model.fit(x_train, y_train)

    return model

In [182]:
from sklearn.ensemble import RandomForestClassifier

def create_model_random_forest():
    # create a model
    model = RandomForestClassifier(n_estimators=500)

    # train the model
    model.fit(x_train, y_train)

    return model

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model):
    # get the predictions
    y_pred = model.predict(x_test)
    y_true = y_test

    # get the metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred,average='macro')
    recall = recall_score(y_true, y_pred,average='macro')
    f1 = f1_score(y_true, y_pred,average='macro')

    return accuracy, precision, recall, f1

# collect all model's performances
performance_chart_data = []
for item in models:
    # item[0] => model name
    # item[1] => model
    
    accuracy, precision, recall, f1 = evaluate_model(item[1])
    performance_chart_data.append([item[0], accuracy, precision, recall, f1])

performance_chart = pd.DataFrame(performance_chart_data, 
                     columns=["Model Name", "Accuracy", "Precision", "Recall", "F1"])
performance_chart

# collection of models
models = [
    ("Logistic Regression", create_model_logistic_regression(), (0, 0)),
    ("Naive Bayes", create_model_naive_bayes(), (0, 1)),
    ("KNN", create_model_knn(), (1, 0)),
    ("SVM", create_model_svm(), (1, 1)),
    ("Decision Tree", create_model_decision_tree(), (2, 0))
]

In [192]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model):
    # get the predictions
    y_pred = model.predict(x_test)
    y_true = y_test

    # get the metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    return accuracy, precision, recall, f1

In [193]:
# collection of models
models = [
    ("Logistic Regression", create_model_logistic_regression(), (0, 0)),
    ("Naive Bayes", create_model_naive_bayes(), (0, 1)),
    ("KNN", create_model_knn(), (1, 0)),
    ("SVM", create_model_svm(), (1, 1)),
    ("Decision Tree", create_model_decision_tree(), (2, 0)),
    ("Random Forest", create_model_random_forest(), (2, 1))
]

In [194]:
# collect all model's performances
performance_chart_data = []
for item in models:
    # item[0] => model name
    # item[1] => model
    
    accuracy, precision, recall, f1 = evaluate_model(item[1])
    performance_chart_data.append([item[0], accuracy, precision, recall, f1])

performance_chart = pd.DataFrame(performance_chart_data, 
                     columns=["Model Name", "Accuracy", "Precision", "Recall", "F1"])
performance_chart

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Model Name,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.190239,0.129887,0.190239,0.133801
1,Naive Bayes,0.204604,0.2218,0.204604,0.148184
2,KNN,0.293554,0.292991,0.293554,0.289862
3,SVM,0.20442,0.135722,0.20442,0.147691
4,Decision Tree,0.335175,0.327379,0.335175,0.328117
5,Random Forest,0.348803,0.338458,0.348803,0.34078


In [None]:
 'micro', 'macro', 'weighted'

In [196]:
from sklearn.tree import DecisionTreeClassifier

# create model
model = DecisionTreeClassifier(criterion='gini', max_depth=500)

# train the model
model.fit(x_train, y_train)

In [197]:
from sklearn.model_selection import GridSearchCV

# hyper-parameters
parameters = {
    "criterion": ["gini", "log_loss", "entropy"],
    "max_depth": np.arange(10) * 100,
    "n_estimators": np.arange(10) * 100
}

# create a grid search cv object
grid_search_cv = GridSearchCV(estimator=model, param_grid=parameters)

# fit the data
grid_search_cv.fit(x_train, y_train)

15 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Aaditya\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Aaditya\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\Aaditya\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Aaditya\AppData\Local\Programs\Python\Pytho

In [198]:
grid_search_cv.best_score_

0.3412528109616052

In [199]:
grid_search_cv.best_params_

{'criterion': 'gini', 'max_depth': 900}

# Random Forest

In [None]:
{“gini”, “entropy”, “log_loss”},

In [None]:

from sklearn.model_selection import GridSearchCV

# hyper-parameters
parameters = {
    "criterion": ["gini", "log_loss", "entropy"],
    "max_depth": np.arange(10) * 100,
    "n_estimators": np.arange(10) * 100
}

# create a grid search cv object
grid_search_cv = GridSearchCV(estimator=model, param_grid=parameters)

# fit the data
grid_search_cv.fit(x_train, y_train)

In [201]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=500)
    # train the model
model.fit(x_train, y_train)

In [None]:
grid_search_cv.best_score_

In [None]:
grid_search_cv.best_params_