In [127]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [128]:
df = pd.read_csv('Messy_Employment_India_Dataset.csv')

# 3. EDA

#### 3.1 Viewing Data

In [129]:
df.shape

(2000, 9)

In [130]:
df.head()

Unnamed: 0,Status,Age Group,Education,Industry,Location,AI Risk,Years of Experience,Monthly Salary (INR),Date Recorded
0,EMPLOYED,25_34,high school,Technology,Urban,,,40700.0,1/1/2023
1,EMPLOYED,18-24,Diploma,Fintech,rural,moderate,,17500.0,1/2/2023
2,EMPLOYED,25_34,High School,Retail,Delhi,Low,16.0,77600.0,1/3/2023
3,UNEMPLOYED,18-24,Masters,Retail,Urban,Low,30.0,100200.0,1/4/2023
4,,45-54,Diploma,Fintech,Hyderabad,low,9.0,19500.0,1/5/2023


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Status                1732 non-null   object 
 1   Age Group             1768 non-null   object 
 2   Education             1804 non-null   object 
 3   Industry              1799 non-null   object 
 4   Location              1787 non-null   object 
 5   AI Risk               1716 non-null   object 
 6   Years of Experience   980 non-null    float64
 7   Monthly Salary (INR)  1613 non-null   float64
 8   Date Recorded         2000 non-null   object 
dtypes: float64(2), object(7)
memory usage: 140.8+ KB


#### 3.2 Summary Stats

In [132]:
df.describe()

Unnamed: 0,Years of Experience,Monthly Salary (INR)
count,980.0,1613.0
mean,15.244898,76886.360818
std,9.103941,41628.008054
min,0.0,5100.0
25%,7.0,40700.0
50%,16.0,77200.0
75%,23.0,112400.0
max,30.0,149900.0


#### 3.3 Value count analysis  

In [133]:
for col in df.columns:
    if col == 'Monthly Salary (INR)':
        continue
    print((f"{'-'*30} {col} {'-'*30}").center(70))
    print("No. Of Unique Values :",df[col].nunique())
    print("Values-> \n",df[col].unique())
    print("-"*70)

 ------------------------------ Status ------------------------------ 
No. Of Unique Values : 6
Values-> 
 ['EMPLOYED' 'UNEMPLOYED' nan 'Unemployed' 'employed' 'unemployed'
 'Employed']
----------------------------------------------------------------------
------------------------------ Age Group ------------------------------
No. Of Unique Values : 7
Values-> 
 ['25_34' '18-24' '45-54' '35 - 44' '55+' '25-34' '18 to 24' nan]
----------------------------------------------------------------------
------------------------------ Education ------------------------------
No. Of Unique Values : 9
Values-> 
 ['high school' 'Diploma' 'High School' 'Masters' 'PhD' 'Master' 'Ph.D'
 'Bachelors' nan "Bachelor's"]
----------------------------------------------------------------------
------------------------------ Industry ------------------------------
No. Of Unique Values : 8
Values-> 
 ['Technology' ' Fintech' 'Retail' 'Tech ' nan 'Health' 'Healthcare'
 'Education' 'Finance']
-------------------

#### 4.4 Handling inconsistant categories

In [134]:
# 1. Status 
df['Status'] = (
    df['Status']
    .astype(str)
    .str.strip()
    .str.lower()
    .map({'employed': 1, 'unemployed': 0})
)

df = df.rename(columns={'Status': 'is_employed'})

In [135]:
# 2. Age Group
df['Age Group'] = df['Age Group'].replace(
    {
        '18-24':"Entry-Level",
        '18 to 24':"Entry-Level",
        '25_34':"Early Career",
        '25-34':"Early Career",
        '35 - 44':  "Mid-Career Professional",
        '45-54':"Senior Professional",
         '55+': "Pre-Retirement"
    }
)

In [136]:
# 3 Education
df['Education'].astype(str).str.strip().str.lower().unique()

array(['high school', 'diploma', 'masters', 'phd', 'master', 'ph.d',
       'bachelors', 'nan', "bachelor's"], dtype=object)

In [137]:
df['Education'] = (
    df['Education'].astype(str).str.strip().str.lower()
    .map({
        'high school':"high school",
        'master':'masters',
        'ph.d':'phd',
        "bachelor's":'bachelors'
    })
)

In [138]:
# 4. Industry
df['Industry'].astype(str).str.strip().str.lower().unique()

array(['technology', 'fintech', 'retail', 'tech', 'nan', 'health',
       'healthcare', 'education', 'finance'], dtype=object)

In [139]:
df['Industry'] = (
    df['Industry']
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({
        'tech': 'technology',
        'health': 'healthcare'
    })
)


In [140]:
# 5. Location 
know_cities = ['delhi', 'hyderabad', 'bangalore' ,'mumbai']
df['Location'] = df['Location'].str.strip().str.lower()
df['Location'].unique()

array(['urban', 'rural', 'delhi', 'hyderabad', 'bangalore', 'mumbai', nan,
       'suburban'], dtype=object)

In [141]:
import numpy as np

In [142]:
# --- LOGIC FOR COLUMN 1: CITY ---
df['Cities'] = df['Location'].apply(lambda x : x if x in know_cities else np.nan)
# --- LOGIC FOR COLUMN 2: AREA_TYPE ---
def fix_location(value):
    if pd.isna(value):
        return np.nan
    # 1
    if value in know_cities:
        return 'urban'
    if value in ['urban', 'rural','suburban'] :
        return value 

    return 'check!'

In [143]:
df['Location'].unique()

array(['urban', 'rural', 'delhi', 'hyderabad', 'bangalore', 'mumbai', nan,
       'suburban'], dtype=object)

In [144]:
df['Area_Type'] = df['Location'].apply(fix_location)

In [145]:
df['Area_Type'].unique()

array(['urban', 'rural', nan, 'suburban'], dtype=object)

In [146]:
# 6. AI Risk
df['AI Risk'].str.strip().str.lower().unique()

array([nan, 'moderate', 'low', 'high', 'medium'], dtype=object)

In [147]:
df['AI Risk'] = (
    df['AI Risk'].str.strip().str.lower()
    .replace({'medium':'moderate'})
)

In [148]:
for col in df.columns:
    if col == 'Monthly Salary (INR)':
        continue
    print((f"{'-'*30} {col} {'-'*30}").center(70))
    print("No. Of Unique Values :",df[col].nunique())
    print("Values-> \n",df[col].unique())
    print("-"*70)

------------------------------ is_employed ------------------------------
No. Of Unique Values : 2
Values-> 
 [ 1.  0. nan]
----------------------------------------------------------------------
------------------------------ Age Group ------------------------------
No. Of Unique Values : 5
Values-> 
 ['Early Career' 'Entry-Level' 'Senior Professional'
 'Mid-Career Professional' 'Pre-Retirement' nan]
----------------------------------------------------------------------
------------------------------ Education ------------------------------
No. Of Unique Values : 4
Values-> 
 ['high school' nan 'masters' 'phd' 'bachelors']
----------------------------------------------------------------------
------------------------------ Industry ------------------------------
No. Of Unique Values : 7
Values-> 
 ['technology' 'fintech' 'retail' 'nan' 'healthcare' 'education' 'finance']
----------------------------------------------------------------------
------------------------------ Location -----

In [149]:
df.head(15)

Unnamed: 0,is_employed,Age Group,Education,Industry,Location,AI Risk,Years of Experience,Monthly Salary (INR),Date Recorded,Cities,Area_Type
0,1.0,Early Career,high school,technology,urban,,,40700.0,1/1/2023,,urban
1,1.0,Entry-Level,,fintech,rural,moderate,,17500.0,1/2/2023,,rural
2,1.0,Early Career,high school,retail,delhi,low,16.0,77600.0,1/3/2023,delhi,urban
3,0.0,Entry-Level,,retail,urban,low,30.0,100200.0,1/4/2023,,urban
4,,Senior Professional,,fintech,hyderabad,low,9.0,19500.0,1/5/2023,hyderabad,urban
5,0.0,Early Career,,technology,bangalore,high,,54500.0,1/6/2023,bangalore,urban
6,1.0,Mid-Career Professional,masters,,mumbai,moderate,20.0,136500.0,1/7/2023,mumbai,urban
7,0.0,Mid-Career Professional,masters,,rural,low,28.0,25300.0,1/8/2023,,rural
8,1.0,Mid-Career Professional,phd,retail,,moderate,0.0,103600.0,1/9/2023,,
9,0.0,Pre-Retirement,phd,technology,,,3.0,126500.0,1/10/2023,,


In [150]:
((df.isnull().sum()/len(df))*100 ).round(2)

is_employed             13.40
Age Group               11.60
Education               52.15
Industry                 0.00
Location                10.65
AI Risk                 14.20
Years of Experience     51.00
Monthly Salary (INR)    19.35
Date Recorded            0.00
Cities                  54.65
Area_Type               10.65
dtype: float64

In [151]:
df['Industry'].unique()

array(['technology', 'fintech', 'retail', 'nan', 'healthcare',
       'education', 'finance'], dtype=object)

In [153]:
df.describe()

Unnamed: 0,is_employed,Years of Experience,Monthly Salary (INR)
count,1732.0,980.0,1613.0
mean,0.51097,15.244898,76886.360818
std,0.500024,9.103941,41628.008054
min,0.0,0.0,5100.0
25%,0.0,7.0,40700.0
50%,1.0,16.0,77200.0
75%,1.0,23.0,112400.0
max,1.0,30.0,149900.0


### Fixes acording to me (ash)
1. fill with unemployed if salary is 0
2. age group -> fill with mode value
3. Edu -> skip 
4. !Null
5. Ai Risk -> find ai-risk mode of each job cagetory then will acordingly
6. Exp-> skip
7. salary -> if is_employed is 0 -> salary = 0 
8. !Null
9. area -> 
    - if 50K salary < 1L = suburban
    - if > 1L = urban


In [154]:
df.columns

Index(['is_employed', 'Age Group', 'Education', 'Industry', 'Location',
       'AI Risk', 'Years of Experience', 'Monthly Salary (INR)',
       'Date Recorded', 'Cities', 'Area_Type'],
      dtype='object')

In [164]:
# Tas
df[(df['is_employed'] == 'No')&(df['Monthly Salary (INR)'].isna())]

Unnamed: 0,is_employed,Age Group,Education,Industry,Location,AI Risk,Years of Experience,Monthly Salary (INR),Date Recorded,Cities,Area_Type


In [165]:
# Tas
df[(df['is_employed'] == 'No')&(df['Monthly Salary (INR)'] != 0)]

Unnamed: 0,is_employed,Age Group,Education,Industry,Location,AI Risk,Years of Experience,Monthly Salary (INR),Date Recorded,Cities,Area_Type


In [166]:
df[(df['is_employed'] == 'Yes') & (df['Monthly Salary (INR)'] == 0)]


Unnamed: 0,is_employed,Age Group,Education,Industry,Location,AI Risk,Years of Experience,Monthly Salary (INR),Date Recorded,Cities,Area_Type


In [159]:
df['Monthly Salary (INR)'].isnull().sum()

np.int64(387)

In [158]:
# Fixing 'Salary' -> 
df.loc[
    (df['is_employed'] == 'No') & (df['Monthly Salary (INR)'].isna()),
    'Monthly Salary (INR)'
] = 0
