In [62]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [63]:
df = pd.read_csv('Messy_Employment_India_Dataset.csv')

# 3. EDA

#### 3.1 Viewing Data

In [64]:
df.shape

(2000, 9)

In [65]:
df.head()

Unnamed: 0,Status,Age Group,Education,Industry,Location,AI Risk,Years of Experience,Monthly Salary (INR),Date Recorded
0,EMPLOYED,25_34,high school,Technology,Urban,,,40700.0,1/1/2023
1,EMPLOYED,18-24,Diploma,Fintech,rural,moderate,,17500.0,1/2/2023
2,EMPLOYED,25_34,High School,Retail,Delhi,Low,16.0,77600.0,1/3/2023
3,UNEMPLOYED,18-24,Masters,Retail,Urban,Low,30.0,100200.0,1/4/2023
4,,45-54,Diploma,Fintech,Hyderabad,low,9.0,19500.0,1/5/2023


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Status                1732 non-null   object 
 1   Age Group             1768 non-null   object 
 2   Education             1804 non-null   object 
 3   Industry              1799 non-null   object 
 4   Location              1787 non-null   object 
 5   AI Risk               1716 non-null   object 
 6   Years of Experience   980 non-null    float64
 7   Monthly Salary (INR)  1613 non-null   float64
 8   Date Recorded         2000 non-null   object 
dtypes: float64(2), object(7)
memory usage: 140.8+ KB


#### 3.2 Summary Stats

In [67]:
df.describe()

Unnamed: 0,Years of Experience,Monthly Salary (INR)
count,980.0,1613.0
mean,15.244898,76886.360818
std,9.103941,41628.008054
min,0.0,5100.0
25%,7.0,40700.0
50%,16.0,77200.0
75%,23.0,112400.0
max,30.0,149900.0


#### 3.3 Value count analysis  

In [68]:
for col in df.columns:
    if col == 'Monthly Salary (INR)':
        continue
    print((f"{'-'*30} {col} {'-'*30}").center(70))
    print("No. Of Unique Values :",df[col].nunique())
    print("Values-> \n",df[col].unique())
    print("-"*70)

 ------------------------------ Status ------------------------------ 
No. Of Unique Values : 6
Values-> 
 ['EMPLOYED' 'UNEMPLOYED' nan 'Unemployed' 'employed' 'unemployed'
 'Employed']
----------------------------------------------------------------------
------------------------------ Age Group ------------------------------
No. Of Unique Values : 7
Values-> 
 ['25_34' '18-24' '45-54' '35 - 44' '55+' '25-34' '18 to 24' nan]
----------------------------------------------------------------------
------------------------------ Education ------------------------------
No. Of Unique Values : 9
Values-> 
 ['high school' 'Diploma' 'High School' 'Masters' 'PhD' 'Master' 'Ph.D'
 'Bachelors' nan "Bachelor's"]
----------------------------------------------------------------------
------------------------------ Industry ------------------------------
No. Of Unique Values : 8
Values-> 
 ['Technology' ' Fintech' 'Retail' 'Tech ' nan 'Health' 'Healthcare'
 'Education' 'Finance']
-------------------

#### 4.4 Handling inconsistant categories

In [69]:
# 1. Status 
df['Status'] = (
    df['Status']
    .astype(str)
    .str.strip()
    .str.lower()
    .map({'employed': 1, 'unemployed': 0})
)

df = df.rename(columns={'Status': 'is_employed'})

In [70]:
# 2. Age Group
df['Age Group'] = df['Age Group'].replace(
    {
        '18-24':"Entry-Level",
        '18 to 24':"Entry-Level",
        '25_34':"Early Career",
        '25-34':"Early Career",
        '35 - 44':  "Mid-Career Professional",
        '45-54':"Senior Professional",
         '55+': "Pre-Retirement"
    }
)

In [71]:
# 3 Education
df['Education'].astype(str).str.strip().str.lower().unique()

array(['high school', 'diploma', 'masters', 'phd', 'master', 'ph.d',
       'bachelors', 'nan', "bachelor's"], dtype=object)

In [72]:
df['Education'] = (
    df['Education'].astype(str).str.strip().str.lower()
    .map({
        'high school':"high school",
        'master':'masters',
        'ph.d':'phd',
        "bachelor's":'bachelors'
    })
)

In [73]:
# 4. Industry
df['Industry'].astype(str).str.strip().str.lower().unique()

array(['technology', 'fintech', 'retail', 'tech', 'nan', 'health',
       'healthcare', 'education', 'finance'], dtype=object)

In [74]:
df['Industry'] = (
    df['Industry']
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({
        'tech': 'technology',
        'health': 'healthcare'
    })
)


In [75]:
# AI Risk
df['AI Risk'].str.strip().str.lower().unique()

array([nan, 'moderate', 'low', 'high', 'medium'], dtype=object)

In [76]:
df['AI Risk'] = (
    df['AI Risk'].str.strip().str.lower()
    .replace({'medium':'moderate'})
)

In [78]:
for col in df.columns:
    if col == 'Monthly Salary (INR)':
        continue
    print((f"{'-'*30} {col} {'-'*30}").center(70))
    print("No. Of Unique Values :",df[col].nunique())
    print("Values-> \n",df[col].unique())
    print("-"*70)

------------------------------ is_employed ------------------------------
No. Of Unique Values : 2
Values-> 
 [ 1.  0. nan]
----------------------------------------------------------------------
------------------------------ Age Group ------------------------------
No. Of Unique Values : 5
Values-> 
 ['Early Career' 'Entry-Level' 'Senior Professional'
 'Mid-Career Professional' 'Pre-Retirement' nan]
----------------------------------------------------------------------
------------------------------ Education ------------------------------
No. Of Unique Values : 4
Values-> 
 ['high school' nan 'masters' 'phd' 'bachelors']
----------------------------------------------------------------------
------------------------------ Industry ------------------------------
No. Of Unique Values : 7
Values-> 
 ['technology' 'fintech' 'retail' 'nan' 'healthcare' 'education' 'finance']
----------------------------------------------------------------------
------------------------------ Location -----