# **Handling Missing Numerical and Categorical Data**

In [53]:
import pandas as pd

In [54]:
# Read the job dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data_science_job.csv')
df.head(n=5)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,20.0,,,36.0,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15.0,50-99,Pvt Ltd,47.0,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5.0,,,83.0,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,0.0,,Pvt Ltd,52.0,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,20.0,50-99,Funded Startup,8.0,0.0


In [55]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

enrollee_id                  0
city                         0
city_development_index     479
gender                    4508
relevent_experience          0
enrolled_university        386
education_level            460
major_discipline          2813
experience                  65
company_size              5938
company_type              6140
training_hours             766
target                       0
dtype: int64

### Handling both Numerical and Categorical data using pandas

In [56]:
# Numerical columns: fill each gap with that column's mean.
numeric_cols = ['city_development_index', 'experience', 'training_hours']
for col in numeric_cols:
    col_mean = df[col].mean()
    df[col] = df[col].fillna(col_mean)

# Categorical columns: fill each gap with that column's most frequent value.
# Series.mode() can return several values on a tie; [0] takes the first one.
categorical_cols = [
    'gender',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'company_size',
    'company_type',
]
for col in categorical_cols:
    most_common = df[col].mode()[0]
    df[col] = df[col].fillna(most_common)

In [57]:
# Re-count missing entries per column to confirm the fills above left none.
df.isna().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
training_hours            0
target                    0
dtype: int64

### Handling both Numerical and Categorical Data using scikit-learn

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Re-read the raw file: the pandas section above already filled every missing
# value in `df`, so running the imputers on `df` would give them nothing to do.
raw_df = pd.read_csv('data_science_job.csv')

# A fixed seed keeps the split (and so the learned fill values) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    raw_df.drop(columns='target'),
    raw_df['target'],
    test_size=0.2,
    random_state=42,
)

# Mean for the numerical columns, most frequent value for the categorical
# columns; every other column is passed through unchanged.
trf = ColumnTransformer(transformers=[
    ('trf1', SimpleImputer(strategy='mean'),
     ['city_development_index', 'experience', 'training_hours']),
    ('trf2', SimpleImputer(strategy='most_frequent'),
     ['gender', 'enrolled_university', 'education_level',
      'major_discipline', 'company_size', 'company_type']),
], remainder='passthrough')

# Learn the fill values from the training split only.
transformed_data_train = trf.fit_transform(X_train)

# Apply the already-fitted imputers to the test split. Calling fit_transform
# here would re-learn the statistics from the test data (leakage) and fill the
# test set with different values than the training set.
transformed_data_test = trf.transform(X_test)

In [59]:
# Wrap the transformed training array in a DataFrame (default integer column
# labels) and count missing entries per column.
new_df = pd.DataFrame(data=transformed_data_train)
new_df.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
dtype: int64

In [60]:
# Wrap the transformed test array in a DataFrame (default integer column
# labels) and count missing entries per column.
new_df = pd.DataFrame(data=transformed_data_test)
new_df.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
dtype: int64