In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing the data
train = pd.read_csv('./census/train.csv')
test = pd.read_csv('./census/test.csv')

In [3]:
# Exploring the data
train.shape

(32561, 15)

In [4]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Checking for missing values
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [6]:
#train info
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   gender          32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
# Unique values in each column ordered by frequency
train.nunique().sort_values(ascending=True)


gender                2
income                2
race                  5
relationship          6
marital-status        7
workclass             9
occupation           15
education            16
education-num        16
native-country       42
age                  73
capital-loss         92
hours-per-week       94
capital-gain        119
fnlwgt            21648
dtype: int64

In [8]:
# Checking for duplicate values
train.duplicated().sum()

24

In [9]:
#dropping duplicate values
train.drop_duplicates(inplace=True)

In [10]:
# Convert columns with unique values to categorical
mask = train.nunique() < 16
train.loc[:, mask] = train.loc[:, mask].astype('category')
train.dtypes

age                  int64
workclass         category
fnlwgt               int64
education           object
education-num        int64
marital-status    category
occupation        category
relationship      category
race              category
gender            category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country      object
income            category
dtype: object

## Feature Engineering

In [11]:
# numeric columns
numeric_cols = [c for c in train if train[c].dtype in ['int64', 'float64']]
print(numeric_cols)

['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [12]:
# categorical columns
categorical_cols = [c for c in train if train[c].dtype in ['category', 'object']]
print(categorical_cols)

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']


In [13]:
# noramlizing numeric columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train[numeric_cols])
scaled_numeric_cols = scaler.transform(train[numeric_cols])
scaled_numeric_cols = pd.DataFrame(scaled_numeric_cols, columns=numeric_cols)
# dropping original values
train = train.drop(numeric_cols, axis=1)
# merging normalized values with original data
train = pd.concat([train, scaled_numeric_cols], axis=1)
train.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K,0.150685,0.221488,0.8,0.0,0.0,0.397959


In [14]:
#categorical columns nunique values
train[categorical_cols].nunique().sort_values(ascending=True)

gender             2
income             2
race               5
relationship       6
marital-status     7
workclass          9
occupation        15
education         16
native-country    42
dtype: int64

In [24]:
X_train = train.drop(['income'], axis=1)
y_train = train['income']

In [25]:
#categorical columns normalization
# One hot encoding for categorical columns
X_train = pd.get_dummies(X_train, drop_first=True)
X_train.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.452055,0.048238,0.8,0.0,0.0,0.122449,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0.493151,0.151068,0.4,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.150685,0.221488,0.8,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# y_train normalization using lambda function
y_train = y_train.apply(lambda x:0 if x == '<=50K' else 1)

In [31]:
y_train.head()

0    1
1    1
2    1
3    1
4    1
Name: income, dtype: int64

In [32]:
X_test = test.drop(['income'], axis=1)
y_test = test['income']

In [33]:
# Convert columns with unique values to categorical
mask = X_test.nunique() < 16
X_test.loc[:, mask] = X_test.loc[:, mask].astype('category')
X_test.dtypes

age                  int64
workclass         category
fnlwgt               int64
education           object
education-num        int64
marital-status    category
occupation        category
relationship      category
race              category
gender            category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country      object
Unnamed: 15       category
dtype: object

In [34]:
#normalizing numeric columns
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_test[numeric_cols])
scaled_numeric_cols = scaler.transform(X_test[numeric_cols])
scaled_numeric_cols = pd.DataFrame(scaled_numeric_cols, columns=numeric_cols)
# dropping original values
X_test = X_test.drop(numeric_cols, axis=1)
# merging normalized values with original data
X_test = pd.concat([X_test, scaled_numeric_cols], axis=1)
X_test.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,gender,native-country,Unnamed: 15,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States,,0.109589,0.14443,0.4,0.0,0.0,0.397959
1,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States,,0.287671,0.051677,0.533333,0.0,0.0,0.5
2,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States,,0.150685,0.219011,0.733333,0.0,0.0,0.397959
3,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States,,0.369863,0.099418,0.6,0.076881,0.0,0.397959
4,?,Some-college,Never-married,?,Own-child,White,Female,United-States,,0.013699,0.060942,0.6,0.0,0.0,0.295918


In [35]:
#categorical columns normalization
# One hot encoding for categorical columns
X_test = pd.get_dummies(X_test, drop_first=True)
X_test.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.109589,0.14443,0.4,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.287671,0.051677,0.533333,0.0,0.0,0.5,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.150685,0.219011,0.733333,0.0,0.0,0.397959,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.369863,0.099418,0.6,0.076881,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.013699,0.060942,0.6,0.0,0.0,0.295918,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [36]:
# y_test normalization using lambda function
y_test = y_test.apply(lambda x:0 if x == '<=50K' else 1)
y_test.head()

0    1
1    1
2    1
3    1
4    1
Name: income, dtype: int64

In [38]:
X_train.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.452055,0.048238,0.8,0.0,0.0,0.122449,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0.493151,0.151068,0.4,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.150685,0.221488,0.8,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [39]:
y_train.head()

0    1
1    1
2    1
3    1
4    1
Name: income, dtype: int64

In [40]:
X_test.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.109589,0.14443,0.4,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.287671,0.051677,0.533333,0.0,0.0,0.5,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.150685,0.219011,0.733333,0.0,0.0,0.397959,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.369863,0.099418,0.6,0.076881,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.013699,0.060942,0.6,0.0,0.0,0.295918,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [41]:
y_test.head()

0    1
1    1
2    1
3    1
4    1
Name: income, dtype: int64