# 1. Importing and Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data.csv')

# 2. Exploratory Data Analysis

In [3]:
df.shape

(32560, 15)

In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32560 non-null  int64 
 1   workclass       32560 non-null  object
 2   fnlwgt          32560 non-null  int64 
 3   education       32560 non-null  object
 4   education_num   32560 non-null  int64 
 5   marital_status  32560 non-null  object
 6   occupation      32560 non-null  object
 7   relationship    32560 non-null  object
 8   race            32560 non-null  object
 9   sex             32560 non-null  object
 10  capital_gain    32560 non-null  int64 
 11  capital_loss    32560 non-null  int64 
 12  hours_per_week  32560 non-null  int64 
 13  native_country  32560 non-null  object
 14  income          32560 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### Types of variables

In [6]:
df['age'].dtype, df['workclass'].dtype

(dtype('int64'), dtype('O'))

In [7]:
# categorical columns: having data type object (anything but not numbers)
# numerical columns: having data type int64 (basically, numbers)

In [8]:
cat = [col for col in df.columns if df[col].dtype == 'O']
num = [col for col in df.columns if df[col].dtype == 'int64']

In [9]:
cat, num

(['workclass',
  'education',
  'marital_status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native_country',
  'income'],
 ['age',
  'fnlwgt',
  'education_num',
  'capital_gain',
  'capital_loss',
  'hours_per_week'])

#### Categorical Columns

In [10]:
df[cat].head()

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
1,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
2,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
3,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
4,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States,<=50K


In [11]:
df[cat].shape

(32560, 9)

In [12]:
df[cat].isnull().sum()

workclass         0
education         0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
native_country    0
income            0
dtype: int64

In [13]:
# value_counts function: gives the frequency of each unique label in a column
for col in cat:
    print(df[col].value_counts(), end = '\n\n')

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1297
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

HS-grad         10501
Some-college     7291
Bachelors        5354
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

Married-civ-spouse       14976
Never-married            10682
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: marital_status, dtype: int64

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         

In [14]:
# replacing '?' with np.NaN to identify null columns
for col in cat:
    df[col].replace('?', np.NaN, inplace = True)

In [15]:
# check for null values
for col in cat:
    print(df[col].value_counts(), end = '\n\n')

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1297
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

HS-grad         10501
Some-college     7291
Bachelors        5354
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

Married-civ-spouse       14976
Never-married            10682
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: marital_status, dtype: int64

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3769
Sales                

In [16]:
df[cat].isnull().sum()

workclass         1836
education            0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
native_country     583
income               0
dtype: int64

In [17]:
# check for cardinality (high cardinality leads to less accurate model)
# cardinality is the number of unique labels in a column
for col in cat:
    print(f'{col}: {len(df[col].unique().tolist())} Labels')

workclass: 9 Labels
education: 16 Labels
marital_status: 7 Labels
occupation: 15 Labels
relationship: 6 Labels
race: 5 Labels
sex: 2 Labels
native_country: 42 Labels
income: 2 Labels


In [18]:
# native_country column has a relatively high cardinality

#### Numerical Columns

In [19]:
df[num].head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,50,83311,13,0,0,13
1,38,215646,9,0,0,40
2,53,234721,7,0,0,40
3,28,338409,13,0,0,40
4,37,284582,14,0,0,40


In [20]:
df[num].shape

(32560, 6)

In [21]:
df[num].isnull().sum()

age               0
fnlwgt            0
education_num     0
capital_gain      0
capital_loss      0
hours_per_week    0
dtype: int64

# 3. Feature Engineering

In [22]:
# feature engineering is the process of transforming raw data
# into useful features that help us to understand our model better
# and increase its predictive power i.e. its accuracy.

In [23]:
x = df.drop('income', axis = 1)
y = df['income']

x.shape, y.shape

((32560, 14), (32560,))

In [24]:
x.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States


In [25]:
# categorical and numerical columns
cat = [col for col in x.columns if x[col].dtype == 'O']
num = [col for col in x.columns if x[col].dtype in ['int64', 'float64']]

In [26]:
cat, num

(['workclass',
  'education',
  'marital_status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native_country'],
 ['age',
  'fnlwgt',
  'education_num',
  'capital_gain',
  'capital_loss',
  'hours_per_week'])

In [27]:
# check null values
x[cat].isnull().mean()

workclass         0.056388
education         0.000000
marital_status    0.000000
occupation        0.056603
relationship      0.000000
race              0.000000
sex               0.000000
native_country    0.017905
dtype: float64

In [28]:
x[num].isnull().mean()

age               0.0
fnlwgt            0.0
education_num     0.0
capital_gain      0.0
capital_loss      0.0
hours_per_week    0.0
dtype: float64

In [29]:
for col in x.columns:
    if x[col].isnull().mean() > 0:
        x[col].fillna(x[col].mode()[0], inplace = True)

In [30]:
x[num].isnull().mean()

age               0.0
fnlwgt            0.0
education_num     0.0
capital_gain      0.0
capital_loss      0.0
hours_per_week    0.0
dtype: float64

In [31]:
# Categorically Encoding values of the x[cat] features
# Categorical encoder converts x[cat] values to a boolean value based column for each of the features
# This is useful for a Naive Baye's classifier as it helps in calculating the probability easily

In [32]:
x = pd.get_dummies(x)
x.shape

(32560, 105)

In [33]:
x.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,37,284582,14,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


# 4. Train Test Split

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

# 5. Training

In [36]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

### 5.1 Training without scaling

In [37]:
model_1 = GaussianNB()
model_1.fit(x_train, y_train)

GaussianNB()

In [38]:
accuracy_score(y_test, model_1.predict(x_test))

0.7993447993447993

### 5.2 Training with StandardScaler

In [39]:
from sklearn.preprocessing import StandardScaler
x_standard_scaler = StandardScaler()
x_standard_scaler.fit(x)

StandardScaler()

In [40]:
x_train_standard_scaler = x_standard_scaler.transform(x_train)
x_test_standard_scaler = x_standard_scaler.transform(x_test)

In [41]:
model_2 = GaussianNB()
model_2.fit(x_train_standard_scaler, y_train)

GaussianNB()

In [42]:
accuracy_score(y_test, model_2.predict(x_test_standard_scaler))

0.43314905814905813

### 5.3 Training with RobustScaler

In [43]:
from sklearn.preprocessing import RobustScaler
x_robust_scaler = RobustScaler()
x_robust_scaler.fit(x)

RobustScaler()

In [44]:
x_train_robust_scaler = x_robust_scaler.transform(x_train)
x_test_robust_scaler = x_robust_scaler.transform(x_test)

In [45]:
model_3 = GaussianNB()
model_3.fit(x_train_robust_scaler, y_train)

GaussianNB()

In [46]:
accuracy_score(y_test, model_3.predict(x_test_robust_scaler))

0.8092751842751843

In [47]:
# it was observed that robust scaler gave the best results
# when compared to model without training and when done with standard scaling