# # Import numpy, pandas and matplotlib

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy import stats
from sklearn.preprocessing import OneHotEncoder

# Read the dataset

In [2]:
census_income_train = pd.read_csv('C:/Users/Hitesh Chordiya/Desktop/DESKTOP FILES/Fordham 2nd semester/Data Mining/Project/census-income.data.csv', names=['age', 'workclass', 'wgt', 'education', 'education_id', \
                                      'marital_status', 'occupation', 'relationship', 'race', 'sex', \
                                      'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'])

In [3]:
census_income_train.head()

Unnamed: 0,age,workclass,wgt,education,education_id,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
census_income_test = pd.read_csv('C:/Users/Hitesh Chordiya/Desktop/DESKTOP FILES/Fordham 2nd semester/Data Mining/Project/census-income.test.csv', names=['age', 'workclass', 'wgt', 'education', 'education_id', \
                                      'marital_status', 'occupation', 'relationship', 'race', 'sex', \
                                      'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'])

In [5]:
census_income_test.head()

Unnamed: 0,age,workclass,wgt,education,education_id,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


# Data Preprocessing (Training Data)

Handling Missing and Duplicated Data

In [6]:
census_income_train.isna().sum()

age               0
workclass         0
wgt               0
education         0
education_id      0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [7]:
print('Duplicates = ', census_income_train.duplicated().sum())

Duplicates =  24


In [8]:
census_income_train = census_income_train[~census_income_train.duplicated()]
print('After removing Duplicates = ', census_income_train.duplicated().sum())

After removing Duplicates =  0


In [9]:
census_income_train.replace(' ?','Unknown', inplace=True)
census_income_train.replace('?','Unknown', inplace=True)

In [10]:
census_income_train.workclass.value_counts()

 Private             22673
 Self-emp-not-inc     2540
 Local-gov            2093
Unknown               1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [11]:
census_income_train.native_country.value_counts()

 United-States                 29153
 Mexico                          639
Unknown                          582
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Japan                            62
 Guatemala                        62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [12]:
census_income_train.occupation.value_counts()

 Prof-specialty       4136
 Craft-repair         4094
 Exec-managerial      4065
 Adm-clerical         3768
 Sales                3650
 Other-service        3291
 Machine-op-inspct    2000
Unknown               1843
 Transport-moving     1597
 Handlers-cleaners    1369
 Farming-fishing       992
 Tech-support          927
 Protective-serv       649
 Priv-house-serv       147
 Armed-Forces            9
Name: occupation, dtype: int64

Converting Categorical data to Numeric Data

In [13]:
census_income_train['sex'] = census_income_train['sex'].str.strip()
census_income_train['sex'] = census_income_train['sex'].replace({'Male':1,'Female':0})
census_income_train['income'] = census_income_train['income'].str.strip()
census_income_train['income'] = census_income_train['income'].replace({'>50K':1,'<=50K':0})

In [14]:
census_income_train.head()

Unnamed: 0,age,workclass,wgt,education,education_id,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0


Strip the data (Remove spaces)

In [15]:
columns = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']
for column in columns:
    census_income_train[column] = census_income_train[column].str.strip()

Normalise the training data using the Z score normalisation (age,wgt,capital_gain,capital_loss,hours_per_week)

In [16]:
def z_normalize(x):
    print(x.mean(), x.std())
    z = (x - x.mean())/x.std()
    return z

In [17]:
census_income_train.age = z_normalize(census_income_train.age)

38.585548759873376 13.637983518469866


In [18]:
census_income_train.wgt = z_normalize(census_income_train.wgt)

189780.84851092604 105556.47100949337


In [19]:
census_income_train.capital_gain = z_normalize(census_income_train.capital_gain)

1078.4437409718166 7387.957424191213


In [20]:
census_income_train.capital_loss = z_normalize(census_income_train.capital_loss)

87.36822694163568 403.10183306309347


In [21]:
census_income_train.hours_per_week = z_normalize(census_income_train.hours_per_week)

40.44032947106371 12.346889182125157


In [22]:
census_income_train.head()

Unnamed: 0,age,workclass,wgt,education,education_id,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.030389,State-gov,-1.063552,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,0.148289,-0.21674,-0.035663,United-States,0
1,0.83696,Self-emp-not-inc,-1.008653,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,-0.145973,-0.21674,-2.222449,United-States,0
2,-0.042935,Private,0.245036,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,-0.145973,-0.21674,-0.035663,United-States,0
3,1.056934,Private,0.425745,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,-0.145973,-0.21674,-0.035663,United-States,0
4,-0.776181,Private,1.408044,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,-0.145973,-0.21674,-0.035663,Cuba,0


# Data Preprocessing (Testing Data)

Handling Missing and Duplicated Data

In [23]:
census_income_test.isna().sum()

age               0
workclass         0
wgt               0
education         0
education_id      0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [24]:
print('Duplicates = ', census_income_test.duplicated().sum())

Duplicates =  5


In [25]:
census_income_test = census_income_test[~census_income_test.duplicated()]
print('After removing Duplicates = ', census_income_test.duplicated().sum())

After removing Duplicates =  0


In [26]:
census_income_test.replace(' ?','Unknown', inplace=True)
census_income_test.replace('?','Unknown', inplace=True)

In [27]:
census_income_test.workclass.value_counts()

 Private             11206
 Self-emp-not-inc     1321
 Local-gov            1043
Unknown                963
 State-gov             683
 Self-emp-inc          578
 Federal-gov           472
 Without-pay             7
 Never-worked            3
Name: workclass, dtype: int64

In [28]:
census_income_test.native_country.value_counts()

 United-States                 14657
 Mexico                          308
Unknown                          274
 Philippines                      97
 Puerto-Rico                      70
 Germany                          69
 Canada                           61
 India                            51
 El-Salvador                      49
 China                            47
 Cuba                             43
 England                          37
 South                            35
 Dominican-Republic               33
 Italy                            32
 Haiti                            31
 Portugal                         30
 Japan                            30
 Poland                           27
 Columbia                         26
 Jamaica                          25
 Guatemala                        24
 Greece                           20
 Vietnam                          19
 Ecuador                          17
 Iran                             16
 Peru                             15
 

In [29]:
census_income_test.occupation.value_counts()

 Prof-specialty       2031
 Exec-managerial      2019
 Craft-repair         2013
 Sales                1854
 Adm-clerical         1840
 Other-service        1628
 Machine-op-inspct    1019
Unknown                966
 Transport-moving      758
 Handlers-cleaners     702
 Tech-support          518
 Farming-fishing       495
 Protective-serv       334
 Priv-house-serv        93
 Armed-Forces            6
Name: occupation, dtype: int64

Converting Categorical data to Numeric Data

In [30]:
census_income_test['sex'] = census_income_test['sex'].str.strip()
census_income_test['sex'] = census_income_test['sex'].replace({'Male':1,'Female':0})
census_income_test['income'] = census_income_test['income'].str.strip()
census_income_test['income'] = census_income_test['income'].replace({'>50K.':1,'<=50K.':0})

In [31]:
census_income_test.head()

Unnamed: 0,age,workclass,wgt,education,education_id,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,1,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,1,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,1,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,1,7688,0,40,United-States,1
4,18,Unknown,103497,Some-college,10,Never-married,Unknown,Own-child,White,0,0,0,30,United-States,0


Strip the data (Remove spaces)

In [32]:
columns = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']
for column in columns:
    census_income_test[column] = census_income_test[column].str.strip()

Normalise the testing data using the Z score normalisation (age,wgt,capital_gain,capital_loss,hours_per_week)

In [33]:
census_income_test.age = z_normalize(census_income_test.age)

38.77088965347751 13.84948417194846


In [34]:
census_income_test.capital_gain = z_normalize(census_income_test.capital_gain)

1082.2374662079135 7585.0771334449155


In [35]:
census_income_test.capital_loss = z_normalize(census_income_test.capital_loss)

87.92627181125583 403.164257120746


In [36]:
census_income_test.hours_per_week = z_normalize(census_income_test.hours_per_week)

40.39450724993856 12.478902303501402


In [37]:
census_income_test.wgt = z_normalize(census_income_test.wgt)

189442.1219587122 105708.55455539147


In [38]:
census_income_test.head()

Unnamed: 0,age,workclass,wgt,education,education_id,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,-0.994325,Private,0.353423,11th,7,Never-married,Machine-op-inspct,Own-child,Black,1,-0.14268,-0.21809,-0.031614,United-States,0
1,-0.055662,Private,-0.942479,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,1,-0.14268,-0.21809,0.769739,United-States,0
2,-0.777711,Local-gov,1.39543,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,1,-0.14268,-0.21809,-0.031614,United-States,1
3,0.377567,Private,-0.275466,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,1,0.870889,-0.21809,-0.031614,United-States,1
4,-1.499759,Unknown,-0.813038,Some-college,10,Never-married,Unknown,Own-child,White,0,-0.14268,-0.21809,-0.832966,United-States,0


Comparing the training and testing data

In [39]:
len(census_income_train.columns) == len(census_income_test.columns)

True

In [40]:
print(len(census_income_train), len(census_income_test))

32537 16276


# Splitting the train and test data

In [41]:
X_train = census_income_train.iloc[:,:-1]
y_train = census_income_train.iloc[:,-1]
X_test = census_income_test.iloc[:,:-1]
y_test = census_income_test.iloc[:,-1]

# One Hot Encoding

Encode Categorical features

In [42]:
categorical_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
encoder = OneHotEncoder(sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

Combine Encoded Categorical features with numerical features

In [43]:
numerical_cols = ['age', 'education_id', 'capital_gain', 'capital_loss', 'hours_per_week']
X_train_processed = np.hstack((X_train[numerical_cols], X_train_encoded))
X_test_processed = np.hstack((X_test[numerical_cols], X_test_encoded))

# Random Forest Algorithm

In [44]:
def randomforest(X_train, y_train, X_test):
    random_forest = RandomForestClassifier(n_estimators = 80)
    random_forest.fit(X_train, y_train)
    y_pred = random_forest.predict(X_test)
    return y_pred

In [45]:
rm_trial = randomforest(X_train_processed, y_train, X_test_processed)

In [46]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(rm_trial, y_test).ravel()[i])

tn 11414
fp 1552
fn 1016
tp 2294


In [47]:
print("Random Forest Algorithm")
print("Accuracy = ",accuracy_score(rm_trial, y_test)*100,"%")
print("Precision Score = ",precision_score(rm_trial, y_test)*100,"%")
print("Recall Score = ",recall_score(rm_trial, y_test)*100,"%")
print("F1 Score = ",f1_score(rm_trial, y_test)*100,"%")

Random Forest Algorithm
Accuracy =  84.22216760874907 %
Precision Score =  59.64638585543421 %
Recall Score =  69.30513595166164 %
F1 Score =  64.1140301844606 %


# K Nearest Neighbor Classifier Algorithm

In [48]:
def knn(X_train, y_train, X_test):    
    knn = KNeighborsClassifier(n_neighbors = 80)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)    
    return y_pred

In [49]:
knn_trial = knn(X_train_processed, y_train, X_test_processed)

In [50]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(knn_trial, y_test).ravel()[i])

tn 11486
fp 1608
fn 944
tp 2238


In [51]:
print("K Nearest Neighbor Algorithm")
print("Accuracy = ",accuracy_score(knn_trial, y_test)*100,"%")
print("Precision Score = ",precision_score(knn_trial, y_test)*100,"%")
print("Recall Score = ",recall_score(knn_trial, y_test)*100,"%")
print("F1 Score = ",f1_score(knn_trial, y_test)*100,"%")

K Nearest Neighbor Algorithm
Accuracy =  84.32047186040796 %
Precision Score =  58.19032761310452 %
Recall Score =  70.33312382149592 %
F1 Score =  63.68810472396129 %


# Gaussian Naive Bayes Algorithm

In [52]:
def naivebayes(X_train, y_train, X_test):
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train,y_train)
    y_pred = naive_bayes.predict(X_test)
    return y_pred

In [53]:
nb_trial = naivebayes(X_train_processed, y_train, X_test_processed)

In [54]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(nb_trial, y_test).ravel()[i])

tn 5688
fp 264
fn 6742
tp 3582


In [55]:
print("Gaussian Naive Bayes Algorithm")
print("Accuracy = ",accuracy_score(nb_trial, y_test)*100,"%")
print("Precision Score = ",precision_score(nb_trial, y_test)*100,"%")
print("Recall Score = ",recall_score(nb_trial, y_test)*100,"%")
print("F1 Score = ",f1_score(nb_trial, y_test)*100,"%")

Gaussian Naive Bayes Algorithm
Accuracy =  56.955025804866054 %
Precision Score =  93.13572542901716 %
Recall Score =  34.695854320030996 %
F1 Score =  50.55751587861679 %


# Logistic Regression Algorithm

In [56]:
def logistic_regression(X_train, y_train, X_test):
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)   
    return y_pred

In [58]:
lr_trial = logistic_regression(X_train_processed, y_train, X_test_processed)

In [59]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(lr_trial, y_test).ravel()[i])

tn 11581
fp 1554
fn 849
tp 2292


In [60]:
print("Logistic Regression Algorithm")
print("Accuracy = ",accuracy_score(lr_trial, y_test)*100,"%")
print("Precision Score = ",precision_score(lr_trial, y_test)*100,"%")
print("Recall Score = ",recall_score(lr_trial, y_test)*100,"%")
print("F1 Score = ",f1_score(lr_trial, y_test)*100,"%")

Logistic Regression Algorithm
Accuracy =  85.23593020398133 %
Precision Score =  59.59438377535101 %
Recall Score =  72.97039159503342 %
F1 Score =  65.60755689136968 %


# Ensemble Technique(Random Forest,KNN,Naive Bayes,Logistic Regression)

In [61]:
def ensemble(X_train, y_train, X_test):    
    r_y = randomforest(X_train, y_train, X_test)
    k_y = knn(X_train, y_train, X_test)
    n_y = naivebayes(X_train, y_train, X_test)
    l_r = logistic_regression(X_train, y_train, X_test)
    final_y = []
    for i in range(len(X_test)):
        final_y.append(stats.mode([r_y[i],k_y[i], n_y[i], l_r[i]], keepdims=True)[0][0])
    return final_y

In [62]:
final_run = ensemble(X_train_processed, y_train, X_test_processed)

In [63]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(final_run, y_test).ravel()[i])

tn 11624
fp 1547
fn 806
tp 2299


In [64]:
print("Ensemble Technique(Random Forest,KNN,Naive Bayes,Logistic Regression)")
print("Accuracy = ",accuracy_score(final_run, y_test)*100,"%")
print("Precision Score = ",precision_score(final_run, y_test)*100,"%")
print("Recall Score = ",recall_score(final_run, y_test)*100,"%")
print("F1 Score = ",f1_score(final_run, y_test)*100,"%")

Ensemble Technique(Random Forest,KNN,Naive Bayes,Logistic Regression)
Accuracy =  85.54313099041534 %
Precision Score =  59.77639105564223 %
Recall Score =  74.04186795491144 %
F1 Score =  66.14875557473745 %


# Ensemble Technique (Random Forest, KNN, Naive Bayes)

In [65]:
def ensemble_without_lr(X_train, y_train, X_test):    
    r_y = randomforest(X_train, y_train, X_test)
    k_y = knn(X_train, y_train, X_test)
    n_y = naivebayes(X_train, y_train, X_test)
    final_y = []
    for i in range(len(X_test)):
        final_y.append(stats.mode([r_y[i],k_y[i], n_y[i]], keepdims=True)[0][0])
    return final_y

In [66]:
final_run_without_lr = ensemble_without_lr(X_train_processed, y_train, X_test_processed)

In [67]:
print("Ensemble Technique (Random Forest, KNN, Naive Bayes)")
print("Accuracy = ",accuracy_score(final_run_without_lr, y_test)*100,"%")
print("Precision Score = ",precision_score(final_run_without_lr, y_test)*100,"%")
print("Recall Score = ",recall_score(final_run_without_lr, y_test)*100,"%")
print("F1 Score = ",f1_score(final_run_without_lr, y_test)*100,"%")

Ensemble Technique (Random Forest, KNN, Naive Bayes)
Accuracy =  84.27131973457853 %
Precision Score =  69.57878315132605 %
Recall Score =  65.81406787998031 %
F1 Score =  67.64408493427705 %
