In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
import numpy as np

### Reading the Dataset off the CSV file

In [2]:
data = pd.read_csv("income.csv", names=["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss", "hrs_per_week", "country", "income"])

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Converting Raw Data to Categorical data

In [4]:
for col in ["workclass", "education", "relationship", "marital_status", "occupation", "sex", "race", "country", "income"]:
    lst = data[col].unique()
    for k, cls in enumerate(lst):
        data[col][data[col] == cls] = k

data = data.astype(np.int32)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col][data[col] == cls] = k


### Checking Correlations with the Target Column

In [5]:
data.corrwith(data["income"])

age               0.234037
workclass         0.047203
fnlwgt           -0.009463
education        -0.046218
education_num     0.335154
marital_status    0.002644
occupation       -0.104691
relationship     -0.171294
race             -0.067713
sex              -0.215980
capital_gain      0.223329
capital_loss      0.150526
hrs_per_week      0.229689
country          -0.028478
income            1.000000
dtype: float64

### Extracting the Target column separately and drop fromthe dataset

In [6]:
income = pd.get_dummies(data["income"], drop_first=True)
data = data.drop("income", axis=1)

### Dropping the Columns that have less correcation with the respect to Target column

In [7]:
data = data.drop(["workclass", "fnlwgt", "education", "marital_status", "race", "country"], axis=1)

In [8]:
data.shape

(32561, 8)

In [9]:
data.head(20)

Unnamed: 0,age,education_num,occupation,relationship,sex,capital_gain,capital_loss,hrs_per_week
0,39,13,0,0,0,2174,0,40
1,50,13,1,1,0,0,0,13
2,38,9,2,0,0,0,0,40
3,53,7,2,1,0,0,0,40
4,28,13,3,2,1,0,0,40
5,37,14,1,2,1,0,0,40
6,49,5,4,0,1,0,0,16
7,52,9,1,1,0,0,0,45
8,31,14,3,0,1,14084,0,50
9,42,13,1,1,0,5178,0,40


### Splitting Data into Train and Test data

In [16]:
x_train, x_test, y_train, y_test = train_test_split(data, income, test_size=.25, random_state=8)
y_train, y_test = np.array(y_train).reshape(-1), np.array(y_test).reshape(-1)

In [36]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree  import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

In [20]:
def test_classifier(clf):
    y_pred = clf.predict(x_test)
    correct = (y_pred == y_test).sum()
    total = len(y_pred)
    accuracy = correct / total * 100
    roc_score = roc_auc_score(y_test, y_pred)
    return accuracy, roc_score

## Decision Tree

In [45]:
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

acc, roc = test_classifier(tree)
print("Accuracy of Random Forest: ", acc)
print("ROC Score of Random Forest:", roc)

Accuracy of Random Forest:  82.3731728288908
ROC Score of Random Forest: 0.7470761405403522


## Random Forest

In [None]:
randomForest = RandomForestClassifier(max_depth=8, max_features=8)
randomForest.fit(x_train, y_train)

acc, roc = test_classifier(randomForest)
print("Accuracy of Random Forest: ", acc)
print("ROC Score of Random Forest:", roc)

## Gradient Boosting

In [29]:
gradBoost = GradientBoostingClassifier(max_depth=8, max_features=8)
gradBoost.fit(x_train, y_train)

acc, roc = test_classifier(gradBoost)
print("Accuracy of Random Forest: ", acc)
print("ROC Score of Random Forest:", roc)

Accuracy of Random Forest:  87.12688858862548
ROC Score of Random Forest: 0.7945238966212018


## Bagging Classifier

In [33]:
bagging = BaggingClassifier()
bagging.fit(x_train, y_train)

acc, roc = test_classifier(bagging)
print("Accuracy of Random Forest: ", acc)
print("ROC Score of Random Forest:", roc)

Accuracy of Random Forest:  84.10514678786389
ROC Score of Random Forest: 0.7637430975316599


## AdaBoost Classifier

In [34]:
adaclf = AdaBoostClassifier()
adaclf.fit(x_train, y_train)

acc, roc = test_classifier(adaclf)
print("Accuracy of Random Forest: ", acc)
print("ROC Score of Random Forest:", roc)

Accuracy of Random Forest:  85.62830119149982
ROC Score of Random Forest: 0.7566571440184414
