In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier
data = pd.read_csv("income.csv")

In [2]:
data.shape

(32561, 15)

In [3]:
#removing all '?' records
data=data.replace('?', np.nan)
data.dropna(axis = 0,inplace = True)

#removing education columns, since education and education.num are identical
data = data.drop('education', 1)


In [4]:
data.shape

(30162, 14)

## Label Encoding and Binary Encoder for Categorical Variable

In [5]:
#Handle native,country feature using Count/Frequency Encoding
data = data.rename(columns={'native.country': 'native'})
native_counts = data.native.value_counts().to_dict()
data.native=data.native.map(native_counts).astype(int)


In [6]:
lb_make = LabelEncoder()
for col in data.columns:
    if (data[col].dtypes=='object'):
        print(col+ ":", len(data[col].unique()), ' unique values')
        data[col]=lb_make.fit_transform(data[col]).astype(int)

workclass: 7  unique values
marital.status: 7  unique values
occupation: 14  unique values
relationship: 6  unique values
race: 5  unique values
sex: 2  unique values
income: 2  unique values


In [7]:
vals_to_replace_income = {'>50K':'1', '<=50K':'0'}

In [8]:
data=data.select_dtypes(include = np.number)
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native,income
1,82,2,132870,9,6,3,1,4,0,0,4356,18,27504,0
3,54,2,140359,4,0,6,4,4,0,0,3900,40,27504,0
4,41,2,264663,10,5,9,3,4,0,0,3900,40,27504,0
5,34,2,216864,9,0,7,4,4,0,0,3770,45,27504,0
6,38,2,150601,6,5,0,4,4,1,0,3770,40,27504,0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 1 to 32560
Data columns (total 14 columns):
age               30162 non-null int64
workclass         30162 non-null int32
fnlwgt            30162 non-null int64
education.num     30162 non-null int64
marital.status    30162 non-null int32
occupation        30162 non-null int32
relationship      30162 non-null int32
race              30162 non-null int32
sex               30162 non-null int32
capital.gain      30162 non-null int64
capital.loss      30162 non-null int64
hours.per.week    30162 non-null int64
native            30162 non-null int32
income            30162 non-null int32
dtypes: int32(8), int64(6)
memory usage: 2.5 MB


In [10]:
X_data=data.drop(['income',], axis=1)
y=data['income']

In [11]:
bestfeatures = SelectKBest(chi2, k=8).fit(X_data,y)
bestfeature_df=pd.DataFrame({'Feature':list(X_data.columns),'Scores':bestfeatures.scores_})
bestfeature_df.sort_values(by='Scores',ascending=True)
print(bestfeature_df.nlargest(8,'Scores'))

           Feature        Scores
9     capital.gain  7.412819e+07
10    capital.loss  1.256099e+06
2           fnlwgt  1.423290e+05
12          native  1.139849e+05
0              age  7.927716e+03
11  hours.per.week  5.569209e+03
6     relationship  3.435488e+03
3    education.num  2.178306e+03


In [12]:
X=bestfeatures.transform(X_data)

## Scaling

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)


## Model Preparation

In [14]:
X.shape,y.shape
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
seed = 5
# prepare models
models = []
models.append(('LR', LogisticRegression(solver='lbfgs',max_iter=1000)))
models.append(('RIDGE', RidgeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DTC', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))
models.append(('BNB', BernoulliNB()))
models.append(('SGD', SGDClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('RFC', RandomForestClassifier(n_estimators=100)))
models.append(('XGB', XGBClassifier()))
models.append(('GBC', GradientBoostingClassifier()))

## Model Evaluation

In [16]:
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.740517 (0.154423)
RIDGE: 0.771182 (0.139524)
KNN: 0.730566 (0.116554)
DTC: 0.773362 (0.053212)
GNB: 0.784668 (0.037201)
BNB: 0.742606 (0.129074)
SGD: 0.691098 (0.224444)
ADA: 0.842087 (0.009522)
RFC: 0.810726 (0.038518)
XGB: 0.842021 (0.011453)
GBC: 0.842850 (0.011000)


## Stacking

In [17]:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
from sklearn.ensemble import VotingClassifier
estimators = []
model1 = AdaBoostClassifier()
estimators.append(('ADA', model1))
model2 = GradientBoostingClassifier()
estimators.append(('GBC', model2))
model3 = XGBClassifier()
estimators.append(('XGB', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

0.8429160376786186


## Hypertuning

In [18]:
gb=GradientBoostingClassifier(n_estimators=1000,learning_rate = 0.05,max_features=None,max_depth=3,random_state = None)
gb.fit(X_train, y_train)
print("Accuracy (training): {0:.3f}".format(gb.score(X_train, y_train)))
print("Accuracy (validation): {0:.3f}".format(gb.score(X_test, y_test)))
print()

Accuracy (training): 0.875
Accuracy (validation): 0.863



## Confusion Matrix and Classification Report

In [19]:
predicted = gb.predict(X_test)
matrix = confusion_matrix(y_test, predicted)
report = classification_report(y_test, predicted)
print("\033[1m"+"\nConfusion Matrix\n"+"\033[0m")
print(matrix)
print("\033[1m"+"\nClassification Report\n"+"\033[0m")
print(report)

[1m
Confusion Matrix
[0m
[[4240  267]
 [ 557  969]]
[1m
Classification Report
[0m
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4507
           1       0.78      0.63      0.70      1526

    accuracy                           0.86      6033
   macro avg       0.83      0.79      0.81      6033
weighted avg       0.86      0.86      0.86      6033



## Creating Picking file for web page creation and deployment.

In [20]:
import pickle
pickle.dump(gb, open("model.pkl","wb"))