In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,ExtraTreesClassifier
import category_encoders as ce
from sklearn.feature_selection import SelectKBest, chi2,VarianceThreshold
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("income.csv")

In [2]:
data.shape

(32561, 15)

In [3]:
data=data.replace('?', np.nan)
data.dropna(axis = 0,inplace = True)

In [4]:
data.shape

(30162, 15)

## Label Encoding using Binary Encoder for Categorical Variable

In [5]:
#Handle native,country feature using Count/Frequency Encoding
data = data.rename(columns={'native.country': 'native'})
native_counts = data.native.value_counts().to_dict()
data.native=data.native.map(native_counts).astype(int)
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,27504,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,27504,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,27504,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,27504,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,27504,<=50K


In [6]:
lb_make = LabelEncoder()
for col in data.columns:
    if (data[col].dtypes=='object'):
        print(col, len(data[col].unique()), ' unique values')
        if (len(data[col].unique()))==2:
            data[col]=lb_make.fit_transform(data[col]).astype(int)
        elif len(data[col].unique())>2 and len(data[col].unique())<=16:
            ce_bin = ce.BinaryEncoder(data[col])
            bin_col = ce_bin.fit_transform(data[col])
            data = pd.concat([data, bin_col],axis="columns")

workclass 7  unique values
education 16  unique values
marital.status 7  unique values
occupation 14  unique values
relationship 6  unique values
race 5  unique values
sex 2  unique values
income 2  unique values


In [7]:
vals_to_replace_income = {'>50K':'1', '<=50K':'0'}

In [8]:
data=data.select_dtypes(include = np.number)
data.head(5)

Unnamed: 0,age,fnlwgt,education.num,sex,capital.gain,capital.loss,hours.per.week,native,income,workclass_0,...,occupation_3,occupation_4,relationship_0,relationship_1,relationship_2,relationship_3,race_0,race_1,race_2,race_3
1,82,132870,9,0,0,4356,18,27504,0,0,...,0,1,0,0,0,1,0,0,0,1
3,54,140359,4,0,0,3900,40,27504,0,0,...,1,0,0,0,1,0,0,0,0,1
4,41,264663,10,0,0,3900,40,27504,0,0,...,1,1,0,0,1,1,0,0,0,1
5,34,216864,9,0,0,3770,45,27504,0,0,...,0,0,0,0,1,0,0,0,0,1
6,38,150601,6,1,0,3770,40,27504,0,0,...,0,1,0,0,1,0,0,0,0,1


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 1 to 32560
Data columns (total 35 columns):
age                 30162 non-null int64
fnlwgt              30162 non-null int64
education.num       30162 non-null int64
sex                 30162 non-null int32
capital.gain        30162 non-null int64
capital.loss        30162 non-null int64
hours.per.week      30162 non-null int64
native              30162 non-null int32
income              30162 non-null int32
workclass_0         30162 non-null int64
workclass_1         30162 non-null int64
workclass_2         30162 non-null int64
workclass_3         30162 non-null int64
education_0         30162 non-null int64
education_1         30162 non-null int64
education_2         30162 non-null int64
education_3         30162 non-null int64
education_4         30162 non-null int64
marital.status_0    30162 non-null int64
marital.status_1    30162 non-null int64
marital.status_2    30162 non-null int64
marital.status_3    30162 non-

In [10]:
X=data.drop(['income'], axis=1)
y=data['income']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)

## Model Preparation

In [12]:
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('RIDGE', RidgeClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DTC', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))
models.append(('BNB', BernoulliNB()))
models.append(('SGD', SGDClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('RFC', RandomForestClassifier()))
models.append(('XGB', XGBClassifier()))
models.append(('GBC', GradientBoostingClassifier()))

## MODEL EVALUATION

In [13]:
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.746749 (0.141488)
RIDGE: 0.805826 (0.111650)
LDA: 0.813482 (0.086389)
KNN: 0.730599 (0.116913)
DTC: 0.779793 (0.040790)
GNB: 0.784833 (0.036922)
BNB: 0.748528 (0.024727)
SGD: 0.777272 (0.041379)
ADA: 0.842651 (0.010588)
RFC: 0.813513 (0.063176)
XGB: 0.846629 (0.010990)
GBC: 0.844641 (0.016876)


In [14]:
num_trees = 100
max_features = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features)
results = model_selection.cross_val_score(model, X, y, cv=kfold)
print(results.mean())

0.8051927670697172


In [15]:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
from sklearn.ensemble import VotingClassifier
estimators = []
model1 = AdaBoostClassifier()
estimators.append(('ADA', model1))
model2 = GradientBoostingClassifier()
estimators.append(('GBC', model2))
model3 = XGBClassifier()
estimators.append(('XGB', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, y, cv=kfold)
print(results.mean())

0.846165176730622


In [16]:
gb=GradientBoostingClassifier(n_estimators=1000,learning_rate = 0.05,max_features=None,max_depth=3,random_state = None)
gb.fit(X_train, y_train)
print("Accuracy (training): {0:.3f}".format(gb.score(X_train, y_train)))
print("Accuracy (validation): {0:.3f}".format(gb.score(X_test, y_test)))
print()

Accuracy (training): 0.884
Accuracy (validation): 0.870



In [18]:
import pickle
pickle.dump(gb, open("model.pkl","wb"))