# Predict whether income exceeds 50K/yr based on census data

## Importing Libraries

In [114]:
import numpy as np
import pandas as pd

## Importing Datasets

In [115]:
# header=None: the first row of the file is read as data, not as column names,
# so columns are addressed by position from here on.
dataset = pd.read_csv('adult.csv', header=None)

# Everything except the last column is a feature; the last column is the target.
features, target = dataset.iloc[:, :-1], dataset.iloc[:, -1]
x, y = features.values, target.values

In [116]:
# Quick look at the raw feature matrix (NumPy abbreviates large arrays with "...").
print(x)

[[39 ' State-gov' 77516 ... 0 40 ' United-States']
 [50 ' Self-emp-not-inc' 83311 ... 0 13 ' United-States']
 [38 ' Private' 215646 ... 0 40 ' United-States']
 ...
 [58 ' Private' 151910 ... 0 40 ' United-States']
 [22 ' Private' 201490 ... 0 20 ' United-States']
 [52 ' Self-emp-inc' 287927 ... 0 40 ' United-States']]


# Data Preprocessing

## Dataset Cleaning

In [117]:
from sklearn.impute import SimpleImputer

# Replace NaN entries in every column except the first with that column's
# most frequent value.
# NOTE(review): only real NaN values are targeted. If adult.csv marks missing
# entries with a placeholder string (e.g. ' ?'), they pass through untouched
# and end up as their own category in the encoding step - confirm against the data.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x[:, 1:] = imputer.fit_transform(x[:, 1:])

## Label Encoding

In [118]:
from sklearn.preprocessing import LabelEncoder

# One encoder per text column (named after the column index) so each fitted
# mapping stays available for later look-ups; `le` handles the target.
le1, le3, le5, le6, le7, le8, le9, le13 = (LabelEncoder() for _ in range(8))
le = LabelEncoder()

# Replace each text column of x, in place, with its integer codes.
for column, encoder in (
    (1, le1),
    (3, le3),
    (5, le5),
    (6, le6),
    (7, le7),
    (8, le8),
    (9, le9),
    (13, le13),
):
    x[:, column] = encoder.fit_transform(x[:, column])

# Encode the target labels as integers as well.
y = le.fit_transform(y)

In [119]:
# Feature matrix after label encoding: the text columns now hold integer codes.
print(x)

[[39 7 77516 ... 0 40 39]
 [50 6 83311 ... 0 13 39]
 [38 4 215646 ... 0 40 39]
 ...
 [58 4 151910 ... 0 40 39]
 [22 4 201490 ... 0 20 39]
 [52 5 287927 ... 0 40 39]]


In [120]:
# Target vector after label encoding.
print(y)

[0 0 0 ... 0 0 1]


## Splitting Dataset into Training set and Test set

In [121]:
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.2  # share of rows held out for evaluation
SPLIT_SEED = 0       # fixed seed so the split is reproducible

X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

## Feature Scaling

In [122]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so the test set does not influence them.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Training the Model

In [123]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier with the library's default hyper-parameters.
classifier = XGBClassifier()

# Kept as the cell's last expression so the notebook displays the fitted estimator.
classifier.fit(X=X_train, y=Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## Making Confusion Matrix

In [124]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the held-out split and tabulate true labels against predictions.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
print(cm)

# Last expression of the cell, so the notebook displays the accuracy value.
accuracy_score(y_true=Y_test, y_pred=y_pred)

[[4635  283]
 [ 631  964]]


0.8596652848149854

In [125]:
# Predicted class codes for the test split.
print(y_pred)

[0 0 0 ... 1 0 0]


## Applying K-fold Cross Validation

In [126]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split only; the test split stays untouched.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=Y_train, cv=10)

# Report mean and spread of the fold scores as percentages.
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print('Accuracy: {:.2f} Standard Deviation: {:.2f}'.format(mean_pct, std_pct))

Accuracy: 86.38 Standard Deviation: 0.48


## Predicting Test set Result

In [127]:
# Show predictions (left column) next to the true labels (right column).
print(np.column_stack((y_pred, Y_test)))

[[0 0]
 [0 0]
 [0 0]
 ...
 [1 1]
 [0 0]
 [0 1]]


### Single Prediction
Example person: age=40, workclass=Private, employ_inc=80000,
education=Bachelors, 9, marital status=Divorced, occupation=Exec-managerial, relationship=Husband,
race=White, sex=Male, 0, 1000, 50, native country=United-States

In [130]:
# Describe the person with the original category labels and let the fitted
# encoders translate them. Hard-coded integer codes would silently point at the
# wrong category if the encoders were ever refit on different data.
# The labels carry the same leading space as the values in the raw file.
sample = [[
    40,                                        # column 0
    le1.transform([' Private'])[0],            # column 1
    80000,                                     # column 2
    le3.transform([' Bachelors'])[0],          # column 3
    9,                                         # column 4, kept from the original example
                                               # NOTE(review): confirm it is consistent with ' Bachelors'
    le5.transform([' Divorced'])[0],           # column 5
    le6.transform([' Exec-managerial'])[0],    # column 6
    le7.transform([' Husband'])[0],            # column 7
    le8.transform([' White'])[0],              # column 8
    le9.transform([' Male'])[0],               # column 9
    0,                                         # column 10
    1000,                                      # column 11
    50,                                        # column 12
    le13.transform([' United-States'])[0],     # column 13
]]

# Apply the same scaling used for training before asking the model.
result = classifier.predict(sc.transform(sample))

# predict returns an array with one entry per input row; compare that single
# scalar explicitly instead of relying on the truthiness of an array comparison.
if result[0] == 0:
    print('Person makes Below 50K/year')
else:
    print('Person makes Above 50K/year')

Person makes Below 50K/year
