# Adult income Prediction using KNN 

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Reading data
data = pd.read_csv("adult.csv")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [52]:
# data analysis phase

In [3]:
data.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [4]:
data.shape

(48842, 15)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [6]:
data_cat = [i for i in data.columns if data[i].dtype =='O']
data_cat

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'native-country',
 'income']

In [7]:
data_disc = [i for i in data.columns if len(data[i].unique())<25 and i not in data_cat]
data_disc

['educational-num']

In [8]:
data_cont = [ i for i in data.columns if len(data[i].unique())>25]
data_cont

['age',
 'fnlwgt',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [9]:
for i in data_cat:
    print(i,len(data[i].unique()))
    print(data[i].unique())

workclass 9
['Private' 'Local-gov' '?' 'Self-emp-not-inc' 'Federal-gov' 'State-gov'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
education 16
['11th' 'HS-grad' 'Assoc-acdm' 'Some-college' '10th' 'Prof-school'
 '7th-8th' 'Bachelors' 'Masters' 'Doctorate' '5th-6th' 'Assoc-voc' '9th'
 '12th' '1st-4th' 'Preschool']
marital-status 7
['Never-married' 'Married-civ-spouse' 'Widowed' 'Divorced' 'Separated'
 'Married-spouse-absent' 'Married-AF-spouse']
occupation 15
['Machine-op-inspct' 'Farming-fishing' 'Protective-serv' '?'
 'Other-service' 'Prof-specialty' 'Craft-repair' 'Adm-clerical'
 'Exec-managerial' 'Tech-support' 'Sales' 'Priv-house-serv'
 'Transport-moving' 'Handlers-cleaners' 'Armed-Forces']
relationship 6
['Own-child' 'Husband' 'Not-in-family' 'Unmarried' 'Wife' 'Other-relative']
race 5
['Black' 'White' 'Asian-Pac-Islander' 'Other' 'Amer-Indian-Eskimo']
gender 2
['Male' 'Female']
native-country 42
['United-States' '?' 'Peru' 'Guatemala' 'Mexico' 'Dominican-Republic'
 'Ireland' 'Germ

In [53]:
# encoding all binary classified categorical data

In [10]:
data['gender'] = data['gender'].map({'Male' : 1,'Female' : 0})

In [11]:
data['income'] = data['income'].map({'<=50K':0,'>50K':1})

In [12]:
# removing null values from 'native-country','occupation' and 'workclass'
for i in ['native-country','occupation','workclass']:
    data[i] = np.where(data[i]=='?',data[i].mode(),data[i])

In [13]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,1,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,1,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,1,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,1,7688,0,40,United-States,1
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,0,0,0,30,United-States,0


In [14]:
data_cat.remove('gender')

In [None]:
# label encoding to all other multi class categorical data

In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [16]:
df = data.copy()
for i in data_cat:
    le.fit(df[i])
    df[i] = le.transform(df[i])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,3,226802,1,7,4,6,3,2,1,0,0,40,38,0
1,38,3,89814,11,9,2,4,0,4,1,0,0,50,38,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,38,1
3,44,3,160323,15,10,2,6,0,2,1,7688,0,40,38,1
4,18,3,103497,15,10,4,9,3,4,0,0,0,30,38,0


In [17]:
data=df

In [18]:
# preparing dependant and independant features
x = data.drop('income',axis=1)
y = data['income']

In [19]:
# standard scaling whole data
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
x_scaled = scale.fit_transform(x)

In [20]:
# finding vif factor for all features
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif["vif"] = [variance_inflation_factor(x_scaled,i) for i in range(x_scaled.shape[1])]
vif["Features"] = x.columns

vif

Unnamed: 0,vif,Features
0,1.15728,age
1,1.010449,workclass
2,1.01298,fnlwgt
3,1.15895,education
4,1.216647,educational-num
5,1.12443,marital-status
6,1.014318,occupation
7,1.675179,relationship
8,1.034084,race
9,1.551937,gender


In [21]:
# train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=36)

In [22]:
# model declaration and model training
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(x_train,y_train)

KNeighborsClassifier()

In [50]:
# predicting values from trained model and storing it in y_pred
y_pred = model.predict(x_test)

In [25]:
# accuracy of model
model.score(x_test,y_test)

0.7755143822295015

In [42]:
# making a dictionary for the perspective of grid search CV.
pgrid = {
        'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
        'leaf_size' : [18,20,23,25],
        'n_neighbors' : [5,7,9,11]
        }

In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
# calling grid search object
gs = GridSearchCV(model,pgrid,verbose=3,cv=10, scoring='accuracy')

In [45]:
# getting grid search object trained and to perform hyperparameter tuning
gs.fit(x_train,y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.769, total=   6.6s
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s


[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.779, total=   5.5s
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.1s remaining:    0.0s


[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.770, total=   6.2s
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................
[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.777, total=   5.8s
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................
[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.770, total=   6.6s
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................
[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.778, total=   7.1s
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................
[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.771, total=   6.2s
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................
[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.777, total=   6.4s
[CV] algorithm=ball_tree, leaf_size=18, n_neighbors=5 ................
[CV]  algorithm=ball_tree, leaf_size=18, n_neighbors=5, score=0.781, total= 

[CV]  algorithm=ball_tree, leaf_size=20, n_neighbors=7, score=0.783, total=   6.6s
[CV] algorithm=ball_tree, leaf_size=20, n_neighbors=7 ................
[CV]  algorithm=ball_tree, leaf_size=20, n_neighbors=7, score=0.778, total=   6.0s
[CV] algorithm=ball_tree, leaf_size=20, n_neighbors=7 ................
[CV]  algorithm=ball_tree, leaf_size=20, n_neighbors=7, score=0.790, total=   5.7s
[CV] algorithm=ball_tree, leaf_size=20, n_neighbors=7 ................
[CV]  algorithm=ball_tree, leaf_size=20, n_neighbors=7, score=0.792, total=   6.1s
[CV] algorithm=ball_tree, leaf_size=20, n_neighbors=9 ................
[CV]  algorithm=ball_tree, leaf_size=20, n_neighbors=9, score=0.781, total=   6.0s
[CV] algorithm=ball_tree, leaf_size=20, n_neighbors=9 ................
[CV]  algorithm=ball_tree, leaf_size=20, n_neighbors=9, score=0.790, total=   7.2s
[CV] algorithm=ball_tree, leaf_size=20, n_neighbors=9 ................
[CV]  algorithm=ball_tree, leaf_size=20, n_neighbors=9, score=0.782, total= 

[CV]  algorithm=ball_tree, leaf_size=23, n_neighbors=11, score=0.788, total=   5.5s
[CV] algorithm=ball_tree, leaf_size=23, n_neighbors=11 ...............
[CV]  algorithm=ball_tree, leaf_size=23, n_neighbors=11, score=0.794, total=   5.4s
[CV] algorithm=ball_tree, leaf_size=23, n_neighbors=11 ...............
[CV]  algorithm=ball_tree, leaf_size=23, n_neighbors=11, score=0.788, total=   5.3s
[CV] algorithm=ball_tree, leaf_size=23, n_neighbors=11 ...............
[CV]  algorithm=ball_tree, leaf_size=23, n_neighbors=11, score=0.797, total=   5.4s
[CV] algorithm=ball_tree, leaf_size=23, n_neighbors=11 ...............
[CV]  algorithm=ball_tree, leaf_size=23, n_neighbors=11, score=0.794, total=   5.5s
[CV] algorithm=ball_tree, leaf_size=23, n_neighbors=11 ...............
[CV]  algorithm=ball_tree, leaf_size=23, n_neighbors=11, score=0.798, total=   5.6s
[CV] algorithm=ball_tree, leaf_size=23, n_neighbors=11 ...............
[CV]  algorithm=ball_tree, leaf_size=23, n_neighbors=11, score=0.789, 

[CV]  algorithm=kd_tree, leaf_size=18, n_neighbors=5, score=0.770, total=   5.0s
[CV] algorithm=kd_tree, leaf_size=18, n_neighbors=5 ..................
[CV]  algorithm=kd_tree, leaf_size=18, n_neighbors=5, score=0.778, total=   4.9s
[CV] algorithm=kd_tree, leaf_size=18, n_neighbors=5 ..................
[CV]  algorithm=kd_tree, leaf_size=18, n_neighbors=5, score=0.771, total=   5.0s
[CV] algorithm=kd_tree, leaf_size=18, n_neighbors=5 ..................
[CV]  algorithm=kd_tree, leaf_size=18, n_neighbors=5, score=0.777, total=   5.1s
[CV] algorithm=kd_tree, leaf_size=18, n_neighbors=5 ..................
[CV]  algorithm=kd_tree, leaf_size=18, n_neighbors=5, score=0.781, total=   5.2s
[CV] algorithm=kd_tree, leaf_size=18, n_neighbors=5 ..................
[CV]  algorithm=kd_tree, leaf_size=18, n_neighbors=5, score=0.783, total=   5.1s
[CV] algorithm=kd_tree, leaf_size=18, n_neighbors=7 ..................
[CV]  algorithm=kd_tree, leaf_size=18, n_neighbors=7, score=0.776, total=   5.0s
[CV] al

[CV]  algorithm=kd_tree, leaf_size=20, n_neighbors=7, score=0.790, total=   6.7s
[CV] algorithm=kd_tree, leaf_size=20, n_neighbors=7 ..................
[CV]  algorithm=kd_tree, leaf_size=20, n_neighbors=7, score=0.792, total=   5.1s
[CV] algorithm=kd_tree, leaf_size=20, n_neighbors=9 ..................
[CV]  algorithm=kd_tree, leaf_size=20, n_neighbors=9, score=0.781, total=   5.5s
[CV] algorithm=kd_tree, leaf_size=20, n_neighbors=9 ..................
[CV]  algorithm=kd_tree, leaf_size=20, n_neighbors=9, score=0.790, total=   5.3s
[CV] algorithm=kd_tree, leaf_size=20, n_neighbors=9 ..................
[CV]  algorithm=kd_tree, leaf_size=20, n_neighbors=9, score=0.782, total=   5.3s
[CV] algorithm=kd_tree, leaf_size=20, n_neighbors=9 ..................
[CV]  algorithm=kd_tree, leaf_size=20, n_neighbors=9, score=0.792, total=   5.0s
[CV] algorithm=kd_tree, leaf_size=20, n_neighbors=9 ..................
[CV]  algorithm=kd_tree, leaf_size=20, n_neighbors=9, score=0.790, total=   5.0s
[CV] al

[CV]  algorithm=kd_tree, leaf_size=23, n_neighbors=11, score=0.788, total=   5.1s
[CV] algorithm=kd_tree, leaf_size=23, n_neighbors=11 .................
[CV]  algorithm=kd_tree, leaf_size=23, n_neighbors=11, score=0.797, total=   5.0s
[CV] algorithm=kd_tree, leaf_size=23, n_neighbors=11 .................
[CV]  algorithm=kd_tree, leaf_size=23, n_neighbors=11, score=0.794, total=   5.2s
[CV] algorithm=kd_tree, leaf_size=23, n_neighbors=11 .................
[CV]  algorithm=kd_tree, leaf_size=23, n_neighbors=11, score=0.798, total=   5.2s
[CV] algorithm=kd_tree, leaf_size=23, n_neighbors=11 .................
[CV]  algorithm=kd_tree, leaf_size=23, n_neighbors=11, score=0.789, total=   5.2s
[CV] algorithm=kd_tree, leaf_size=23, n_neighbors=11 .................
[CV]  algorithm=kd_tree, leaf_size=23, n_neighbors=11, score=0.792, total=   5.2s
[CV] algorithm=kd_tree, leaf_size=23, n_neighbors=11 .................
[CV]  algorithm=kd_tree, leaf_size=23, n_neighbors=11, score=0.801, total=   5.6s


[CV]  algorithm=brute, leaf_size=18, n_neighbors=5, score=0.771, total=  18.1s
[CV] algorithm=brute, leaf_size=18, n_neighbors=5 ....................
[CV]  algorithm=brute, leaf_size=18, n_neighbors=5, score=0.777, total=  17.5s
[CV] algorithm=brute, leaf_size=18, n_neighbors=5 ....................
[CV]  algorithm=brute, leaf_size=18, n_neighbors=5, score=0.782, total=  21.3s
[CV] algorithm=brute, leaf_size=18, n_neighbors=5 ....................
[CV]  algorithm=brute, leaf_size=18, n_neighbors=5, score=0.783, total=  31.3s
[CV] algorithm=brute, leaf_size=18, n_neighbors=7 ....................
[CV]  algorithm=brute, leaf_size=18, n_neighbors=7, score=0.776, total=  19.9s
[CV] algorithm=brute, leaf_size=18, n_neighbors=7 ....................
[CV]  algorithm=brute, leaf_size=18, n_neighbors=7, score=0.785, total=  17.2s
[CV] algorithm=brute, leaf_size=18, n_neighbors=7 ....................
[CV]  algorithm=brute, leaf_size=18, n_neighbors=7, score=0.778, total=  15.5s
[CV] algorithm=brute,

[CV]  algorithm=brute, leaf_size=20, n_neighbors=9, score=0.790, total=  16.6s
[CV] algorithm=brute, leaf_size=20, n_neighbors=9 ....................
[CV]  algorithm=brute, leaf_size=20, n_neighbors=9, score=0.782, total=  18.0s
[CV] algorithm=brute, leaf_size=20, n_neighbors=9 ....................
[CV]  algorithm=brute, leaf_size=20, n_neighbors=9, score=0.792, total=  16.2s
[CV] algorithm=brute, leaf_size=20, n_neighbors=9 ....................
[CV]  algorithm=brute, leaf_size=20, n_neighbors=9, score=0.790, total=  16.5s
[CV] algorithm=brute, leaf_size=20, n_neighbors=9 ....................
[CV]  algorithm=brute, leaf_size=20, n_neighbors=9, score=0.788, total=  17.5s
[CV] algorithm=brute, leaf_size=20, n_neighbors=9 ....................
[CV]  algorithm=brute, leaf_size=20, n_neighbors=9, score=0.783, total=  21.7s
[CV] algorithm=brute, leaf_size=20, n_neighbors=9 ....................
[CV]  algorithm=brute, leaf_size=20, n_neighbors=9, score=0.784, total=  17.4s
[CV] algorithm=brute,

[CV]  algorithm=brute, leaf_size=23, n_neighbors=11, score=0.789, total=  16.6s
[CV] algorithm=brute, leaf_size=23, n_neighbors=11 ...................
[CV]  algorithm=brute, leaf_size=23, n_neighbors=11, score=0.792, total=  16.7s
[CV] algorithm=brute, leaf_size=23, n_neighbors=11 ...................
[CV]  algorithm=brute, leaf_size=23, n_neighbors=11, score=0.801, total=  16.5s
[CV] algorithm=brute, leaf_size=23, n_neighbors=11 ...................
[CV]  algorithm=brute, leaf_size=23, n_neighbors=11, score=0.800, total=  14.8s
[CV] algorithm=brute, leaf_size=25, n_neighbors=5 ....................
[CV]  algorithm=brute, leaf_size=25, n_neighbors=5, score=0.769, total=  15.9s
[CV] algorithm=brute, leaf_size=25, n_neighbors=5 ....................
[CV]  algorithm=brute, leaf_size=25, n_neighbors=5, score=0.779, total=  18.1s
[CV] algorithm=brute, leaf_size=25, n_neighbors=5 ....................
[CV]  algorithm=brute, leaf_size=25, n_neighbors=5, score=0.770, total=  18.0s
[CV] algorithm=br

[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed: 76.5min finished


GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [18, 20, 23, 25],
                         'n_neighbors': [5, 7, 9, 11]},
             scoring='accuracy', verbose=3)

In [46]:
# obtaining best parameters from grid search
gs.best_params_

{'algorithm': 'ball_tree', 'leaf_size': 18, 'n_neighbors': 11}

In [47]:
# model training over best parameters
model = KNeighborsClassifier(algorithm='ball_tree',leaf_size=18,n_neighbors=11)
model.fit(x_train,y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=18, n_neighbors=11)

In [49]:
# getting accuracy on model
print("accuracy on training set:", model.score(x_train,y_train))
print("accuracy on test set:", model.score(x_test,y_test))

accuracy on training set: 0.8135029304123051
accuracy on test set: 0.7920974511208926


In [51]:
import pickle
pickle.dump(model,open("adult_income_prediction.pkl","wb"))