In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report



In [2]:
### Loading the data sets
columns = ['Age','Workclass','fnlgwt','Education','Education num','Marital Status',
           'Occupation','Relationship','Race','Sex','Capital Gain','Capital Loss',
           'Hours/Week','Native country','Income']
train = pd.read_csv('adult-training.csv', names=columns)
test = pd.read_csv('adult-test.csv', names=columns, skiprows=1)

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
Age               32561 non-null int64
Workclass         32561 non-null object
fnlgwt            32561 non-null int64
Education         32561 non-null object
Education num     32561 non-null int64
Marital Status    32561 non-null object
Occupation        32561 non-null object
Relationship      32561 non-null object
Race              32561 non-null object
Sex               32561 non-null object
Capital Gain      32561 non-null int64
Capital Loss      32561 non-null int64
Hours/Week        32561 non-null int64
Native country    32561 non-null object
Income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
train.describe()

Unnamed: 0,Age,fnlgwt,Education num,Capital Gain,Capital Loss,Hours/Week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
train.head(10)

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Native country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [6]:
train.isnull().count()

Age               32561
Workclass         32561
fnlgwt            32561
Education         32561
Education num     32561
Marital Status    32561
Occupation        32561
Relationship      32561
Race              32561
Sex               32561
Capital Gain      32561
Capital Loss      32561
Hours/Week        32561
Native country    32561
Income            32561
dtype: int64

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
hmap = train.corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(hmap, vmax=.8,annot=True,cmap="BrBG", square=True);

In [8]:
train_clean = train.replace(' ?', np.nan).dropna()
test_clean = test.replace(' ?', np.nan).dropna()

In [9]:
train.isnull().values.any()

False

# #Fit scaler on train data only. Transform training and testing set

In [10]:
numerical_col = ["Age", "fnlgwt", "Education num", "Capital Gain",
                     "Capital Loss", "Hours/Week"]
scaler = StandardScaler()
train_clean[numerical_col] = scaler.fit_transform(train_clean[numerical_col])
test_clean[numerical_col] = scaler.transform(test_clean[numerical_col])

In [11]:
## splitting DataSet
Y_train = train_clean["Income"]
X_train = train_clean.drop("Income", axis=1)

Y_test = test_clean["Income"]
X_test = test_clean.drop("Income", axis=1)

In [12]:
data = pd.concat([X_train,X_test])
dataEncoded = pd.get_dummies(data)           # One hot Encoding generates over 104 Columns xD

#print (dataEncoded.head())
X_trainEncoded = dataEncoded[:len(X_train)]
X_testEncoded = dataEncoded[len(X_train):]

print (Y_train.head(10))

Y_trainEncoded = Y_train.replace([' <=50K',' >50K' ] , [0,1] )
Y_testEncoded = Y_test.replace([' <=50K.' , ' >50K.'] , [0,1])

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
5     <=50K
6     <=50K
7      >50K
8      >50K
9      >50K
Name: Income, dtype: object


In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
clf = None
parameters = {}

clf = GaussianNB()


clf_gs = GridSearchCV(clf ,parameters, verbose=1)

clf_gs.fit(X_trainEncoded,Y_trainEncoded)

Y_pred = clf_gs.predict(X_testEncoded)
#print (Y_pred)

print(confusion_matrix(Y_testEncoded,Y_pred))

acc = accuracy_score(Y_testEncoded, Y_pred)

print ("Naive Bayes Approach")
print("Model Accuracy: ",acc*100.0)
precision= precision_score(Y_testEncoded, Y_pred, average= "weighted")  
print("Model precision:", precision*100)
print("recall score:", recall_score(Y_testEncoded, Y_pred, average='weighted')*100)

scores = cross_val_score(clf_gs, X_testEncoded, Y_testEncoded, cv=5)
scores.mean()

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s finished


[[5077 6283]
 [ 251 3449]]
Naive Bayes Approach
Model Accuracy:  56.613545816733065
Model precision: 80.58503935526056
recall score: 56.613545816733065
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s finished


0.5471447543160691

In [26]:
lrn = LogisticRegression(penalty = 'l1', C = .001, class_weight='balanced')

lrn.fit(X_trainEncoded, Y_trainEncoded)
Y_pred = lrn.predict(X_testEncoded)
data1=(confusion_matrix(Y_testEncoded,Y_pred))

acc = accuracy_score(Y_testEncoded, Y_pred)

print ("Logistic Regression Approach")
print("Model Accuracy: ",acc*100.0)
precision= precision_score(Y_testEncoded, Y_pred, average= "weighted")  
print("Model precision:", precision*100)
print("recall score:", recall_score(Y_testEncoded, Y_pred, average='weighted')*100)

scores = cross_val_score(lrn, X_testEncoded, Y_testEncoded, cv=5)
scores

Logistic Regression Approach
Model Accuracy:  75.1792828685259
Model precision: 84.34969480378861
recall score: 75.1792828685259


array([0.65737052, 0.64741036, 0.64010624, 0.6500664 , 0.65205843])

In [27]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

In [28]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_trainEncoded,Y_trainEncoded)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [29]:
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
 max_depth=3, min_samples_leaf=5)
clf_entropy.fit(X_trainEncoded,Y_trainEncoded)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [30]:
y_pred = clf_gini.predict(X_testEncoded)
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [31]:
print(" Model Accuracy is ", accuracy_score(Y_testEncoded,y_pred)*100)

 Model Accuracy is  83.92430278884461


In [32]:
data1=(confusion_matrix(Y_testEncoded,Y_pred))
data1

array([[7987, 3373],
       [ 365, 3335]], dtype=int64)

In [64]:
precision= precision_score(Y_testEncoded, y_pred, average= "weighted")  
print("Model precision:", precision*100)
print("recall score:", recall_score(Y_testEncoded, y_pred, average='weighted')*100)

Model precision: 83.14452460189196
recall score: 83.92430278884461


In [33]:
scores = cross_val_score(clf_gs, X_testEncoded, Y_testEncoded, cv=5)
scores.mean()

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


0.5471447543160691