In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats

from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


In [2]:
data = pd.read_csv('data/adult.csv', header=None, skipinitialspace=True)

In [3]:
data.columns = ['age','workclass','fnlwgt','education','educationNum','marital',
           'occupation','relationship','race','sex','capGain','capLoss',
          'HoursWeek','NativeCountry','category']


In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,marital,occupation,relationship,race,sex,capGain,capLoss,HoursWeek,NativeCountry,category
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data.loc[data.category == '<=50K', 'category'] = 0
data.loc[data.category == '>50K', 'category'] = 1

In [6]:
data.category.head()

0    0
1    0
2    0
3    0
4    0
Name: category, dtype: int64

In [7]:
data.category.tail()

32556    0
32557    1
32558    0
32559    0
32560    1
Name: category, dtype: int64

In [8]:
# 7406609000 whatsapp of Ashok

In [9]:
data.category

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        1
8        1
9        1
10       1
11       1
12       0
13       0
14       1
15       0
16       0
17       0
18       0
19       1
20       1
21       0
22       0
23       0
24       0
25       1
26       0
27       1
28       0
29       0
        ..
32531    0
32532    1
32533    1
32534    0
32535    0
32536    1
32537    0
32538    1
32539    1
32540    0
32541    0
32542    0
32543    0
32544    0
32545    1
32546    0
32547    0
32548    0
32549    0
32550    0
32551    0
32552    0
32553    0
32554    1
32555    0
32556    0
32557    1
32558    0
32559    0
32560    1
Name: category, Length: 32561, dtype: int64

In [10]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'marital',
       'occupation', 'relationship', 'race', 'sex', 'capGain', 'capLoss',
       'HoursWeek', 'NativeCountry', 'category'],
      dtype='object')

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age              32561 non-null int64
workclass        32561 non-null object
fnlwgt           32561 non-null int64
education        32561 non-null object
educationNum     32561 non-null int64
marital          32561 non-null object
occupation       32561 non-null object
relationship     32561 non-null object
race             32561 non-null object
sex              32561 non-null object
capGain          32561 non-null int64
capLoss          32561 non-null int64
HoursWeek        32561 non-null int64
NativeCountry    32561 non-null object
category         32561 non-null int64
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [12]:
data.replace(['$', '?', '*', '-'], np.nan, inplace=True)

In [13]:
object_types = data.dtypes[data.dtypes == 'object']

In [14]:

for i in object_types.index:
    print(i)
    print(set(data[i]))
    # data[i] = labelencoder.fit_transform(data[i])

workclass
{nan, 'Without-pay', 'Local-gov', 'Private', 'State-gov', 'Self-emp-not-inc', 'Never-worked', 'Self-emp-inc', 'Federal-gov'}
education
{'1st-4th', 'Bachelors', 'Doctorate', 'Preschool', '5th-6th', 'Masters', 'Assoc-acdm', '11th', 'HS-grad', '12th', '9th', 'Some-college', 'Assoc-voc', '7th-8th', 'Prof-school', '10th'}
marital
{'Married-spouse-absent', 'Never-married', 'Divorced', 'Married-civ-spouse', 'Married-AF-spouse', 'Widowed', 'Separated'}
occupation
{'Craft-repair', nan, 'Priv-house-serv', 'Adm-clerical', 'Exec-managerial', 'Protective-serv', 'Transport-moving', 'Armed-Forces', 'Other-service', 'Farming-fishing', 'Machine-op-inspct', 'Tech-support', 'Handlers-cleaners', 'Sales', 'Prof-specialty'}
relationship
{'Unmarried', 'Not-in-family', 'Husband', 'Own-child', 'Wife', 'Other-relative'}
race
{'Asian-Pac-Islander', 'Other', 'Amer-Indian-Eskimo', 'White', 'Black'}
sex
{'Female', 'Male'}
NativeCountry
{nan, 'Cuba', 'El-Salvador', 'France', 'Japan', 'Hungary', 'Jamaica', 

In [15]:
all_nulls = data.isnull().sum()
all_nulls

age                 0
workclass        1836
fnlwgt              0
education           0
educationNum        0
marital             0
occupation       1843
relationship        0
race                0
sex                 0
capGain             0
capLoss             0
HoursWeek           0
NativeCountry     583
category            0
dtype: int64

In [16]:
all_not_nulls = data.notnull().sum()
all_not_nulls

age              32561
workclass        30725
fnlwgt           32561
education        32561
educationNum     32561
marital          32561
occupation       30718
relationship     32561
race             32561
sex              32561
capGain          32561
capLoss          32561
HoursWeek        32561
NativeCountry    31978
category         32561
dtype: int64

In [17]:
def get_null_percentage(data):
    all_nulls = data.isnull().sum()
    all_not_nulls = data.notnull().sum()
    return all_nulls / all_not_nulls * 100

all_nulls / all_not_nulls * 100

age              0.000000
workclass        5.975590
fnlwgt           0.000000
education        0.000000
educationNum     0.000000
marital          0.000000
occupation       5.999740
relationship     0.000000
race             0.000000
sex              0.000000
capGain          0.000000
capLoss          0.000000
HoursWeek        0.000000
NativeCountry    1.823128
category         0.000000
dtype: float64

In [18]:
from collections import Counter

Counter(data['workclass'])

Counter({'State-gov': 1298,
         'Self-emp-not-inc': 2541,
         'Private': 22696,
         'Federal-gov': 960,
         'Local-gov': 2093,
         nan: 1836,
         'Self-emp-inc': 1116,
         'Without-pay': 14,
         'Never-worked': 7})

In [19]:
# replace the workclass with maximum occuring value

data['workclass'].replace(np.nan, 'Private', inplace=True)

In [20]:
Counter(data['occupation']).most_common()

[('Prof-specialty', 4140),
 ('Craft-repair', 4099),
 ('Exec-managerial', 4066),
 ('Adm-clerical', 3770),
 ('Sales', 3650),
 ('Other-service', 3295),
 ('Machine-op-inspct', 2002),
 (nan, 1843),
 ('Transport-moving', 1597),
 ('Handlers-cleaners', 1370),
 ('Farming-fishing', 994),
 ('Tech-support', 928),
 ('Protective-serv', 649),
 ('Priv-house-serv', 149),
 ('Armed-Forces', 9)]

In [21]:
data['occupation'].replace(np.nan, 'Prof-speciality', inplace=True)

In [22]:
data.dropna(inplace=True)

In [23]:
get_null_percentage(data)

age              0.0
workclass        0.0
fnlwgt           0.0
education        0.0
educationNum     0.0
marital          0.0
occupation       0.0
relationship     0.0
race             0.0
sex              0.0
capGain          0.0
capLoss          0.0
HoursWeek        0.0
NativeCountry    0.0
category         0.0
dtype: float64

In [24]:
object_types.index

Index(['workclass', 'education', 'marital', 'occupation', 'relationship',
       'race', 'sex', 'NativeCountry'],
      dtype='object')

In [25]:
# data1 = pd.get_dummies(data, columns=object_types.index)
# data1.head()


labelencoder = LabelEncoder()
for i in object_types.index:
    print(i)
    print(set(data[i]))
    data[i] = labelencoder.fit_transform(data[i])
    
    


workclass
{'Without-pay', 'Local-gov', 'Private', 'State-gov', 'Self-emp-not-inc', 'Never-worked', 'Self-emp-inc', 'Federal-gov'}
education
{'1st-4th', 'Bachelors', 'Doctorate', 'Preschool', '5th-6th', 'Masters', 'Assoc-acdm', '11th', 'HS-grad', '10th', '9th', 'Some-college', '7th-8th', 'Prof-school', '12th', 'Assoc-voc'}
marital
{'Married-spouse-absent', 'Never-married', 'Divorced', 'Married-civ-spouse', 'Married-AF-spouse', 'Widowed', 'Separated'}
occupation
{'Craft-repair', 'Priv-house-serv', 'Adm-clerical', 'Exec-managerial', 'Protective-serv', 'Transport-moving', 'Armed-Forces', 'Other-service', 'Prof-speciality', 'Farming-fishing', 'Machine-op-inspct', 'Tech-support', 'Handlers-cleaners', 'Sales', 'Prof-specialty'}
relationship
{'Unmarried', 'Not-in-family', 'Husband', 'Own-child', 'Wife', 'Other-relative'}
race
{'Asian-Pac-Islander', 'Other', 'Amer-Indian-Eskimo', 'White', 'Black'}
sex
{'Female', 'Male'}
NativeCountry
{'Cuba', 'El-Salvador', 'France', 'Japan', 'Hungary', 'Jamaic

In [26]:
X = data.iloc[:, :-1]
y = data.category

In [27]:
from sklearn.preprocessing import scale
X = pd.DataFrame(scale(X), columns=X.columns)



  


In [28]:
X.columns


Index(['age', 'workclass', 'fnlwgt', 'education', 'educationNum', 'marital',
       'occupation', 'relationship', 'race', 'sex', 'capGain', 'capLoss',
       'HoursWeek', 'NativeCountry'],
      dtype='object')

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

n_folds = 5
# models = [ LogisticRegression(), RandomForestClassifier(), SVC() ]
          
import warnings
warnings.filterwarnings('ignore')

def test_model(model, X, y, tuned_parameters):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = GridSearchCV(model, tuned_parameters, cv=n_folds)
    clf.fit(X_train, y_train)

    best_model = clf.best_estimator_
    
    y_predict = best_model.predict(X_test)
    
    score = accuracy_score(y_test, y_predict)
    cross_tab = pd.crosstab(y_test, y_predict)
    report = classification_report(y_test, y_predict)
    
    print(best_model)
    print(score)
    print(cross_tab)
    print(report)


models = [
    (RandomForestClassifier(), [{'n_estimators': [10, 15, 20],
                     'criterion': ['gini', 'entropy']
                    , 'max_depth': range(2, 15, 2)
                     , 'random_state': [7, 42]
                                 #,'max_features': [10]
                    }]),
    (MLPClassifier(), [{
        'max_iter': [100, 200],
        'hidden_layer_sizes': [(59, 50), (59), (100, 100)],
                      'learning_rate_init': [0.1, 0.001],
                      'random_state': [30],
              'alpha': [0.001, 0.1, 1]
            
    }]),
    (SVC(), [{
        'kernel': ['rbf', 'linear'],
        'C': [20, 150, 200],
        'gamma': [5, 10]
    }])
    
]
for model, tuned_parameters in models:
    test_model(model, X, y, tuned_parameters)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=14, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=7, verbose=0, warm_start=False)
0.8605378361475923
col_0        0    1
category           
0         4578  278
1          614  926
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4856
           1       0.77      0.60      0.67      1540

   micro avg       0.86      0.86      0.86      6396
   macro avg       0.83      0.77      0.79      6396
weighted avg       0.85      0.86      0.85      6396

MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=59, l