In [5]:
#import 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [6]:
adults = pd.read_csv('adult.data.csv',names=['Age','workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','label'])
adults_test = pd.read_csv('adult.test.csv',names=['Age','workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','label'])

In [7]:
adults.head()

Unnamed: 0,Age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
adults_test.head()

Unnamed: 0,Age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,label
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [9]:
train_data = adults.drop('label',axis=1)

test_data = adults_test.drop('label',axis=1)

data = train_data.append(test_data)

label = adults['label'].append(adults_test['label'])

In [10]:
data.head()

Unnamed: 0,Age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba


In [11]:
full_dataset = adults.append(adults_test)

In [12]:
data_binary = pd.get_dummies(data)

data_binary.head()

Unnamed: 0,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,Age_17,Age_18,Age_19,Age_20,Age_21,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,77516.0,13.0,2174.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,83311.0,13.0,0.0,0.0,13.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,215646.0,9.0,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,234721.0,7.0,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,338409.0,13.0,0.0,0.0,40.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
x_train, x_test, y_train, y_test = train_test_split(data_binary, label, test_size=0.3)

In [14]:
models = []

models.append(("Logistic Regression:",LogisticRegression()))
models.append(("Naive Bayes Gaussian:",GaussianNB()))
models.append(("Naive Bayes Bernouli:",BernoulliNB()))
models.append(("Naive Bayes Multinomial:",MultinomialNB()))

In [15]:
results = []
names = []
for name,model in models:
    #kfold = KFold(n_splits=10, random_state=0)
    accuracy = cross_val_score(model,x_train,y_train,scoring = "accuracy")
    names.append(name)
    results.append(accuracy)
    
for i in range(len(names)):
    print(names[i], results[i].mean() * 100)



Logistic Regression: 59.5264809819995
Naive Bayes Gaussian: 52.7756943312304
Naive Bayes Bernouli: 76.7709879045201
Naive Bayes Multinomial: 39.59854588912319
