In [19]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.utils import np_utils

# Read dataset to pandas dataframe
dataset = pd.read_csv("data.csv")
dataset.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [20]:
#How to pre-process the data?
# fix dataset
# 0,4,9,10,11 is continous so select only descrete columns
cat_columns = []
for i in [1,2,4,5,6,7,8,12,13]:
    cat_columns.append(dataset.columns[i])


In [21]:
# replace empty values
dataset = dataset.replace(" ?", pd.NaT)
dataset.dropna(inplace=True)
#dataset

In [22]:
for i in cat_columns:
    dataset[i] = dataset[i].astype('category')
    
 # represet them using numerical values
for i in cat_columns:
    dataset[i] = pd.Categorical(dataset[i]).codes
    
dataset.dtypes

age                int64
 workclass          int8
 education          int8
 education-num     int64
 marital-status     int8
 occupation         int8
 relationship       int8
 race               int8
 sex                int8
 capital-gain      int64
 capital-loss      int64
 hours-per-week    int64
 native-country     int8
 income             int8
dtype: object

In [23]:
X = np.zeros((13, len(dataset)))
y = np.array(dataset[dataset.columns[13]])


In [24]:
#assign values to the array
for i in range(len(dataset.columns)-1):
    X[i] = np.array(dataset[dataset.columns[i]])
X = X.transpose()

In [42]:
# How to handle the over-fitting problem?
# setting testing and training data limit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# K Nearest Neighbor

In [43]:
# training and predictions
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [44]:
# predict test data
y_pred = classifier.predict(X_test)
print(y_pred)


[1 0 0 ... 0 0 0]


In [45]:
# How to compare the designed machine learning models?
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[4117  416]
 [ 564  936]]
             precision    recall  f1-score   support

          0       0.88      0.91      0.89      4533
          1       0.69      0.62      0.66      1500

avg / total       0.83      0.84      0.83      6033



# Logistic Regression 

In [46]:
# configs
epochs = 100
batch_size = 88
output_classes = 2

y_train = np_utils.to_categorical(y_train, output_classes)
y_test = np_utils.to_categorical(y_test, output_classes)


In [47]:
# Set up the logistic regression model
model = Sequential()
model.add(Dense(2, input_dim=(13), kernel_initializer='normal', activation='softmax'))
model.compile(optimizer=SGD(lr=0.05), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
print(model.summary())
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 2)                 28        
Total params: 28
Trainable params: 28
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 2)                 28        
Total params: 28
Trainable params: 28
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
E

In [48]:
evaluation = model.evaluate(X_test, y_test, verbose=1)
print(evaluation[1])


0.7513674788761154


# Neural Network

In [49]:
y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

In [50]:
y_test

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [51]:
X_test

array([[49.,  2., 12., ...,  0., 45., 38.],
       [22.,  2., 15., ...,  0., 40., 38.],
       [17.,  2.,  1., ...,  0., 15., 38.],
       ...,
       [36.,  1.,  8., ...,  0., 40., 38.],
       [33.,  2.,  1., ...,  0., 40., 38.],
       [39.,  2., 15., ...,  0., 40., 38.]])

In [52]:
# configs
epochs = 50
batch_size = 88
output_classes = 2
layer1 = 13
layer2 = 12
layer3 = 20

# Set up the logistic regression model
model = Sequential()
model.add(Dense(layer2, input_dim=layer1, activation='relu'))
model.add(Dense(layer3, activation='relu'))
model.add(Dense(output_classes, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x5a0386ba20>

In [53]:
evaluation = model.evaluate(X_test, y_test, verbose=1)



In [54]:
evaluation[1]

0.7849328692686928