In [2]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Create a list of column names
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 
        'capital-loss', 'hours-per-week', 'native-country', 'target']

# read the raw data from the source into a DataFrame
df_raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None, names = cols)

# encode character string categorical data into numeric data using one-hot encoding
dummy_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

df_encoded = df_raw

for c in dummy_cols:
   df_encoded = pd.concat([df_encoded, pd.get_dummies(df_encoded[c], drop_first=True)], axis=1)

df_encoded.drop(dummy_cols, axis = 1, inplace = True)


# encode the target column
df_encoded.loc[df_encoded['target'] == ' <=50K', 'target'] = 0
df_encoded.loc[df_encoded['target'] == ' >50K', 'target'] = 1

# separate features and target
X_train = df_encoded.drop(['target'], axis = 1)
y_train = df_encoded['target']

print(X_train.shape)
print(y_train.shape)

(32561, 100)
(32561,)


In [23]:
# Create a list of column names
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 
        'capital-loss', 'hours-per-week', 'native-country', 'target']

# read the test data from the source into a DataFrame
df_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', header=None, names = cols, skiprows=[0])

# encode character string categorical data into numeric data using one-hot encoding
dummy_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]

df_encoded = df_test

for c in dummy_cols:
   df_encoded = pd.concat([df_encoded, pd.get_dummies(df_encoded[c], drop_first=True)], axis=1)

df_encoded.drop(dummy_cols, axis = 1, inplace = True)

# encode the target column
df_encoded.loc[df_encoded['target'] == ' <=50K.', 'target'] = 0
df_encoded.loc[df_encoded['target'] == ' >50K.', 'target'] = 1

# this is necessary because the training data contains this country, and the test data does not
df_encoded[' Holand-Netherlands'] = 0

# Separate features and target
X_test = df_encoded.drop(['target'], axis = 1)
y_test = df_encoded['target']

print(X_test.shape)
print(y_test.shape)

(16281, 100)
(16281,)


In [25]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 100 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   age                          32561 non-null  int64
 1   fnlwgt                       32561 non-null  int64
 2   education-num                32561 non-null  int64
 3   capital-gain                 32561 non-null  int64
 4   capital-loss                 32561 non-null  int64
 5   hours-per-week               32561 non-null  int64
 6    Federal-gov                 32561 non-null  uint8
 7    Local-gov                   32561 non-null  uint8
 8    Never-worked                32561 non-null  uint8
 9    Private                     32561 non-null  uint8
 10   Self-emp-inc                32561 non-null  uint8
 11   Self-emp-not-inc            32561 non-null  uint8
 12   State-gov                   32561 non-null  uint8
 13   Without-pay                 32561 non-null  

In [26]:
X_train.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,Federal-gov,Local-gov,Never-worked,Private,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#ensure type is integer
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [30]:
# logistic regression

lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

lr_score = lr.score(X_test, y_test)
lr_predictions = lr.predict(X_test)

print('Accuracy of Logistic Regression: {:.3f}'.format(lr_score))

# svm

svm = LinearSVC(max_iter=10000)
svm.fit(X_train, y_train)
svm_score = svm.score(X_test, y_test)
svm_predictions = svm.predict(X_test)

print('Accuracy of SVM: {:.3f}'.format(svm_score))

# knn

# k = 7
knn = KNeighborsClassifier(n_neighbors=7)

# fit the model
knn.fit(X_train, y_train)

# how well did it do?
knn_7_score = knn.score(X_test, y_test)
knn_7_predictions = knn.predict(X_test)

print('Accuracy of KNN (k = 7): {:.3f}'.format(knn_7_score))

# k = 7
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model
knn.fit(X_train, y_train)

# how well did it do?
knn_5_score = knn.score(X_test, y_test)
knn_5_predictions = knn.predict(X_test)

print('Accuracy of KNN (k = 5): {:.3f}'.format(knn_5_score))

# descision tree

dt = DecisionTreeClassifier()

dt.fit(X_train,y_train)

dt_score = dt.score(X_test, y_test)
dt_predictions = dt.predict(X_test)
print('Accuracy of Decision Tree: {:.3f} '.format(dt_score))

# random forest

rf = RandomForestClassifier(n_estimators = 22, random_state = 40)

rf.fit(X_train,y_train)

rf_score = rf.score(X_test, y_test)
rf_predictions = rf.predict(X_test)

print('Accuracy of Random Forest: {:.3f}'.format(rf_score))


Accuracy of Logistic Regression: 0.798




Accuracy of SVM: 0.774
Accuracy of KNN (k = 7): 0.785
Accuracy of KNN (k = 5): 0.777
Accuracy of Decision Tree: 0.799 
Accuracy of Random Forest: 0.856


In [31]:
predictions_dictionary = {'Logistic Regression' : lr_predictions,
                          'KNN_7' : knn_7_predictions, 
                          'KNN_5': knn_5_predictions,
                          'SVM' : svm_predictions,
                          'Decision Tree' : dt_predictions, 
                          'Random Forest' : rf_predictions,
                          'Actual': y_test}

predictions_df = pd.DataFrame(predictions_dictionary)
predictions_df

Unnamed: 0,Logistic Regression,KNN_7,KNN_5,SVM,Decision Tree,Random Forest,Actual
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,1,1,1
3,1,1,1,0,1,1,1
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
16276,0,0,0,0,0,0,0
16277,0,0,0,0,0,0,0
16278,0,0,0,0,1,1,0
16279,1,1,1,0,0,0,0
