# Establish Baseline Model Scores
#### 9/12/2020
---

## Load and Process Dataset
### Import Libraries

In [60]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
#from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

### Read in `adult.csv` data

In [40]:
df = pd.read_csv('UCI dataset/adult.csv').sample(frac=1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
12734,42,Private,250536,Some-college,10,Separated,Other-service,Unmarried,Black,Female,0,0,21,Haiti,<=50K
18715,29,Self-emp-not-inc,322238,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,40,United-States,<=50K
29854,32,Private,222548,HS-grad,9,Divorced,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K
14664,22,Private,188274,HS-grad,9,Never-married,Transport-moving,Own-child,White,Male,0,0,40,United-States,<=50K
31805,45,Private,330087,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K


In [41]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

### Split data into test and training

In [42]:
len(df)*.2

6512.200000000001

In [64]:
#subset the data into test and training data and labels
df_test_data = df.drop(columns=['income'])[:6500]
df_test_labels = df[['income']][:6500]
df_train_data = df.drop(columns=['income'])[6500:]
df_train_labels = df[['income']][6500:]

In [44]:
len(df_test_data)

6500

In [45]:
df_test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
12734,42,Private,250536,Some-college,10,Separated,Other-service,Unmarried,Black,Female,0,0,21,Haiti
18715,29,Self-emp-not-inc,322238,HS-grad,9,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,40,United-States
29854,32,Private,222548,HS-grad,9,Divorced,Adm-clerical,Own-child,White,Female,0,0,40,United-States
14664,22,Private,188274,HS-grad,9,Never-married,Transport-moving,Own-child,White,Male,0,0,40,United-States
31805,45,Private,330087,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States


### Pre-process categorical variables

In [66]:
cat_var = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
for f in cat_var:
    enc = preprocessing.LabelEncoder()
    df_train_data[f] = enc.fit_transform(df_train_data[f])
    df_test_data[f] = enc.transform(df_test_data[f])

### Scale Features

In [73]:
scaler = preprocessing.StandardScaler()
df_train_data = pd.DataFrame(scaler.fit_transform(df_train_data), columns = df_train_data.columns)
df_test_data = pd.DataFrame(scaler.transform(df_test_data), columns = df_train_data.columns)

## Train Models and Establish Baseline Scores

### K-Nearest Neighbors

In [74]:
results = []
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(df_train_data, df_train_labels)
    acc = knn_model.score(df_test_data, df_test_labels)
    results.append([k, acc])

In [75]:
results

[[1, 0.7964615384615384],
 [2, 0.8124615384615385],
 [3, 0.820923076923077],
 [4, 0.8226153846153846],
 [5, 0.825076923076923],
 [6, 0.8306153846153846],
 [7, 0.8303076923076923],
 [8, 0.831076923076923],
 [9, 0.831076923076923]]

### Multinomial NB

In [87]:
# results = []
# for a in [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 100.0]:
#   mnb_model = MultinomialNB(alpha=a)
#   mnb_model.fit(df_train_data, df_train_labels)
#   acc = mnb_model.score(df_test_data, df_test_labels)
#   results.append([a, acc])

In [50]:
results

[[0.0, 0.7806153846153846],
 [0.0001, 0.7806153846153846],
 [0.001, 0.7806153846153846],
 [0.01, 0.7806153846153846],
 [0.1, 0.7806153846153846],
 [0.5, 0.7806153846153846],
 [1.0, 0.7806153846153846],
 [2.0, 0.7806153846153846],
 [10.0, 0.7806153846153846],
 [100.0, 0.7806153846153846]]

### Bernoulli NB

In [77]:
results = []
for a in [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 100.0]:
  bnb_model = BernoulliNB(alpha=a)
  bnb_model.fit(df_train_data, df_train_labels)
  acc = bnb_model.score(df_test_data, df_test_labels)
  results.append([a, acc])

In [78]:
results

[[0.0, 0.8096923076923077],
 [0.0001, 0.8096923076923077],
 [0.001, 0.8096923076923077],
 [0.01, 0.8096923076923077],
 [0.1, 0.8096923076923077],
 [0.5, 0.8096923076923077],
 [1.0, 0.8096923076923077],
 [2.0, 0.8096923076923077],
 [10.0, 0.8095384615384615],
 [100.0, 0.8084615384615385]]

### Linear Regression

In [79]:
df_train_label_dummy = pd.get_dummies(df_train_labels['income'], prefix='income')
df_test_label_dummy = pd.get_dummies(df_test_labels['income'], prefix='income')

In [80]:
linear_model = LinearRegression()
linear_model.fit(df_train_data, df_train_label_dummy)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [81]:
lm_predicted = linear_model.predict(df_test_data)
r2 = linear_model.score(df_test_data, df_test_label_dummy)
r2

0.25121586191717027

In [82]:
def get_linear_model_accuracy(predicted, actual):
    df_predicted = pd.DataFrame(columns = actual.columns)
    results = []
    for i in range(len(predicted)):
        it = predicted[i]
        add = [0, 0]
        add[it.argmax()] = 1
        df_predicted.loc[i] = add
  
    for r in range(len(df_predicted)):
        results.append(np.array_equal(df_predicted.iloc[r].values, actual.iloc[r].values))
  
    return(results.count(True) / len(results))

In [83]:
get_linear_model_accuracy(lm_predicted, df_test_label_dummy)

0.804923076923077

### Logistic Regression

In [84]:
results = []
for c in [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 100.0]:
    log_model = LogisticRegression(C=c)
    log_model.fit(df_train_data, df_train_labels)
    acc = log_model.score(df_test_data, df_test_labels)
    results.append([c, acc])

In [85]:
results

[[0.0001, 0.7698461538461538],
 [0.001, 0.8084615384615385],
 [0.01, 0.8207692307692308],
 [0.1, 0.8221538461538461],
 [0.5, 0.8223076923076923],
 [1.0, 0.8223076923076923],
 [2.0, 0.8221538461538461],
 [10.0, 0.8221538461538461],
 [100.0, 0.8221538461538461]]

## Feature Reduction

### PCA

In [86]:
pca = PCA()
x = pca.fit_transform(df_train_data)
pca.explained_variance_ratio_

array([0.1507991 , 0.1006954 , 0.09013516, 0.08067175, 0.07735078,
       0.07343012, 0.06785188, 0.06629245, 0.06139593, 0.06026377,
       0.05345741, 0.04849914, 0.04191859, 0.02723853])

### Try KNN after dropping last column

In [90]:
df_train_data_red = df_train_data.drop(columns=['native.country'])
df_test_data_red = df_test_data.drop(columns=['native.country'])

In [91]:
results = []
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(df_train_data_red, df_train_labels)
    acc = knn_model.score(df_test_data_red, df_test_labels)
    results.append([k, acc])

In [92]:
results

[[1, 0.7967692307692308],
 [2, 0.8124615384615385],
 [3, 0.822],
 [4, 0.825076923076923],
 [5, 0.8272307692307692],
 [6, 0.8312307692307692],
 [7, 0.8318461538461538],
 [8, 0.8292307692307692],
 [9, 0.8324615384615385]]

### Drop last two columns

In [93]:
df_train_data_red = df_train_data.drop(columns=['native.country', 'hours.per.week'])
df_test_data_red = df_test_data.drop(columns=['native.country', 'hours.per.week'])

In [94]:
results = []
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(df_train_data_red, df_train_labels)
    acc = knn_model.score(df_test_data_red, df_test_labels)
    results.append([k, acc])

In [95]:
results

[[1, 0.7984615384615384],
 [2, 0.8187692307692308],
 [3, 0.8236923076923077],
 [4, 0.8264615384615385],
 [5, 0.8304615384615385],
 [6, 0.8306153846153846],
 [7, 0.8296923076923077],
 [8, 0.8301538461538461],
 [9, 0.8303076923076923]]