# Establish Baseline Model Scores
#### 9/12/2020
---

## Load and Process Dataset
### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

### Read in `adult.csv` data

In [3]:
df = pd.read_csv('../UCI dataset/adult.csv').drop(columns=['fnlwgt']).sample(frac=1)
df.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
11521,41,Private,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
2802,32,Private,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,7298,0,40,United-States,>50K
2104,46,Self-emp-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,Amer-Indian-Eskimo,Female,15024,0,40,United-States,>50K
11151,30,Private,Some-college,10,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,44,United-States,<=50K
22701,47,Private,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,?,>50K


In [4]:
df.isnull().sum()

age               0
workclass         0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

### Split data into test and training

In [5]:
len(df)*.2

6512.200000000001

In [6]:
#subset the data into test and training data and labels
df_test_data = df.drop(columns=['income'])[:6500]
df_test_labels = df[['income']][:6500]
df_train_data = df.drop(columns=['income'])[6500:]
df_train_labels = df[['income']][6500:]

In [7]:
len(df_test_data)

6500

In [8]:
df_test_data.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
11521,41,Private,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States
2802,32,Private,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,7298,0,40,United-States
2104,46,Self-emp-inc,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,Amer-Indian-Eskimo,Female,15024,0,40,United-States
11151,30,Private,Some-college,10,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,44,United-States
22701,47,Private,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,?


### Pre-process categorical variables

In [9]:
cat_var = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
for f in cat_var:
    enc = preprocessing.LabelEncoder()
    df_train_data[f] = enc.fit_transform(df_train_data[f])
    df_test_data[f] = enc.transform(df_test_data[f])

In [10]:
scaler = preprocessing.StandardScaler()
df_train_data = pd.DataFrame(scaler.fit_transform(df_train_data), columns = df_train_data.columns)
df_test_data = pd.DataFrame(scaler.transform(df_test_data), columns = df_train_data.columns)

## Train Models and Establish Baseline Scores

### K-Nearest Neighbors

In [11]:
results = []
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(df_train_data, df_train_labels)
    acc = knn_model.score(df_test_data, df_test_labels)
    results.append([k, acc])

In [12]:
results

[[1, 0.8095384615384615],
 [2, 0.8212307692307692],
 [3, 0.8306153846153846],
 [4, 0.8303076923076923],
 [5, 0.8307692307692308],
 [6, 0.8363076923076923],
 [7, 0.8355384615384616],
 [8, 0.8383076923076923],
 [9, 0.8383076923076923]]

### Multinomial NB

In [13]:
# results = []
# for a in [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 100.0]:
#   mnb_model = MultinomialNB(alpha=a)
#   mnb_model.fit(df_train_data, df_train_labels)
#   acc = mnb_model.score(df_test_data, df_test_labels)
#   results.append([a, acc])

In [14]:
results

[[1, 0.8095384615384615],
 [2, 0.8212307692307692],
 [3, 0.8306153846153846],
 [4, 0.8303076923076923],
 [5, 0.8307692307692308],
 [6, 0.8363076923076923],
 [7, 0.8355384615384616],
 [8, 0.8383076923076923],
 [9, 0.8383076923076923]]

### Bernoulli NB

In [15]:
results = []
for a in [0.0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 100.0]:
  bnb_model = BernoulliNB(alpha=a)
  bnb_model.fit(df_train_data, df_train_labels)
  acc = bnb_model.score(df_test_data, df_test_labels)
  results.append([a, acc])

In [16]:
results

[[0.0, 0.8053846153846154],
 [0.0001, 0.8053846153846154],
 [0.001, 0.8053846153846154],
 [0.01, 0.8053846153846154],
 [0.1, 0.8053846153846154],
 [0.5, 0.8053846153846154],
 [1.0, 0.8053846153846154],
 [2.0, 0.8053846153846154],
 [10.0, 0.8052307692307692],
 [100.0, 0.804923076923077]]

### Linear Regression

In [17]:
df_train_label_dummy = pd.get_dummies(df_train_labels['income'], prefix='income')
df_test_label_dummy = pd.get_dummies(df_test_labels['income'], prefix='income')

In [18]:
linear_model = LinearRegression()
linear_model.fit(df_train_data, df_train_label_dummy)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
lm_predicted = linear_model.predict(df_test_data)
r2 = linear_model.score(df_test_data, df_test_label_dummy)
r2

0.270709359911425

In [20]:
def get_linear_model_accuracy(predicted, actual):
    df_predicted = pd.DataFrame(columns = actual.columns)
    results = []
    for i in range(len(predicted)):
        it = predicted[i]
        add = [0, 0]
        add[it.argmax()] = 1
        df_predicted.loc[i] = add
  
    for r in range(len(df_predicted)):
        results.append(np.array_equal(df_predicted.iloc[r].values, actual.iloc[r].values))
  
    return(results.count(True) / len(results))

In [21]:
get_linear_model_accuracy(lm_predicted, df_test_label_dummy)

0.8067692307692308

### Logistic Regression

In [22]:
results = []
for c in [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 100.0]:
    log_model = LogisticRegression(C=c)
    log_model.fit(df_train_data, df_train_labels)
    acc = log_model.score(df_test_data, df_test_labels)
    results.append([c, acc])

In [23]:
results

[[0.0001, 0.7673846153846153],
 [0.001, 0.812923076923077],
 [0.01, 0.823076923076923],
 [0.1, 0.8252307692307692],
 [0.5, 0.8252307692307692],
 [1.0, 0.8253846153846154],
 [2.0, 0.8253846153846154],
 [10.0, 0.8252307692307692],
 [100.0, 0.8252307692307692]]

## Feature Reduction

### PCA

In [24]:
pca = PCA()
x = pca.fit_transform(df_train_data)
pca.explained_variance_ratio_

array([0.16251784, 0.1080403 , 0.09648913, 0.08649965, 0.08014672,
       0.07857995, 0.07163409, 0.06598222, 0.06509081, 0.0573466 ,
       0.05276388, 0.04529701, 0.02961181])

### Try KNN after dropping last column

In [25]:
df_train_data_red = df_train_data.drop(columns=['native.country'])
df_test_data_red = df_test_data.drop(columns=['native.country'])

In [26]:
results = []
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(df_train_data_red, df_train_labels)
    acc = knn_model.score(df_test_data_red, df_test_labels)
    results.append([k, acc])

In [27]:
results

[[1, 0.8078461538461539],
 [2, 0.8224615384615385],
 [3, 0.8301538461538461],
 [4, 0.8301538461538461],
 [5, 0.8329230769230769],
 [6, 0.8383076923076923],
 [7, 0.838],
 [8, 0.8398461538461538],
 [9, 0.8378461538461538]]

### Drop last two columns

In [28]:
df_train_data_red = df_train_data.drop(columns=['native.country', 'hours.per.week'])
df_test_data_red = df_test_data.drop(columns=['native.country', 'hours.per.week'])

In [29]:
results = []
for k in [1, 2, 3, 4, 5, 6, 7, 8, 9]:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(df_train_data_red, df_train_labels)
    acc = knn_model.score(df_test_data_red, df_test_labels)
    results.append([k, acc])

In [30]:
results

[[1, 0.7978461538461539],
 [2, 0.8186153846153846],
 [3, 0.8253846153846154],
 [4, 0.8301538461538461],
 [5, 0.8298461538461539],
 [6, 0.8375384615384616],
 [7, 0.8356923076923077],
 [8, 0.8378461538461538],
 [9, 0.8386153846153847]]