# Establish Baseline Model Scores
#### 10/23/2020
---

## Load and Process Dataset
### Import Libraries

In [94]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

In [112]:
wandb.init(project="baseline_1994")



VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_step,8
_runtime,198
_timestamp,1603582046


0,1
_step,▁▂▃▄▅▅▆▇█
_runtime,▁▁▂▂█████
_timestamp,▁▁▂▂█████


### Read in `adult.csv` data

In [71]:
df = pd.read_csv('../UCI dataset/adult.csv').drop(columns=['fnlwgt']).sample(frac=1)
df.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
18814,45,Private,Assoc-voc,11,Divorced,Other-service,Not-in-family,White,Female,0,0,8,United-States,<=50K
12788,29,Private,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Wife,White,Female,0,0,40,Mexico,>50K
9377,57,Self-emp-not-inc,Some-college,10,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,38,United-States,>50K
2294,67,?,Bachelors,13,Married-civ-spouse,?,Husband,White,Male,9386,0,60,United-States,>50K
13584,36,Private,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K


In [72]:
df.isnull().sum()

age               0
workclass         0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [73]:
# df["income"].replace({'<=50K': 0, '>50K': 1}, inplace=True)
# df.head()

In [93]:
X = df.drop(columns=['income'], axis = 1)
y = df[['income']].to_numpy()

In [98]:
labels = np.unique(y)
features = list(X.columns)

### Pre-process categorical variables

In [99]:
cat_var = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
for f in cat_var:
    enc = preprocessing.LabelEncoder()
    X[f] = enc.fit_transform(X[f])

scaler = preprocessing.StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

### Split data into test and training

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [101]:
len(X_test)

6513

## Train Models and Establish Baseline Scores

### Random Forest

In [104]:
# Train model, get predictions
model = RandomForestClassifier()
model.fit(X, y)
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

In [105]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 11784 to 18628
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             26048 non-null  float64
 1   workclass       26048 non-null  float64
 2   education       26048 non-null  float64
 3   education.num   26048 non-null  float64
 4   marital.status  26048 non-null  float64
 5   occupation      26048 non-null  float64
 6   relationship    26048 non-null  float64
 7   race            26048 non-null  float64
 8   sex             26048 non-null  float64
 9   capital.gain    26048 non-null  float64
 10  capital.loss    26048 non-null  float64
 11  hours.per.week  26048 non-null  float64
 12  native.country  26048 non-null  float64
dtypes: float64(13)
memory usage: 2.8 MB
None


In [106]:
# Visualize model performance
wandb.sklearn.plot_classifier(
    model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels,
    is_binary=True, model_name='RandomForest', feature_names=features)

wandb: 
wandb: Plotting RandomForest.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


In [90]:
# visualize model
wandb.sklearn.plot_classifier(model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels, True, 'RandomForest', features)
wandb.log({'roc': wandb.plots.ROC(y_test, y_probas, labels)})
wandb.log({'pr': wandb.plots.precision_recall(y_test, y_probas, labels)})

wandb: 
wandb: Plotting RandomForest.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


### K-Nearest Neighbors

In [110]:
# Train model, get predictions
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X, y)
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
# importances = model.feature_importances_
# indices = np.argsort(importances)[::-1]

In [111]:
# visualize model
wandb.sklearn.plot_classifier(model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels, True, 'KNN', features)
wandb.log({'roc': wandb.plots.ROC(y_test, y_probas, labels)})
wandb.log({'pr': wandb.plots.precision_recall(y_test, y_probas, labels)})

wandb: 
wandb: Plotting KNN.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


### Logistic Regression

In [114]:
# Train model, get predictions
model = LogisticRegression()
model.fit(X, y)
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)

In [115]:
# visualize model
wandb.sklearn.plot_classifier(model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels, True, 'LogisticRegression', features)
wandb.log({'roc': wandb.plots.ROC(y_test, y_probas, labels)})
wandb.log({'pr': wandb.plots.precision_recall(y_test, y_probas, labels)})

wandb: 
wandb: Plotting LogisticRegression.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.
