# Establish Baseline Model Scores
#### 10/23/2020
---

## Load and Process Dataset
### Import Libraries

In [14]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

In [2]:
wandb.init(project="baseline_1994")

wandb: Currently logged in as: apraturu (use `wandb login --relogin` to force relogin)


### Read in `adult.csv` data

In [3]:
df = pd.read_csv('../UCI dataset/adult.csv').drop(columns=['fnlwgt']).sample(frac=1)
df.head()

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
30032,45,Self-emp-inc,Bachelors,13,Divorced,Exec-managerial,Unmarried,White,Male,0,0,50,United-States,>50K
700,50,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1902,50,United-States,>50K
22036,32,Local-gov,Bachelors,13,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,<=50K
31272,32,Private,Some-college,10,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,60,United-States,<=50K
15033,17,Private,11th,7,Never-married,Adm-clerical,Own-child,White,Female,0,0,32,United-States,<=50K


In [4]:
df.isnull().sum()

age               0
workclass         0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [5]:
# df["income"].replace({'<=50K': 0, '>50K': 1}, inplace=True)
# df.head()

In [6]:
X = df.drop(columns=['income'], axis = 1)
y = df[['income']].to_numpy()

In [7]:
labels = np.unique(y)
features = list(X.columns)

### Pre-process categorical variables

In [8]:
cat_var = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
for f in cat_var:
    enc = preprocessing.LabelEncoder()
    X[f] = enc.fit_transform(X[f])

scaler = preprocessing.StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

### Split data into test and training

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
len(X_test)

6513

## Train Models and Establish Baseline Scores

### Logistic Regression

In [11]:
# Train model, get predictions
model = LogisticRegression()
model.fit(X, y)
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
importances = model.coef_
indices = np.argsort(importances)[::-1]

In [12]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 24672 to 2667
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             26048 non-null  float64
 1   workclass       26048 non-null  float64
 2   education       26048 non-null  float64
 3   education.num   26048 non-null  float64
 4   marital.status  26048 non-null  float64
 5   occupation      26048 non-null  float64
 6   relationship    26048 non-null  float64
 7   race            26048 non-null  float64
 8   sex             26048 non-null  float64
 9   capital.gain    26048 non-null  float64
 10  capital.loss    26048 non-null  float64
 11  hours.per.week  26048 non-null  float64
 12  native.country  26048 non-null  float64
dtypes: float64(13)
memory usage: 2.8 MB
None


In [13]:
# Visualize model performance
wandb.sklearn.plot_classifier(
    model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels,
    is_binary=True, model_name='LogisticRegression', feature_names=features)

wandb: 
wandb: Plotting RandomForest.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


### K-Nearest Neighbors

In [110]:
# Train model, get predictions
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X, y)
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
# importances = model.feature_importances_
# indices = np.argsort(importances)[::-1]

In [111]:
# visualize model
wandb.sklearn.plot_classifier(model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels, True, 'KNN', features)
wandb.log({'roc': wandb.plots.ROC(y_test, y_probas, labels)})
wandb.log({'pr': wandb.plots.precision_recall(y_test, y_probas, labels)})

wandb: 
wandb: Plotting KNN.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


### Logistic Regression

In [10]:
# Train model, get predictions
model = LogisticRegression()
model.fit(X, y)
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
importances = model.coef_
indices = np.argsort(importances)[::-1]

In [11]:
# Visualize model performance
wandb.sklearn.plot_classifier(
    model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels,
    is_binary=True, model_name='LogisticRegression', feature_names=features)

wandb: 
wandb: Plotting LogisticRegression.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


In [15]:
print(classification_report(y_test, y_pred, target_names=labels))

              precision    recall  f1-score   support

       <=50K       0.85      0.94      0.89      4965
        >50K       0.72      0.46      0.56      1548

    accuracy                           0.83      6513
   macro avg       0.78      0.70      0.73      6513
weighted avg       0.82      0.83      0.81      6513

