# Establish Baseline Model Scores
#### 10/23/2020
---

## Load and Process Dataset
### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Start a Weights & Biases run in the "baseline_ALL" project; the
# wandb.sklearn / wandb.log calls in later cells log to this run.
wandb.init(project="baseline_ALL")

wandb: Currently logged in as: apraturu (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.10 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


### Read in `2019_ALL_joins_reduced.csv` data

In [19]:
# Load the dataset and shuffle its rows. The shuffle is seeded so that a
# Restart & Run All reproduces the same row order (and therefore the same
# downstream splits and scores) instead of a fresh random permutation.
df = pd.read_csv('../final_data/2019_ALL_joins_reduced.csv').sample(frac=1, random_state=42)
df.head()

Unnamed: 0,state,person-weight,age,citizenship-status,class-worker,education,marital-status,occupation,race,sex,...,med_gross_rent,unemployment_rate,insured_rate,less_than_9th,hs_grad,associates_degree,bach_degree,grad_or_prof_degree,bach_degree_or_higher,Incarceration rate per 100000
542280,HI,87,28,Born-US,Fed-gov,some-college-great1,Married,"MIL-Military, Rank Not Specified",white-alone,female,...,2281,2.5,97.4,2.7,23.7,13.5,22.4,10.8,33.2,417.040052
185424,NY,176,70,naturalized-citizen,Local-gov,HSgrad,Married,MGR-Other Managers,white-alone,male,...,1507,3.4,88.7,12.3,33.6,7.6,12.8,8.1,20.9,491.823193
1075329,VA,28,29,Born-US,Private-prof,associate,Never-married,HLS-Home Health Aides,white-alone,female,...,992,3.4,92.5,5.1,37.3,7.6,13.1,7.5,20.6,815.395629
374519,TX,271,60,Born-US,Self-emp-inc,bachelor,Married,MGR-Other Managers,white-alone,male,...,1327,3.2,92.1,0.5,11.0,6.2,36.2,22.3,58.5,1063.428253
920201,NY,298,55,not-US-citizen,Private-prof,HSgrad,Married,TRN-Driver/Sales Workers And Truck Drivers,black-aa-alone,male,...,1507,3.4,88.7,12.3,33.6,7.6,12.8,8.1,20.9,491.823193


In [4]:
# Bucket the numeric income at 50,000 into the two class labels. Rows whose
# income is missing stay missing (they match neither bucket).
income_band = (
    df['income']
    .gt(50000)
    .map({True: '>50K', False: '<=50K'})
    .where(df['income'].notna())
)

# Swap the numeric column for the label column (placed last), and mark
# missing degree information with an explicit 'Unknown' category.
# The numeric education-share columns (less_than_9th, hs_grad, ...) are not
# imputed here, and rows containing NaN are not dropped.
df = (
    df.drop(columns=['income'])
    .assign(income=income_band)
    .fillna({'field-of-degree': 'Unknown', 'stem-degree': 'Unknown'})
)

df.head()

Unnamed: 0,Geographic-division,region,state,RT,person-weight,age,citizenship-status,class-worker,education,marital-status,...,Incarceration rate: White alone,Incarceration rate: Black or African American alone,Incarceration rate: American Indian and Alaska Native alone,Incarceration rate: Asian alone,Incarceration rate: Native Hawaiian and other Pacific Islander alone,Incarceration rate: Some other race alone,Incarceration rate: Two or more races,Incarceration rate: Hispanic or Latino,"Incarceration rate: White alone, not Hispanic or Latino",income
106168,Pacific,West,CA,P,114,23,Born-US,Private-prof,HSgrad,Never-married,...,605.996084,3036.268547,996.414012,94.424879,617.788428,703.884463,188.775488,756.679936,453.449136,<=50K
1445032,West South Central,South,TX,P,70,68,naturalized-citizen,Self-emp-not-inc,bachelor,Married,...,843.762174,2854.646835,584.891093,172.611124,323.236055,1079.147366,278.644656,971.734147,768.328062,>50K
750930,New England,Northeast,MA,P,59,49,Born-US,Private-nonprof,master,Never-married,...,310.660339,1502.308942,981.432361,64.328355,719.748088,387.349214,111.62596,927.581119,241.032739,>50K
794855,East North Central,Midwest,MI,P,30,46,Born-US,Private-prof,HSgrad,Divorced,...,388.319031,2168.510714,924.089216,89.42103,268.817204,170.714621,161.080936,588.278432,373.754663,<=50K
956655,Middle Atlantic,Northeast,NJ,P,69,39,Born-US,Self-emp-not-inc,master,Married,...,274.710876,1991.905885,726.934472,50.983429,1577.390733,451.295464,312.522108,608.946824,218.298491,<=50K


In [5]:
df.isnull().sum()

Geographic-division                                                     0
region                                                                  0
state                                                                   0
RT                                                                      0
person-weight                                                           0
age                                                                     0
citizenship-status                                                      0
class-worker                                                            0
education                                                               0
marital-status                                                          0
occupation                                                              0
relationship                                                            0
race                                                                    0
sex                                   

In [6]:
# df["income"].replace({'<=50K': 0, '>50K': 1}, inplace=True)
# df.head()

In [7]:
# Feature matrix: every column except the target.
X = df.drop(columns=['income'])
# Target as a 1-D array of shape (n_samples,). Selecting with [['income']]
# produced an (n_samples, 1) column vector, which scikit-learn estimators
# only accept with a DataConversionWarning (hidden here because warnings are
# globally ignored).
y = df['income'].to_numpy()

In [8]:
# Sorted distinct target values, used as class names in reports and plots.
labels = np.unique(y)
# Feature names in column order, passed to the wandb plots.
features = X.columns.tolist()

### Pre-process categorical variables

In [9]:
# Categorical columns that are converted to integer codes before modelling.
cat_var = [
    'Geographic-division',
    'region',
    'state',
    'RT',
    'citizenship-status',
    'class-worker',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'field-of-degree',
    'place-of-birth',
    'stem-degree',
]
# Each column gets its own freshly fitted LabelEncoder.
for col in cat_var:
    X[col] = preprocessing.LabelEncoder().fit_transform(X[col])

# Standardize every column (including the encoded categoricals). The result
# is rebuilt as a DataFrame with the original column names and a fresh
# default index.
# NOTE(review): the scaler is fitted on all of X before the train/test split
# in a later cell, so test rows influence the scaling statistics — consider
# fitting on the training split only.
scaler = preprocessing.StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

### Split data into test and training

In [10]:
# Hold out 20% of the rows for evaluation. The split is seeded so the same
# rows land in train/test on every run and baseline scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Number of held-out rows.
X_test.shape[0]

334460

## Train Models and Establish Baseline Scores

### Random Forest

In [12]:
# Train model, get predictions
# Fit on the training split only: fitting on the full X/y lets the model see
# the test rows, which leaks the answers and inflates every reported score.
# np.ravel guarantees the 1-D target shape scikit-learn expects, and the
# seed makes the forest reproducible between runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
# Feature indices ordered from most to least important.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

In [13]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337840 entries, 233994 to 1122761
Data columns (total 44 columns):
 #   Column                                                                Non-Null Count    Dtype  
---  ------                                                                --------------    -----  
 0   Geographic-division                                                   1337840 non-null  float64
 1   region                                                                1337840 non-null  float64
 2   state                                                                 1337840 non-null  float64
 3   RT                                                                    1337840 non-null  float64
 4   person-weight                                                         1337840 non-null  float64
 5   age                                                                   1337840 non-null  float64
 6   citizenship-status                                                   

In [14]:
# Log the wandb classifier diagnostics for the random forest.
wandb.sklearn.plot_classifier(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred,
    y_probas,
    labels,
    is_binary=True,
    model_name='RandomForest',
    feature_names=features,
)

wandb: 
wandb: Plotting RandomForest.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


### K-Nearest Neighbors

In [110]:
# Train model, get predictions
# Fit on the training split only: a KNN fitted on the full X/y stores the
# test rows as neighbours, so each test point can be matched to itself and
# the reported scores are inflated. np.ravel guarantees the 1-D target shape
# scikit-learn expects.
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)

In [111]:
# Log the wandb classifier diagnostics for KNN, then log the ROC and
# precision-recall plots under the explicit 'roc' and 'pr' keys.
wandb.sklearn.plot_classifier(
    model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels,
    is_binary=True, model_name='KNN', feature_names=features)
wandb.log(
    {'roc': wandb.plots.ROC(y_test, y_probas, labels)}
)
wandb.log(
    {'pr': wandb.plots.precision_recall(y_test, y_probas, labels)}
)

wandb: 
wandb: Plotting KNN.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


### Logistic Regression

In [10]:
# Train model, get predictions
# Fit on the training split only: fitting on the full X/y lets the model see
# the test rows, which leaks the answers and inflates every reported score.
# np.ravel guarantees the 1-D target shape scikit-learn expects.
model = LogisticRegression()
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
# For a binary problem coef_ has shape (1, n_features).
importances = model.coef_
# Rank features by coefficient magnitude, largest first. Sorting the 2-D
# coef_ directly and applying [::-1] only reversed the length-1 row axis, so
# the order was never actually reversed; index row 0 before sorting.
indices = np.argsort(np.abs(importances[0]))[::-1]

In [11]:
# Log the wandb classifier diagnostics for logistic regression.
wandb.sklearn.plot_classifier(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred,
    y_probas,
    labels,
    is_binary=True,
    model_name='LogisticRegression',
    feature_names=features,
)

wandb: 
wandb: Plotting LogisticRegression.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


In [15]:
# Per-class precision / recall / F1 for the current y_pred, i.e. the
# predictions of whichever model cell was run last.
# Passing labels= pins the row order explicitly, so every target_names entry
# describes the class of the same name even if a class happens to be absent
# from y_test or y_pred.
print(classification_report(y_test, y_pred, labels=labels, target_names=labels))

              precision    recall  f1-score   support

       <=50K       0.85      0.94      0.89      4965
        >50K       0.72      0.46      0.56      1548

    accuracy                           0.83      6513
   macro avg       0.78      0.70      0.73      6513
weighted avg       0.82      0.83      0.81      6513

