# Run Models on 2019 Data with Additional Variables
#### 10/23/2020
---

## Load and Process Dataset
### Import Libraries

In [15]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initialise Weights & Biases for this notebook under the "2019_newvar" project;
# the wandb plotting call further down logs its charts through this session.
wandb.init(project="2019_newvar")

wandb: Currently logged in as: apraturu (use `wandb login --relogin` to force relogin)


### Read in the processed 2019 data (`2019_new-var_mapped_processed.csv`)

In [3]:
# Load the processed 2019 dataset and shuffle its rows.
# `sample(frac=1)` returns every row in random order; pinning `random_state`
# makes that order (and the preview below) identical on every
# Restart Kernel -> Run All instead of changing from run to run.
df = (
    pd.read_csv('../final_data/2019_new-var_mapped_processed.csv')
    .sample(frac=1, random_state=42)
)
df.head()

Unnamed: 0,Geographic-division,area-code,region,state,RT,person-weight,age,citizenship-status,class-worker,education,...,occupation,relationship,race,sex,extra-income,hours-per-week,field-of-degree,place-of-birth,income,stem-degree
1225619,Pacific,1316,West,OR,P,72,41,Born-US,Private-prof,HSgrad,...,PRD-Sewing Machine Operators,20,white-alone,female,4900.0,40.0,,Oregon/OR,50900.0,
97505,Pacific,8302,West,CA,P,122,59,naturalized-citizen,State-gov,bachelor,...,MED-Licensed Practical And Licensed Vocational...,20,asian-alone,male,0.0,40.0,Nursing,Philippines,65000.0,No
120849,Pacific,3765,West,CA,P,56,45,Born-US,Local-gov,master,...,EDU-Secondary School Teachers,20,white-alone,female,50.0,40.0,English Language And Literature,Maryland/MD,100050.0,No
1107845,South Atlantic,1208,South,NC,P,85,23,Born-US,Private-prof,HSgrad,...,CON-Roofers,30,black-aa-alone,male,0.0,40.0,,North Carolina/NC,40000.0,
1146847,East North Central,4603,Midwest,OH,P,98,19,Born-US,Private-prof,HSgrad,...,SAL-Cashiers,33,black-aa-alone,female,0.0,25.0,,Ohio/OH,2800.0,


In [4]:
# Turn the numeric income into a two-class label with a 50,000 threshold.
# Rows matching neither mask (i.e. missing income) stay NaN at this point.
at_most_50k = df['income'] <= 50000
above_50k = df['income'] > 50000
df.loc[at_most_50k, 'income_adjusted'] = '<=50K'
df.loc[above_50k, 'income_adjusted'] = '>50K'

# Swap the numeric column out for the label, keeping the name `income`.
df = df.drop(columns=['income'])
df = df.rename(columns={'income_adjusted': 'income'})

# Every remaining missing value, in any column, becomes the literal 'Unknown'.
df = df.fillna('Unknown')

df.head()

Unnamed: 0,Geographic-division,area-code,region,state,RT,person-weight,age,citizenship-status,class-worker,education,...,occupation,relationship,race,sex,extra-income,hours-per-week,field-of-degree,place-of-birth,stem-degree,income
1225619,Pacific,1316,West,OR,P,72,41,Born-US,Private-prof,HSgrad,...,PRD-Sewing Machine Operators,20,white-alone,female,4900.0,40.0,Unknown,Oregon/OR,Unknown,>50K
97505,Pacific,8302,West,CA,P,122,59,naturalized-citizen,State-gov,bachelor,...,MED-Licensed Practical And Licensed Vocational...,20,asian-alone,male,0.0,40.0,Nursing,Philippines,No,>50K
120849,Pacific,3765,West,CA,P,56,45,Born-US,Local-gov,master,...,EDU-Secondary School Teachers,20,white-alone,female,50.0,40.0,English Language And Literature,Maryland/MD,No,>50K
1107845,South Atlantic,1208,South,NC,P,85,23,Born-US,Private-prof,HSgrad,...,CON-Roofers,30,black-aa-alone,male,0.0,40.0,Unknown,North Carolina/NC,Unknown,<=50K
1146847,East North Central,4603,Midwest,OH,P,98,19,Born-US,Private-prof,HSgrad,...,SAL-Cashiers,33,black-aa-alone,female,0.0,25.0,Unknown,Ohio/OH,Unknown,<=50K


In [5]:
# Per-column count of missing values; all zeros are expected after the fill above.
df.isna().sum()

Geographic-division    0
area-code              0
region                 0
state                  0
RT                     0
person-weight          0
age                    0
citizenship-status     0
class-worker           0
education              0
marital-status         0
occupation             0
relationship           0
race                   0
sex                    0
extra-income           0
hours-per-week         0
field-of-degree        0
place-of-birth         0
stem-degree            0
income                 0
dtype: int64

In [6]:
# Features: every column except the target.
X = df.drop(columns=['income'])
# Target: kept as a single-column 2-D array (n_rows, 1), as before.
y = df.loc[:, ['income']].to_numpy()

In [7]:
# Sorted distinct target values, used later as class names.
labels = np.unique(y)
# Feature names in column order.
features = X.columns.tolist()

### Pre-process categorical variables

In [8]:
# Columns treated as categorical and replaced by integer codes.
cat_var = [
    'Geographic-division', 'area-code', 'region', 'state', 'RT',
    'citizenship-status', 'class-worker', 'education', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'field-of-degree',
    'place-of-birth', 'stem-degree',
]

# A fresh LabelEncoder per column: each column gets its own code mapping.
for column in cat_var:
    enc = preprocessing.LabelEncoder()
    encoded_values = enc.fit_transform(X[column])
    X[column] = encoded_values

# Standardise every feature and rebuild a DataFrame with the same column names.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
scaler = preprocessing.StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

### Split data into test and training

In [9]:
# Hold out 20% of the rows for evaluation.
# A fixed `random_state` makes the partition, and therefore every metric
# reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Number of rows in the held-out test set.
X_test.shape[0]

334460

## Train Models and Establish Baseline Scores

### Logistic Regression

In [11]:
# Train model, get predictions
model = LogisticRegression()
model.fit(X, y)
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
importances = model.coef_
indices = np.argsort(importances)[::-1]

In [12]:
# Show the fitted logistic-regression coefficients (`model.coef_`).
importances

array([[-0.05479919,  0.08587449,  0.0568646 , -0.01906436,  0.        ,
        -0.0201033 ,  0.4745447 , -0.13005679, -0.15901108,  0.13087329,
        -0.12714358, -0.13297093, -0.53083999,  0.03172119,  0.31119723,
         1.62583974,  0.81150639, -0.64839353, -0.01693457,  0.08221367]])

In [13]:
# `DataFrame.info()` writes its summary to stdout itself and returns None,
# so wrapping it in print() only adds a stray "None" line.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337840 entries, 348592 to 194782
Data columns (total 20 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   Geographic-division  1337840 non-null  float64
 1   area-code            1337840 non-null  float64
 2   region               1337840 non-null  float64
 3   state                1337840 non-null  float64
 4   RT                   1337840 non-null  float64
 5   person-weight        1337840 non-null  float64
 6   age                  1337840 non-null  float64
 7   citizenship-status   1337840 non-null  float64
 8   class-worker         1337840 non-null  float64
 9   education            1337840 non-null  float64
 10  marital-status       1337840 non-null  float64
 11  occupation           1337840 non-null  float64
 12  relationship         1337840 non-null  float64
 13  race                 1337840 non-null  float64
 14  sex                  1337840 non-null  float64

In [14]:
# Visualize model performance
# Log the Weights & Biases classifier charts for the fitted logistic regression.
wandb.sklearn.plot_classifier(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred,
    y_probas,
    labels,
    is_binary=True,
    model_name='LogisticRegression',
    feature_names=features,
)

wandb: 
wandb: Plotting LogisticRegression.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


In [16]:
# Per-class precision, recall and F1 on the held-out split.
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

              precision    recall  f1-score   support

       <=50K       0.77      0.84      0.80    201977
        >50K       0.72      0.62      0.66    132483

    accuracy                           0.75    334460
   macro avg       0.74      0.73      0.73    334460
weighted avg       0.75      0.75      0.75    334460

