# Run Models on 2019 Data Joined with Incarceration Rates
#### 10/23/2020
---

## Load and Process Dataset
### Import Libraries

In [15]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Start a Weights & Biases run under the "2019_newvar_incarceration" project.
wandb.init(project="2019_newvar_incarceration")

wandb: Currently logged in as: apraturu (use `wandb login --relogin` to force relogin)


### Read in the `2019_new-var_inc.csv` data

In [3]:
# Load the 2019 dataset (joined with incarceration rates) and shuffle its rows.
# random_state pins the shuffle so the row order is the same on every run.
df = pd.read_csv('../final_data/2019_new-var_inc.csv').sample(frac=1, random_state=42)
df.head()

Unnamed: 0,Geographic-division,area-code,region,state,RT,person-weight,age,citizenship-status,class-worker,education,...,Incarceration rate per 100000,Incarceration rate: White alone,Incarceration rate: Black or African American alone,Incarceration rate: American Indian and Alaska Native alone,Incarceration rate: Asian alone,Incarceration rate: Native Hawaiian and other Pacific Islander alone,Incarceration rate: Some other race alone,Incarceration rate: Two or more races,Incarceration rate: Hispanic or Latino,"Incarceration rate: White alone, not Hispanic or Latino"
1261943,Middle Atlantic,2500,Northeast,PA,P,138,29,Born-US,Private-prof,grade9,...,770.091965,398.086234,3268.51706,1031.926387,152.110643,629.619491,3194.532582,385.981878,2000.805936,374.554764
1255857,Middle Atlantic,2801,Northeast,PA,P,38,41,Born-US,Private-prof,HSgrad,...,770.091965,398.086234,3268.51706,1031.926387,152.110643,629.619491,3194.532582,385.981878,2000.805936,374.554764
1630800,South Atlantic,700,South,WV,P,40,46,Born-US,Local-gov,master,...,895.361777,644.889505,7360.116596,2561.394243,378.848944,3271.028037,6357.247916,652.125857,4585.054787,621.634335
584673,East North Central,2302,Midwest,IN,P,43,28,Born-US,Private-prof,bachelor,...,751.009978,563.579549,2814.353133,888.311126,75.141011,553.662692,434.471537,177.48102,781.35625,542.121532
548597,East North Central,202,Midwest,IL,P,82,43,Born-US,Private-prof,associate,...,552.022691,283.017521,2128.25236,821.145054,42.594227,345.679012,472.828333,149.319613,472.090346,258.014658


In [4]:
# Binarize income into two brackets around the 50,000 threshold. Rows whose income
# is missing match neither mask, keep a NaN label, and are removed by dropna() below.
at_or_below_50k = df['income'] <= 50000
above_50k = df['income'] > 50000
df.loc[at_or_below_50k, 'income_adjusted'] = '<=50K'
df.loc[above_50k, 'income_adjusted'] = '>50K'

# Swap the numeric income column for the bracket label, keeping the name 'income'.
df = df.drop(columns=['income'])
df = df.rename(columns={'income_adjusted': 'income'})

# A missing degree field becomes its own 'Unknown' category instead of being dropped.
for degree_col in ('field-of-degree', 'stem-degree'):
    df[degree_col] = df[degree_col].fillna('Unknown')

# Any row that still has a missing value in any column is discarded.
df = df.dropna()

df.head()

Unnamed: 0,Geographic-division,area-code,region,state,RT,person-weight,age,citizenship-status,class-worker,education,...,Incarceration rate: White alone,Incarceration rate: Black or African American alone,Incarceration rate: American Indian and Alaska Native alone,Incarceration rate: Asian alone,Incarceration rate: Native Hawaiian and other Pacific Islander alone,Incarceration rate: Some other race alone,Incarceration rate: Two or more races,Incarceration rate: Hispanic or Latino,"Incarceration rate: White alone, not Hispanic or Latino",income
1261943,Middle Atlantic,2500,Northeast,PA,P,138,29,Born-US,Private-prof,grade9,...,398.086234,3268.51706,1031.926387,152.110643,629.619491,3194.532582,385.981878,2000.805936,374.554764,<=50K
1255857,Middle Atlantic,2801,Northeast,PA,P,38,41,Born-US,Private-prof,HSgrad,...,398.086234,3268.51706,1031.926387,152.110643,629.619491,3194.532582,385.981878,2000.805936,374.554764,>50K
1630800,South Atlantic,700,South,WV,P,40,46,Born-US,Local-gov,master,...,644.889505,7360.116596,2561.394243,378.848944,3271.028037,6357.247916,652.125857,4585.054787,621.634335,<=50K
584673,East North Central,2302,Midwest,IN,P,43,28,Born-US,Private-prof,bachelor,...,563.579549,2814.353133,888.311126,75.141011,553.662692,434.471537,177.48102,781.35625,542.121532,<=50K
548597,East North Central,202,Midwest,IL,P,82,43,Born-US,Private-prof,associate,...,283.017521,2128.25236,821.145054,42.594227,345.679012,472.828333,149.319613,472.090346,258.014658,<=50K


In [5]:
# Count the missing values left in each column.
df.isna().sum()

Geographic-division                                                     0
area-code                                                               0
region                                                                  0
state                                                                   0
RT                                                                      0
person-weight                                                           0
age                                                                     0
citizenship-status                                                      0
class-worker                                                            0
education                                                               0
marital-status                                                          0
occupation                                                              0
relationship                                                            0
race                                  

In [6]:
# Separate the predictors from the target.
X = df.drop(columns=['income'])
# Select the column as a Series so y is a 1-D array of shape (n_samples,).
# df[['income']] would give an (n_samples, 1) column vector, which scikit-learn
# only accepts after an implicit reshape (and a DataConversionWarning).
y = df['income'].to_numpy()

In [7]:
# Predictor names and the sorted unique target values, kept for reports and plots.
features = X.columns.tolist()
labels = np.unique(y)

In [8]:
# Display the predictor column names.
X.columns

Index(['Geographic-division', 'area-code', 'region', 'state', 'RT',
       'person-weight', 'age', 'citizenship-status', 'class-worker',
       'education', 'marital-status', 'occupation', 'relationship', 'race',
       'sex', 'extra-income', 'hours-per-week', 'field-of-degree',
       'place-of-birth', 'stem-degree', 'Incarceration rate per 100000',
       'Incarceration rate: White alone',
       'Incarceration rate: Black or African American alone',
       'Incarceration rate: American Indian and Alaska Native alone',
       'Incarceration rate: Asian alone',
       'Incarceration rate: Native Hawaiian and other Pacific Islander alone',
       'Incarceration rate: Some other race alone',
       'Incarceration rate: Two or more races',
       'Incarceration rate: Hispanic or Latino',
       'Incarceration rate: White alone, not Hispanic or Latino'],
      dtype='object')

### Pre-process categorical variables and scale features

In [9]:
# Columns that are re-coded as integer category codes.
cat_var = [
    'Geographic-division', 'area-code', 'region', 'state', 'RT',
    'citizenship-status', 'class-worker', 'education', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'field-of-degree',
    'place-of-birth', 'stem-degree',
]

# Replace each listed column with integer codes, using a fresh encoder per column.
for col in cat_var:
    enc = preprocessing.LabelEncoder()
    codes = enc.fit_transform(X[col])
    X[col] = codes

# Standardize every feature and rebuild the frame under the original column names.
# The rebuilt frame gets a fresh default index.
# NOTE(review): the scaler is fitted on all rows, before the train/test split that
# follows, so test rows contribute to the scaling statistics.
scaler = preprocessing.StandardScaler()
scaled = scaler.fit_transform(X)
X = pd.DataFrame(scaled, columns=X.columns)

### Split data into test and training

In [10]:
# Hold out 20% of the rows for testing. train_test_split shuffles by default, so
# random_state is pinned to make the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Number of rows in the held-out test split.
X_test.shape[0]

334460

## Train Models and Establish Baseline Scores

### Logistic Regression

In [12]:
# Train model, get predictions
model = LogisticRegression()
# Fit on the training split only. Fitting on the full X, y would let the model see
# the held-out rows, so every metric computed on X_test would be optimistically biased.
model.fit(X_train, y_train)

# Hard class predictions and class probabilities for the held-out rows.
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)

# For a binary problem coef_ has shape (1, n_features). Sort along the feature axis
# and reverse that same axis, so indices runs from the largest to the smallest
# coefficient. A plain [::-1] would only flip the single row and leave the order ascending.
importances = model.coef_
indices = np.argsort(importances, axis=1)[:, ::-1]

In [13]:
# DataFrame.info() writes its summary to stdout itself and returns None, so wrapping
# it in print() would append a stray "None" line to the output.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337840 entries, 1442516 to 1397067
Data columns (total 30 columns):
 #   Column                                                                Non-Null Count    Dtype  
---  ------                                                                --------------    -----  
 0   Geographic-division                                                   1337840 non-null  float64
 1   area-code                                                             1337840 non-null  float64
 2   region                                                                1337840 non-null  float64
 3   state                                                                 1337840 non-null  float64
 4   RT                                                                    1337840 non-null  float64
 5   person-weight                                                         1337840 non-null  float64
 6   age                                                                 

In [14]:
# Visualize model performance: log the wandb classifier plots for the fitted model,
# passing the train/test splits, the test-set predictions and probabilities, the
# class labels, and the feature names.
wandb.sklearn.plot_classifier(
    model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels,
    is_binary=True, model_name='LogisticRegression', feature_names=features)

wandb: 
wandb: Plotting LogisticRegression.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


In [16]:
# Text summary of per-class metrics for the test-set predictions.
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

              precision    recall  f1-score   support

       <=50K       0.77      0.84      0.81    201617
        >50K       0.72      0.63      0.67    132843

    accuracy                           0.76    334460
   macro avg       0.75      0.73      0.74    334460
weighted avg       0.75      0.76      0.75    334460

