# Run Models on 2019 Data Joined with Educational Attainment
#### 10/30/2020
---

## Load and Process Dataset
### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Open a Weights & Biases run; everything logged later in the notebook is
# attached to this project.
wandb.init(
    project="2019_newvar_educational-attainment",
)

wandb: Currently logged in as: apraturu (use `wandb login --relogin` to force relogin)


### Read in `2019_new-var_educational_attainment.csv` data

In [6]:
# Load the 2019 records joined with area-level educational attainment and
# shuffle the rows. The shuffle is seeded so that a fresh
# "Restart Kernel -> Run All" reproduces the same row order.
df = (
    pd.read_csv('../final_data/2019_new-var_educational_attainment.csv')
    .sample(frac=1, random_state=42)
)
df.head()

Unnamed: 0,Geographic-division,region,state,RT,person-weight,age,citizenship-status,class-worker,education,marital-status,...,full PUMA,less_than_9th,9th_to_12th,hs_grad,some_college,associates_degree,bach_degree,grad_or_prof_degree,hs_or_higher,bach_degree_or_higher
1472652,West South Central,South,48,P,157,31,Born-US,Private-prof,HSgrad,Never-married,...,4802101,4.7,7.4,27.8,25.7,8.4,17.7,8.4,87.9,26.1
722689,New England,Northeast,25,P,56,47,Born-US,Self-emp-inc,bachelor,Married,...,2501400,1.3,2.3,11.5,10.0,5.8,32.5,36.5,96.4,69.1
902432,Mountain,West,32,P,46,18,Born-US,Private-nonprof,no-school,Never-married,...,3200404,10.7,13.3,31.4,23.7,7.8,7.9,5.1,76.0,13.0
1625028,Pacific,West,53,P,111,33,Born-US,Private-prof,master,Married,...,5311615,1.3,4.1,20.5,24.9,11.4,25.6,12.1,94.6,37.8
463770,South Atlantic,South,13,P,45,35,Born-US,Private-prof,bachelor,Married,...,1300800,7.5,11.1,33.7,19.5,9.5,11.4,7.3,81.4,18.6


In [7]:
# Turn the numeric income into a two-class label split at 50,000.
# Rows matching neither comparison (e.g. missing income) keep a NaN label.
at_or_below_50k = df['income'] <= 50000
above_50k = df['income'] > 50000
df.loc[at_or_below_50k, 'income_adjusted'] = '<=50K'
df.loc[above_50k, 'income_adjusted'] = '>50K'

# Swap the numeric column out for the binned label, keeping the name `income`.
df = df.drop(columns='income').rename(columns={'income_adjusted': 'income'})

# Missing degree information becomes an explicit 'Unknown' category.
# The educational-attainment percentage columns are left as they are: no
# imputation and no row dropping is applied in this cell.
for column in ('field-of-degree', 'stem-degree'):
    df[column] = df[column].fillna('Unknown')

df.head()

Unnamed: 0,Geographic-division,region,state,RT,person-weight,age,citizenship-status,class-worker,education,marital-status,...,less_than_9th,9th_to_12th,hs_grad,some_college,associates_degree,bach_degree,grad_or_prof_degree,hs_or_higher,bach_degree_or_higher,income
1472652,West South Central,South,48,P,157,31,Born-US,Private-prof,HSgrad,Never-married,...,4.7,7.4,27.8,25.7,8.4,17.7,8.4,87.9,26.1,<=50K
722689,New England,Northeast,25,P,56,47,Born-US,Self-emp-inc,bachelor,Married,...,1.3,2.3,11.5,10.0,5.8,32.5,36.5,96.4,69.1,>50K
902432,Mountain,West,32,P,46,18,Born-US,Private-nonprof,no-school,Never-married,...,10.7,13.3,31.4,23.7,7.8,7.9,5.1,76.0,13.0,<=50K
1625028,Pacific,West,53,P,111,33,Born-US,Private-prof,master,Married,...,1.3,4.1,20.5,24.9,11.4,25.6,12.1,94.6,37.8,>50K
463770,South Atlantic,South,13,P,45,35,Born-US,Private-prof,bachelor,Married,...,7.5,11.1,33.7,19.5,9.5,11.4,7.3,81.4,18.6,<=50K


In [8]:
# Per-column count of missing values remaining after the cleaning step.
df.isna().sum()

Geographic-division      0
region                   0
state                    0
RT                       0
person-weight            0
age                      0
citizenship-status       0
class-worker             0
education                0
marital-status           0
occupation               0
relationship             0
race                     0
sex                      0
extra-income             0
hours-per-week           0
field-of-degree          0
place-of-birth           0
stem-degree              0
full PUMA                0
less_than_9th            0
9th_to_12th              0
hs_grad                  0
some_college             0
associates_degree        0
bach_degree              0
grad_or_prof_degree      0
hs_or_higher             0
bach_degree_or_higher    0
income                   0
dtype: int64

In [9]:
# Feature matrix: every column except the label.
X = df.drop(columns=['income'])
# Target as a 1-D array of shape (n_samples,). Selecting with a single key
# (rather than a one-element list) avoids the (n_samples, 1) column vector
# that scikit-learn estimators have to reshape and warn about.
y = df['income'].to_numpy()

In [10]:
# Sorted distinct class labels, and the feature names in column order;
# both are passed to the reporting/plotting calls further down.
labels = np.unique(y)
features = X.columns.tolist()

### Pre-process categorical variables

In [11]:
# Columns treated as categorical and integer-coded below.
cat_var = [
    'Geographic-division', 'region', 'state', 'RT',
    'citizenship-status', 'class-worker', 'education', 'marital-status',
    'occupation', 'relationship', 'race', 'sex',
    'field-of-degree', 'place-of-birth', 'stem-degree',
]

# Each categorical column gets its own LabelEncoder; after the loop `enc`
# refers to the encoder fitted on the last column only.
for f in cat_var:
    enc = preprocessing.LabelEncoder()
    enc.fit(X[f])
    X[f] = enc.transform(X[f])

# Standardize every column (encoded categoricals included). The result is
# rebuilt as a DataFrame with the same column names and a fresh 0..n-1 index.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed in a later cell.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)

### Split data into test and training

In [12]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# partition (and therefore every metric computed on it) reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Number of rows in the held-out test set.
X_test.shape[0]

334460

## Train Models and Establish Baseline Scores

### Logistic Regression

In [14]:
# Train model, get predictions.
# The model is fitted on the training split only: fitting on the full X / y
# would expose the held-out rows to the model and inflate every test metric.
# np.ravel gives the estimator a 1-D target whether y_train is already flat
# or an (n, 1) column vector.
model = LogisticRegression()
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)

# coef_ is 2-D (one row per fitted decision function), so the ordering has to
# be reversed along the feature axis; a plain [::-1] would only flip the rows.
# `indices` keeps the same shape as `importances`, with feature positions
# ordered from largest to smallest coefficient in each row.
importances = model.coef_
indices = np.argsort(importances, axis=1)[:, ::-1]

In [15]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() appends a stray "None".
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337840 entries, 698431 to 413666
Data columns (total 29 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Geographic-division    1337840 non-null  float64
 1   region                 1337840 non-null  float64
 2   state                  1337840 non-null  float64
 3   RT                     1337840 non-null  float64
 4   person-weight          1337840 non-null  float64
 5   age                    1337840 non-null  float64
 6   citizenship-status     1337840 non-null  float64
 7   class-worker           1337840 non-null  float64
 8   education              1337840 non-null  float64
 9   marital-status         1337840 non-null  float64
 10  occupation             1337840 non-null  float64
 11  relationship           1337840 non-null  float64
 12  race                   1337840 non-null  float64
 13  sex                    1337840 non-null  float64
 14  extra-income  

In [16]:
# Log the standard Weights & Biases classifier diagnostics for this model
# to the active run.
wandb.sklearn.plot_classifier(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred,
    y_probas,
    labels,
    is_binary=True,
    model_name='LogisticRegression',
    feature_names=features,
)

wandb: 
wandb: Plotting LogisticRegression.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.


In [17]:
# Per-class precision / recall / F1 on the held-out test set.
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

              precision    recall  f1-score   support

       <=50K       0.78      0.84      0.81    201905
        >50K       0.73      0.64      0.68    132555

    accuracy                           0.76    334460
   macro avg       0.75      0.74      0.74    334460
weighted avg       0.76      0.76      0.76    334460

