# Run Models on 2019 Data with Additional Variables
#### 10/23/2020
---

## Load and Process Dataset
### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Start a Weights & Biases run in the "2019_newvar" project; the wandb plotting
# call later in this notebook logs its charts to this run.
wandb.init(project="2019_newvar")

wandb: Currently logged in as: apraturu (use `wandb login --relogin` to force relogin)


### Read in `2019_new-var_mapped_processed.csv` data

In [8]:
# Load the processed 2019 dataset and keep a random 50% sample of its rows.
# The fixed random_state makes the sample, and therefore every downstream
# result, repeatable across kernel restarts.
df = pd.read_csv('../final_data/2019_new-var_mapped_processed.csv').sample(frac=0.5, random_state=42)
df.head()

Unnamed: 0,Geographic-division,area-code,region,state,RT,person-weight,age,citizenship-status,class-worker,education,...,occupation,relationship,race,sex,extra-income,hours-per-week,field-of-degree,place-of-birth,income,stem-degree
1483269,West South Central,4608,South,TX,P,41,18,Born-US,Private-prof,grade11,...,SAL-Cashiers,25,white-alone,male,0.0,25.0,,Texas/TX,320.0,
477914,South Atlantic,3005,South,GA,P,57,19,Born-US,Private-prof,HSgrad,...,EAT-Fast Food And Counter Workers,25,white-alone,male,0.0,11.0,,Georgia/GA,4600.0,
887788,Mountain,100,West,MT,P,37,48,Born-US,Private-prof,associate,...,LGL-Paralegals And Legal Assistants,21,white-alone,female,0.0,41.0,,Colorado/CO,40000.0,
406542,South Atlantic,11704,South,FL,P,115,29,Born-US,Self-emp-not-inc,associate,...,MGR-Other Managers,20,white-alone,male,80.0,42.0,,Texas/TX,341080.0,
983759,Middle Atlantic,1600,Northeast,NY,P,27,51,Born-US,Private-nonprof,no-school,...,"PRD-Miscellaneous Production Workers, Includin...",38,white-alone,female,0.0,25.0,,New York/NY,9900.0,


In [9]:
# Turn the numeric income into a two-class label: '>50K' above 50000,
# '<=50K' otherwise. Rows whose income is missing keep a missing label, so the
# final dropna() removes them together with any other row containing a NaN.
income_raw = df['income']
income_label = (
    income_raw.gt(50000)
    .map({True: '>50K', False: '<=50K'})
    .where(income_raw.notna())
)

# Swap the numeric column for the label; it ends up as the last column.
df = df.drop(columns=['income']).assign(income=income_label).dropna()

df.head()

Unnamed: 0,Geographic-division,area-code,region,state,RT,person-weight,age,citizenship-status,class-worker,education,...,occupation,relationship,race,sex,extra-income,hours-per-week,field-of-degree,place-of-birth,stem-degree,income
1303390,Middle Atlantic,3900,Northeast,PA,P,90,58,Born-US,Private-prof,bachelor,...,BUS-Management Analysts,20,white-alone,male,20.0,55.0,Business Management And Administration,Pennsylvania/PA,No,>50K
1429061,West South Central,4801,South,TX,P,126,71,Born-US,Fed-gov,master,...,FIN-Credit Counselors And Loan Officers,20,black-aa-alone,male,0.0,40.0,Psychology,New York/NY,Yes,<=50K
42215,Mountain,109,West,AZ,P,64,26,Born-US,Fed-gov,bachelor,...,MGR-Marketing Managers,20,white-alone,male,0.0,75.0,Applied Mathematics,Arizona/AZ,Yes,<=50K
703724,South Atlantic,1006,South,MD,P,52,37,Born-US,State-gov,prof-school,...,MED-Physicians,20,white-alone,female,300.0,50.0,Multi-Disciplinary Or General Science,Delaware/DE,Yes,>50K
1643752,East North Central,20000,Midwest,WI,P,186,35,Born-US,Private-prof,bachelor,...,MGR-Computer And Information Systems Managers,20,white-alone,female,0.0,30.0,French German Latin And Other Common Foreign L...,Wisconsin/WI,No,<=50K


In [10]:
# Count the missing values left in each column.
df.isna().sum()

Geographic-division    0
area-code              0
region                 0
state                  0
RT                     0
person-weight          0
age                    0
citizenship-status     0
class-worker           0
education              0
marital-status         0
occupation             0
relationship           0
race                   0
sex                    0
extra-income           0
hours-per-week         0
field-of-degree        0
place-of-birth         0
stem-degree            0
income                 0
dtype: int64

In [14]:
# Separate the predictors from the target.
# The target is selected as a single column (Series) so that y is a 1-D array
# of shape (n_samples,), the form scikit-learn classifiers expect;
# df[['income']] would produce an (n_samples, 1) column vector instead.
X = df.drop(columns=['income'])
y = df['income'].to_numpy()

In [15]:
# Feature names in column order, and the distinct target classes.
features = X.columns.tolist()
labels = np.unique(y)

### Pre-process categorical variables

In [17]:
# Columns holding categories rather than measurements.
cat_var = [
    'Geographic-division', 'area-code', 'region', 'state', 'RT',
    'citizenship-status', 'class-worker', 'education', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'field-of-degree',
    'place-of-birth', 'stem-degree',
]

# Replace each categorical column with integer codes, using a fresh encoder
# per column.
for column in cat_var:
    enc = preprocessing.LabelEncoder()
    encoded = enc.fit_transform(X[column])
    X[column] = encoded

# Standardize every column and rebuild the frame with the original column
# names (the row index is not carried over).
# NOTE(review): the scaler is fit on all rows before the train/test split
# further down, so test-row statistics influence the scaling.
scaler = preprocessing.StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

### Split data into test and training

In [18]:
# Hold out 20% of the rows for testing. The fixed random_state keeps the split
# identical across re-runs so that scores are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Number of rows in the held-out test set.
X_test.shape[0]

61555

## Train Models and Establish Baseline Scores

### Random Forest

In [20]:
# Train model, get predictions
# Fit on the training split only: fitting on the full X / y lets the model see
# the held-out rows and inflates every metric computed on X_test.
# np.ravel flattens a column-vector target to the 1-D shape sklearn expects
# (and leaves an already 1-D target unchanged).
# The fixed random_state makes the fitted forest repeatable.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
# Per-feature importances and the feature positions sorted most-important first.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

In [21]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246216 entries, 98796 to 193821
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Geographic-division  246216 non-null  float64
 1   area-code            246216 non-null  float64
 2   region               246216 non-null  float64
 3   state                246216 non-null  float64
 4   RT                   246216 non-null  float64
 5   person-weight        246216 non-null  float64
 6   age                  246216 non-null  float64
 7   citizenship-status   246216 non-null  float64
 8   class-worker         246216 non-null  float64
 9   education            246216 non-null  float64
 10  marital-status       246216 non-null  float64
 11  occupation           246216 non-null  float64
 12  relationship         246216 non-null  float64
 13  race                 246216 non-null  float64
 14  sex                  246216 non-null  float64
 15  extra-income 

In [22]:
# Visualize model performance: log wandb's sklearn classifier charts for the
# fitted model. Per the cell output these are feature importances, learning
# curve, confusion matrix, summary metrics, class proportions, calibration
# curve, ROC curve and precision-recall curve.
wandb.sklearn.plot_classifier(
    model, X_train, X_test, y_train, y_test, y_pred, y_probas, labels,
    is_binary=True, model_name='RandomForest', feature_names=features)

wandb: 
wandb: Plotting RandomForest.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.
