# Run Models on 2019 Data Joined with Median Home Value and Gross Rent
#### 10/23/2020
---

## Load and Process Dataset
### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import wandb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Open the Weights & Biases run that the plotting calls further down log to.
wandb_project = "2019_newvar_home-value_gross-rent"
wandb.init(project=wandb_project)

wandb: Currently logged in as: apraturu (use `wandb login --relogin` to force relogin)


### Read in `2019_new-var_home-value_gross-rent.csv` data

In [22]:
# Read the joined CSV and keep a random 50% of its rows.
# random_state pins the sample so that Restart & Run All selects the same rows
# (and therefore reproduces the same downstream metrics) on every run.
df = pd.read_csv('../final_data/2019_new-var_home-value_gross-rent.csv').sample(
    frac=0.5, random_state=42
)
df.head()

Unnamed: 0,Geographic-division,region,state,RT,person-weight,age,citizenship-status,class-worker,education,marital-status,...,sex,extra-income,hours-per-week,field-of-degree,place-of-birth,income,stem-degree,full PUMA,med_home_value,med_gross_rent
997315,Middle Atlantic,Northeast,36,P,74,50,Born-US,Private-prof,some-college-great1,Divorced,...,female,0.0,50.0,,Massachusetts/MA,56000.0,,3604005,1822200,2332.0
1270198,Middle Atlantic,Northeast,42,P,63,76,Born-US,Private-prof,HSgrad,Married,...,male,0.0,30.0,,Pennsylvania/PA,63000.0,,4201805,90700,691.0
1366849,East South Central,South,47,P,99,62,Born-US,Private-prof,grade8,Never-married,...,male,0.0,32.0,,Tennessee/TN,28000.0,,4700700,162600,741.0
1484936,West South Central,South,48,P,60,40,Born-US,State-gov,bachelor,Never-married,...,female,0.0,40.0,Accounting,Texas/TX,45000.0,No,4800501,126100,882.0
78115,Pacific,West,6,P,24,20,Born-US,Private-prof,some-college-great1,Never-married,...,female,0.0,27.0,,California/CA,6500.0,,603711,647300,1711.0


In [23]:
# Turn the numeric income into the two target classes. A missing income matches
# neither class and stays NaN, so the dropna() at the end of the chain removes it.
income_known = df['income'].notna()
income_class = (
    df['income'].gt(50000)
    .map({True: '>50K', False: '<=50K'})
    .where(income_known)
)

# med_home_value is coerced to numeric; values that cannot be parsed become NaN
# and are likewise removed by dropna().
# NOTE(review): dropna() discards every row with ANY missing value, which also
# removes rows whose field-of-degree / stem-degree are blank - confirm that
# restricting the sample this way is intended.
df = (
    df.assign(
        med_home_value=pd.to_numeric(df['med_home_value'], errors='coerce'),
        income_adjusted=income_class,
    )
    .drop(columns=['income'])
    .rename(columns={'income_adjusted': 'income'})
    .dropna()
)

df.head()

Unnamed: 0,Geographic-division,region,state,RT,person-weight,age,citizenship-status,class-worker,education,marital-status,...,sex,extra-income,hours-per-week,field-of-degree,place-of-birth,stem-degree,full PUMA,med_home_value,med_gross_rent,income
1484936,West South Central,South,48,P,60,40,Born-US,State-gov,bachelor,Never-married,...,female,0.0,40.0,Accounting,Texas/TX,No,4800501,126100.0,882.0,<=50K
1502141,West South Central,South,48,P,72,51,Born-US,Private-nonprof,bachelor,Married,...,female,0.0,40.0,Anthropology And Archeology,Texas/TX,Yes,4805305,579300.0,1386.0,>50K
1462128,West South Central,South,48,P,46,30,Born-US,Private-nonprof,master,Married,...,female,0.0,40.0,Psychology,Nevada/NV,Yes,4804612,343700.0,1158.0,<=50K
1184622,East North Central,Midwest,39,P,293,28,Born-US,Private-nonprof,bachelor,Never-married,...,male,0.0,28.0,Biology,Michigan/MI,Yes,3904104,328700.0,1133.0,<=50K
243777,Pacific,West,6,P,138,54,Born-US,Private-prof,bachelor,Married,...,male,5000.0,40.0,Mechanical Engineering,Texas/TX,Yes,608702,908000.0,1885.0,>50K


In [24]:
# Per-column count of missing values; all zeros are expected after dropna().
df.isna().sum()

Geographic-division    0
region                 0
state                  0
RT                     0
person-weight          0
age                    0
citizenship-status     0
class-worker           0
education              0
marital-status         0
occupation             0
relationship           0
race                   0
sex                    0
extra-income           0
hours-per-week         0
field-of-degree        0
place-of-birth         0
stem-degree            0
full PUMA              0
med_home_value         0
med_gross_rent         0
income                 0
dtype: int64

In [25]:
# Feature matrix: every column except the target.
X = df.drop(columns=['income'])
# Target as a 1-D array of class labels. Selecting with df[['income']] would give
# an (n, 1) column vector, which scikit-learn classifiers only accept with a
# DataConversionWarning (hidden here by the global warnings filter).
y = df['income'].to_numpy()

In [26]:
# Sorted distinct class labels and the ordered feature names, both passed to the
# wandb plotting call later on.
features = X.columns.tolist()
labels = np.unique(y)

### Pre-process categorical variables

In [27]:
# Columns that are integer-coded with a LabelEncoder before scaling.
cat_var = [
    'Geographic-division', 'region', 'state', 'RT', 'citizenship-status',
    'class-worker', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'field-of-degree', 'place-of-birth',
    'stem-degree',
]
# A fresh encoder per column, so each column gets its own independent code table.
for column in cat_var:
    X[column] = preprocessing.LabelEncoder().fit_transform(X[column])

# Standardize every feature to zero mean / unit variance. Rebuilding the frame
# from the scaled array keeps the column names but gives it a new default index.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics leak into the training features - consider fitting on the
# training split only.
scaler = preprocessing.StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

### Split data into test and training

In [28]:
# Hold out 20% of the rows for testing. random_state fixes the shuffle so the
# same rows land in each split on every run and reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [29]:
# Number of rows held out for testing.
X_test.shape[0]

55969

## Train Models and Establish Baseline Scores

### Random Forest

In [30]:
# Train model, get predictions
# Fit on the training split ONLY. Fitting on the full X / y lets the forest see
# the test rows during training, which inflates every metric computed on X_test.
# np.ravel flattens y to the 1-D shape that fit() expects (no-op if already 1-D);
# random_state makes the fitted forest reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, np.ravel(y_train))
y_pred = model.predict(X_test)
y_probas = model.predict_proba(X_test)
# Feature importances and the feature positions ordered from most to least important.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

In [31]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223876 entries, 212808 to 15119
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Geographic-division  223876 non-null  float64
 1   region               223876 non-null  float64
 2   state                223876 non-null  float64
 3   RT                   223876 non-null  float64
 4   person-weight        223876 non-null  float64
 5   age                  223876 non-null  float64
 6   citizenship-status   223876 non-null  float64
 7   class-worker         223876 non-null  float64
 8   education            223876 non-null  float64
 9   marital-status       223876 non-null  float64
 10  occupation           223876 non-null  float64
 11  relationship         223876 non-null  float64
 12  race                 223876 non-null  float64
 13  sex                  223876 non-null  float64
 14  extra-income         223876 non-null  float64
 15  hours-per-wee

In [32]:
# Visualize model performance
# Logs the classifier diagnostics to the active wandb run.
wandb.sklearn.plot_classifier(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    y_pred,
    y_probas,
    labels,
    is_binary=True,
    model_name='RandomForest',
    feature_names=features,
)

wandb: 
wandb: Plotting RandomForest.
wandb: Logged feature importances.
wandb: Logged learning curve.
wandb: Logged confusion matrix.
wandb: Logged summary metrics.
wandb: Logged class proportions.
wandb: Logged calibration curve.
wandb: Logged roc curve.
wandb: Logged precision recall curve.
