# Random Forest

Here we explore the use of a random forest (RF) model.

In [13]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

For now a maximum tree depth of 30 is enough; that is already deeper than we should need.

In [14]:
# Upper bound on the depth of each tree; passed as `max_depth` to the
# RandomForestRegressor built further down.
MAX_DEPTH = 30

First we load the data and split it into training and test sets.

In [15]:
# Features and labels, both indexed by respondent_id.
features_all_df = pd.read_csv("training_set_features.csv", index_col="respondent_id")
labels_all_df = pd.read_csv("training_set_labels.csv", index_col="respondent_id")

# Hold out 33% of the rows for testing. Stratifying on the label frame keeps
# the label distribution comparable in both parts, and the fixed seed makes
# the split reproducible.
(
    features_raw_df,
    features_test_raw_df,
    labels_df,
    labels_test_df,
) = train_test_split(
    features_all_df,
    labels_all_df,
    test_size=0.33,
    stratify=labels_all_df,
    shuffle=True,
    random_state=31415,
)

features_raw_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10992,2.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Married,Own,Not in Labor Force,mlyzmhmf,"MSA, Not Principle City",2.0,0.0,,
1828,3.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,lzgpxyit,Non-MSA,3.0,1.0,pxcmvdjn,cmhcxjea
5273,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,"> $75,000",Married,Own,Employed,dqpwygqj,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
22497,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,"> $75,000",Married,Own,Not in Labor Force,kbazzjca,"MSA, Not Principle City",1.0,2.0,,
6821,0.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,,...,"> $75,000",Married,Own,Employed,oxchjgsf,"MSA, Not Principle City",1.0,0.0,xicduogh,xtkaffoo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10335,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lzgpxyit,Non-MSA,1.0,0.0,,
2122,2.0,2.0,0.0,,0.0,1.0,0.0,0.0,1.0,0.0,...,,Not Married,,,kbazzjca,"MSA, Principle City",2.0,0.0,,
17062,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,"> $75,000",Married,Rent,Employed,lrircsnp,"MSA, Not Principle City",1.0,3.0,haxffmxo,dcjcmpih
25010,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",2.0,0.0,,


In [19]:
def ordinalize(features: pd.DataFrame, raw: pd.DataFrame, name: "str | list[str]") -> pd.DataFrame:
    """Replace categorical column(s) of ``features`` with ordinal codes.

    An ``OrdinalEncoder`` is fitted on ``raw[name]`` and the encoded values
    take the place of the same-named column(s) in ``features``.

    Args:
        features: Frame being built up. It must contain ``name`` (otherwise
            ``drop`` raises ``KeyError``) and should share its index with
            ``raw``, because the final concat aligns on the index.
        raw: Frame holding the original, un-encoded values of ``name``.
        name: A column name, or a list of column names, to encode.

    Returns:
        A new frame: ``features`` without ``name``, with the encoded
        column(s) appended on the right. Neither input is modified.
    """
    columns = [name] if isinstance(name, str) else list(name)

    # Keep every encoded row (not a single one of them) and label the rows
    # with raw's index, so the concat below pairs each code with its own row
    # instead of aligning against a fresh 0..n-1 RangeIndex.
    ordinal_df = pd.DataFrame(
        OrdinalEncoder().fit_transform(raw[columns]),
        columns=columns,
        index=raw.index,
    )

    return pd.concat(
        [features.drop(columns=columns), ordinal_df],
        axis=1,
    )
    
def onehotize(features: pd.DataFrame, raw: pd.DataFrame, name: "str | list[str]") -> pd.DataFrame:
    """Replace categorical column(s) of ``features`` with one-hot indicator columns.

    A ``OneHotEncoder`` is fitted on ``raw[name]``; the resulting indicator
    columns are named by the encoder's ``get_feature_names_out()``.

    Args:
        features: Frame being built up. It must contain ``name`` (otherwise
            ``drop`` raises ``KeyError``) and should share its index with
            ``raw``, because the final concat aligns on the index.
        raw: Frame holding the original, un-encoded values of ``name``.
        name: A column name, or a list of column names, to encode.

    Returns:
        A new frame: ``features`` without ``name``, with the indicator
        columns appended on the right. Neither input is modified.
    """
    # The encoder needs 2-D input, so a bare column name is wrapped in a list.
    columns = [name] if isinstance(name, str) else list(name)

    enc = OneHotEncoder()
    # The encoder's default output is a SciPy sparse matrix. Densify it here
    # rather than passing `sparse=False`: newer scikit-learn releases renamed
    # that keyword to `sparse_output`, whereas this form works with either.
    encoded = enc.fit_transform(raw[columns]).toarray()

    onehot_df = pd.DataFrame(
        encoded,
        columns=enc.get_feature_names_out(),
        index=raw.index,  # keep rows aligned with `features` in the concat
    )
    return pd.concat(
        [features.drop(columns=columns), onehot_df],
        axis=1,
    )

def normalize(in_df: pd.DataFrame) -> pd.DataFrame:
    """Encode a raw feature frame and fill its missing values.

    The nominal columns listed below are one-hot encoded via ``onehotize``
    (fitted on ``in_df`` itself); afterwards every remaining NaN is replaced
    by ``-1``.

    Args:
        in_df: Raw feature frame containing the nominal columns listed below.

    Returns:
        A new, encoded frame with NaN replaced by ``-1``.
    """
    nominal_columns = [
        "employment_industry",
        "employment_occupation",
        "race",
        "hhs_geo_region",
        "census_msa",
    ]
    encoded_df = onehotize(in_df, in_df, nominal_columns)

    # Ordinal encoding is currently switched off for these columns, so they
    # pass through unencoded (apart from the NaN fill below):
    #   age_group, education, sex, income_poverty, marital_status,
    #   rent_or_own, employment_status
    # NOTE(review): several of them hold strings in the preview above
    # (e.g. income_poverty, marital_status); presumably they must be encoded
    # before a scikit-learn forest can be fitted -- confirm.
    return encoded_df.replace(np.nan, -1)


features_df = normalize(features_raw_df)

# normalize() fits fresh encoders on whatever frame it receives, so the test
# split could end up with a different set (or order) of one-hot columns than
# the training split. Re-index it onto the training columns: indicators for
# categories absent from the test split become all-zero columns, and
# categories never seen in training are dropped.
features_test_df = normalize(features_test_raw_df).reindex(
    columns=features_df.columns,
    fill_value=0.0,
)

# Guard against feature/label row misalignment before any model is fitted.
assert features_df.index.equals(labels_df.index), "train features/labels are misaligned"
assert features_test_df.index.equals(labels_test_df.index), "test features/labels are misaligned"

features_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,hhs_geo_region_fpwskwrf,hhs_geo_region_kbazzjca,hhs_geo_region_lrircsnp,hhs_geo_region_lzgpxyit,hhs_geo_region_mlyzmhmf,hhs_geo_region_oxchjgsf,hhs_geo_region_qufhixun,"census_msa_MSA, Not Principle City","census_msa_MSA, Principle City",census_msa_Non-MSA
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10992,2.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1828,3.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
5273,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22497,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6821,0.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10335,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2122,2.0,2.0,0.0,-1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
17062,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25010,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
# MultiOutputRegressor fits one copy of the wrapped forest per label column.
# Each forest has 100 trees, a depth cap of MAX_DEPTH and a fixed seed.
regress_multirf = MultiOutputRegressor(
    RandomForestRegressor(
        random_state=0,
        max_depth=MAX_DEPTH,
        n_estimators=100,
    )
)

Now we fit the model on the training data:

In [6]:
# Train on the encoded training features and their labels.
# NOTE(review): the ValueError recorded under this cell belongs to execution
# count In [6], which appears to predate the In [19] preprocessing cell --
# re-run the notebook from a fresh kernel to see whether it still reproduces.
regress_multirf.fit(features_df, labels_df)

ValueError: Found input variables with inconsistent numbers of samples: [17894, 17893]