<a href="https://colab.research.google.com/github/andrew-ryabchenko/DS-Unit-2-Kaggle-Challenge/blob/master/Kaggle_Challenge_Random_Forest_Tuning_and_Final_Submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install category_encoders;



In [136]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, r2_score
from category_encoders import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from statistics import mean
from sklearn.tree import DecisionTreeClassifier
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector

In [266]:
train_labels = pd.read_csv('https://raw.githubusercontent.com/andrew-ryabchenko/DS-Unit-2-Kaggle-Challenge/master/train_labels.csv')

train_data = pd.read_csv('https://raw.githubusercontent.com/andrew-ryabchenko/DS-Unit-2-Kaggle-Challenge/master/train_features.csv', parse_dates = ['date_recorded'])

test_features = pd.read_csv('https://raw.githubusercontent.com/andrew-ryabchenko/DS-Unit-2-Kaggle-Challenge/master/test_features.csv', parse_dates = ['date_recorded'])

pop_data = pd.read_csv('https://raw.githubusercontent.com/andrew-ryabchenko/DS-Unit-2-Kaggle-Challenge/master/tanzania_pop_2012.csv')

data = pd.merge(left=train_data, right=train_labels, on='id')

In [267]:
assert len(data) == len(train_data) == len(train_labels), 'Ooops... something went wrong...'

* Columns to merge on in **pop_data**
> Reg_Name, Dis_Name, Ward_Name

* Columns to merge on in **data**
> region, lga, ward


In [283]:
def merge(data, population):
  data = data.copy()
  population = population.copy()
  # rename Dis_Name value 'Moshi Municipal' to 'Moshi Urban' so it matches training data
  population['Dis_Name'] = population['Dis_Name'].apply(lambda x: 'Moshi Urban' if x == 'Moshi Municipal' else x)

  # create a subset of pop_data with only features we need
  population = population[['Reg_Name', 'Dis_Name', 'Ward_Name', 'total_both', 'area_km2', 'Pop_Den', 'ward_type']]

  # Prepare object values for merging
  population['Reg_Name'] = population['Reg_Name'].str.strip().str.lower()
  population['Ward_Name'] = population['Ward_Name'].str.strip().str.lower()
  population['Dis_Name'] = population['Dis_Name'].str.strip().str.lower()

  data['region'] = data['region'].str.strip().str.lower()
  data['lga'] = data['lga'].str.strip().str.lower()
  data['ward'] = data['ward'].str.strip().str.lower()

  # Give columns the same names in both datasets
  population.rename({'Dis_Code':'district_code', 'Reg_Code':'region_code', 'Reg_Name': 'region', 'Dis_Name':'lga', 'Ward_Name': 'ward'}, axis = 1, inplace=True)
  
  # Merge
  data_full = pd.merge(left = data, right = population, on = ['lga','region','ward'], how = 'left')

  assert len(data) == len(data_full), 'Ooops... Something went wrong...'

  return data_full

In [282]:
def wrangle(data):
  data = data.copy()
  # Drop high-cardinality features
  
  mask = data.select_dtypes('object').nunique() > 130
  cols_to_drop = (data.select_dtypes('object').nunique() > 130)[mask].index
  data.drop(columns=cols_to_drop, inplace=True)
  data.drop(columns = ['extraction_type_group','extraction_type_class', 'quantity_group', 'date_recorded', 'scheme_management',
                       'population', 'num_private','payment_type','waterpoint_type_group', 'recorded_by', 'public_meeting', 'permit'], inplace = True)

  #Impute Columns
  columns_n = make_column_selector(dtype_include = 'number')
  columns_0 = make_column_selector(dtype_include = 'object')

  imp_o = SimpleImputer(strategy='most_frequent')
  imp_n = SimpleImputer()

  data[columns_n(data)] = imp_n.fit_transform(data[columns_n(data)])
  data[columns_o(data)] = imp_o.fit_transform(data[columns_o(data)])

  # Fix null island issue
  if (len(data) > 59000):
    data = data[~(data['longitude']==0)]

  return data

In [284]:
data_clean = wrangle(merge(data, pop_data))

In [285]:
# Create feature matrix and target vector

X = data_clean.drop(columns=['id', 'status_group'])
y = data_clean['status_group']

assert len(X) == len(y), 'X and y are diffferent lenght'

In [290]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.33, random_state=42)

model = make_pipeline(
    OrdinalEncoder(),
    RandomForestClassifier(random_state=42,max_depth = 35, max_samples= 0.4, n_estimators=300)
)

model.fit(X_train, y_train);

In [292]:
model.score(X_val, y_val)

0.8068929229150223

In [291]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [293]:
params = {'randomforestclassifier__max_depth': range(15,86,10),
          'randomforestclassifier__max_samples': np.arange(0,0.8,0.1),
          'randomforestclassifier__n_estimators': range(10,101,10)
          }

In [None]:
grid_search = RandomizedSearchCV(
    model,
    params,
    n_jobs = -1,
    verbose=True,
    cv=5,
    random_state = 42
)
grid_search.fit(X_train,y_train)

In [295]:
grid_search.best_params_

{'randomforestclassifier__max_depth': 85,
 'randomforestclassifier__max_samples': 0.30000000000000004,
 'randomforestclassifier__n_estimators': 80}

In [298]:
tuned_model = grid_search.best_estimator_

In [300]:
#create submission
XT = wrangle(merge(test_features, pop_data))
predictions = tuned_model.predict(XT.drop(columns=['id']))
assert len(predictions) == len(test_features), 'Ooops, something went wrong...'

In [301]:
DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
sample_submission = pd.read_csv(DATA_PATH+'waterpumps/sample_submission.csv')
submission = sample_submission.copy()
submission['status_group'] = predictions
submission.to_csv('kaggle_waterpump_challenge_submission_4.csv', index=False)