# Training

## Training Setup

In [8]:
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Use scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split

def training_setup (data, target):
    """Split ``data`` into train/test partitions and list its feature columns.

    Side effect: reseeds NumPy's global random generator with 0 so that the
    shuffle performed by ``train_test_split`` is repeatable between calls.

    Parameters
    ----------
    data
        Table exposing ``.columns`` (presumably a pandas DataFrame - confirm
        against the caller); it is split row-wise.
    target
        Name of the target column; it is excluded from the feature list and
        returned unchanged.

    Returns
    -------
    tuple
        ``(train, test, target, features)`` where ``train``/``test`` are the
        80%/20% partitions of ``data`` and ``features`` is the list of column
        names in ``data`` other than ``target``.
    """
    # Fix the global seed first: the split below draws from NumPy's global RNG
    np.random.seed(0)

    # Hold out 20% of the observations for testing
    partitions = train_test_split(data, test_size=0.2)
    train, test = partitions

    print('Number of observations in the training data:', len(train))
    print('Number of observations in the test data:',len(test))

    # Every column except the target is treated as a feature
    features = []
    for column in data.columns:
        if column != target:
            features.append(column)

    return train, test, target, features


## Random Forest Classifier

In [9]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint


def training (features, target, n_iter, cv, weight=True):
    """Fit a random forest classifier tuned by a randomized hyperparameter search.

    Parameters
    ----------
    features
        Training feature matrix, passed to ``RandomizedSearchCV.fit``. Its
        length (``len(features)``) also determines the range of tree counts
        that is searched.
    target
        Training labels, passed to ``RandomizedSearchCV.fit``.
    n_iter : int
        Number of parameter settings sampled by the search.
    cv
        Cross-validation strategy forwarded to ``RandomizedSearchCV``
        (e.g. a number of folds).
    weight : bool, default True
        When true the forest uses ``class_weight="balanced"`` to compensate
        for imbalanced classes; otherwise no class weighting is applied.

    Returns
    -------
    RandomizedSearchCV
        The search object returned by ``fit``.
    """
    # **********Hyperparameter tuning**************

    # Number of trees: ten values spread between a quarter and a half of
    # len(features). Clamp to at least one tree, because n_estimators == 0
    # (reachable when len(features) < 4) is rejected by scikit-learn.
    n_rows = len(features)
    fewest_trees = max(1, n_rows // 4)
    most_trees = max(fewest_trees, n_rows // 2)
    n_estimators = [int(x) for x in np.linspace(start=fewest_trees, stop=most_trees, num=10)]
    # Number of features to consider at every split.
    # 'auto' was removed in scikit-learn 1.3; for classifiers it was an alias
    # of 'sqrt', so listing only 'sqrt' keeps the same effective search space
    # without failing (or duplicated) candidates.
    max_features = ['sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(2, 5, num=3)]
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Add weight in case of imbalanced classes
    class_weight = ["balanced" if weight else None]

    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap,
                   'class_weight': class_weight}

    # Base model to tune; its n_estimators is overridden by the values in the grid
    rf = RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=42)

    # Random search over the grid: n_iter sampled combinations, evaluated with
    # the cross-validation given by cv, using all available cores
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                   n_iter=n_iter, cv=cv,
                                   verbose=1, random_state=42, n_jobs=-1)

    # Train the model on training data
    trained_model = rf_random.fit(features, target)

    return trained_model
    

# Testing

In [10]:
def testing (trained_model, testset):
    # Apply the Classifier we trained to the test data (which, remember, it has never seen before)
    predictions = trained_model.predict(testset)
    
    return predictions