In [23]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV


# data preprocessing


train = pd.read_csv("train_final.csv")
test = pd.read_csv("test_final.csv")

y= train['Y']
features = ['f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11','f12','f13','f14','f15','f16','f17','f18','f19','f20','f21','f22','f23','f24']
Xtrain = train[features]
ID = train['Id']

Xtest = test[features]
ID_test = test['Id']

# FIRST ATTEMPT
# # # Instantiate model with 1000 decision trees
# rf = RandomForestRegressor(n_estimators = 300, random_state = 42)
# # Train the model on training data
# rf.fit(Xtrain, y);
# y_pred= rf.predict(Xtest)


In [25]:
#         RANDOMIZED SEARCH 

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 

# Number of trees in random forest
n_estimators = [200, 1000, 2000]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [10, 20, 50, 100]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(Xtrain,y)
%timeit
rf_random.best_params_

{'n_estimators': [200, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 50, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.4min finished


{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': False}

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [40, 50, 60, 110],
    'max_features': ['auto'],
    'min_samples_leaf': [1,2,3],
    'min_samples_split': [2,3,5],
    'n_estimators': [1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [34]:
grid_search.fit(Xtrain,y)
grid_search.best_params_

best_grid = grid_search.best_estimator_
# grid_accuracy = evaluate(best_grid, Xtest, y)
# print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.5min


KeyboardInterrupt: 

In [None]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(Xtrain, y)


In [None]:
best_grid

In [31]:
y_pred = best_grid.predict(Xtest)
pred = pd.DataFrame({"Id": ID_test, "Y": y_pred})
pred.to_csv("Prediction.csv", index = False)
solution = pd.DataFrame({"Id": ID_test, "Y": y_pred})
solution.to_csv("Forest.csv", index = False)
solution.head()

Unnamed: 0,Id,Y
0,16384,1
1,16385,1
2,16386,1
3,16387,1
4,16388,1
