In [None]:
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVR

try:
    # Preferred location; this notebook already relies on sklearn.model_selection.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Fallback for old scikit-learn releases that only ship the legacy module.
    from sklearn.grid_search import GridSearchCV

In [None]:
def get_files(top_quant, mh_quant, ml_quant, low_quant):
    """Load the four CSV tables stored for one combination of quant values.

    The directory is ``data/cats_<top>_<mh>_<ml>_<low>`` (relative to the
    current working directory), built from the four arguments in order.

    Returns
    -------
    tuple
        ``(file_path, X, y, X_final, ids)`` where ``file_path`` is the
        directory string and the remaining items are the DataFrames read
        from ``train_table.csv``, ``target.csv``, ``tournament_table.csv``
        and ``ids.csv`` respectively.
    """
    file_path = 'data/cats_{}_{}_{}_{}'.format(top_quant, mh_quant, ml_quant, low_quant)

    # Read in a fixed order so the unpacking below lines up with the file names.
    templates = (
        '{}/train_table.csv',
        '{}/target.csv',
        '{}/tournament_table.csv',
        '{}/ids.csv',
    )
    X, y, X_final, ids = [pd.read_csv(template.format(file_path)) for template in templates]

    return file_path, X, y, X_final, ids

In [None]:
def pick_best_model_parameters(model, parameters, X_train, y_train):
    """Grid-search ``parameters`` for ``model`` and return the winning estimator.

    Runs ``GridSearchCV`` with 4 folds on all available cores (``n_jobs=-1``),
    prints the best parameter combination, and returns the search's
    ``best_estimator_``.
    """
    search = GridSearchCV(
        model,
        parameters,
        cv=4,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    best_params = search.best_params_
    print(best_params)

    return search.best_estimator_

In [None]:
def run_models(X, y, models):
    """Tune every model in ``models`` on a train split and keep a test split aside.

    Parameters
    ----------
    X, y
        Feature table and target passed straight to ``train_test_split``.
    models : dict
        Maps a model name to an ``(estimator, parameter_grid)`` pair.

    Returns
    -------
    tuple
        ``(fitted, X_test, y_test)`` where ``fitted`` is a list of
        ``(name, best_estimator)`` pairs in the iteration order of ``models``
        and ``X_test`` / ``y_test`` are the held-out 25% of the data.
    """
    # Fixed random_state keeps the split identical between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

    fitted = []
    for name, (estimator, parameters) in models.items():
        print('\nTraining Model', name)
        best_estimator = pick_best_model_parameters(estimator, parameters, X_train, y_train)
        fitted.append((name, best_estimator))

    return fitted, X_test, y_test

In [None]:
def get_results(l, X_test, y_test, X_final):
    """Score each fitted model on the test split and predict on ``X_final``.

    Parameters
    ----------
    l : list
        ``(name, fitted_estimator)`` pairs.
    X_test, y_test
        Held-out data used for mean squared error and log loss.
    X_final
        Table to generate the final predictions for.

    Returns
    -------
    tuple
        ``(final_preds, model_metrics_df)``: a dict mapping each model name
        to a one-column ``probability`` DataFrame of predictions on
        ``X_final``, and a DataFrame with one column per model whose two rows
        are mean squared error and log loss, in that order.
    """
    final_preds, model_metrics = {}, {}

    for name, estimator in l:
        test_pred = estimator.predict(X_test)
        msq = mean_squared_error(y_test, test_pred)
        logloss = log_loss(y_test, test_pred)

        print('\nModel: {}\n'.format(name))
        print('Mean Squared Error: {}'.format(msq))
        print('Log Loss: {}'.format(logloss))

        final_preds[name] = pd.DataFrame(estimator.predict(X_final), columns=['probability'])
        model_metrics[name] = (msq, logloss)

    return final_preds, pd.DataFrame(model_metrics)

In [None]:
def predictions_csv(ids, final_preds, model_metrics_df, file_path):
    """Write each model's predictions and metrics as CSV files in ``file_path``.

    Parameters
    ----------
    ids : DataFrame
        Identifier table; joined to the predictions on the row index.
    final_preds : dict
        Maps a model name to its predictions DataFrame.
    model_metrics_df : DataFrame
        One column per model name holding that model's metrics.
    file_path : str
        Directory the CSV files are written to.

    With a single model the files are ``predictions.csv`` and
    ``model_metrics.csv``. With several models the model name is appended
    (``predictions_<model>.csv``, ``model_metrics_<model>.csv``) so that one
    model's output is never overwritten by the next.
    """
    needs_suffix = len(final_preds) > 1

    for model, preds in final_preds.items():
        suffix = '_{}'.format(model) if needs_suffix else ''

        # Index-on-index join: row i of ``ids`` is paired with prediction i.
        df = pd.merge(ids, preds, left_index=True, right_index=True)
        print(df.head())

        df.to_csv('{}/predictions{}.csv'.format(file_path, suffix), index=False)
        model_metrics_df[model].to_csv(
            '{}/model_metrics{}.csv'.format(file_path, suffix), index=False)

In [None]:
def main(top_quant, mh_quant, ml_quant, low_quant, models):
    """Run the whole pipeline: load data, tune models, score them, write CSVs.

    The four quant values select the input directory (see ``get_files``);
    ``models`` maps a model name to an ``(estimator, parameter_grid)`` pair.
    Output files are written into the same directory the inputs came from.
    """
    # 1. Load the tables for this quant combination.
    file_path, X, y, X_final, ids = get_files(top_quant, mh_quant, ml_quant, low_quant)

    # 2. Tune each model on a train split; keep the test split for scoring.
    tuned_models, X_test, y_test = run_models(X, y, models)

    # 3. Score on the test split and predict on the final table.
    final_preds, model_metrics_df = get_results(tuned_models, X_test, y_test, X_final)

    # 4. Persist predictions and metrics.
    predictions_csv(ids, final_preds, model_metrics_df, file_path)

In [None]:
# These four values select the input directory data/cats_<top>_<mh>_<ml>_<low>.
# Alternative set previously tried: 0.85, 0.65, 0.35, 0.15
top_quant = 0.8
mh_quant = 0.6
ml_quant = 0.4
low_quant = 0.2

In [None]:
# Hyper-parameter grids handed to GridSearchCV, one per candidate model.

# Random forest grid.
# ``None`` for max_features means "use every feature". It replaces the former
# "auto" entry, which had the same meaning for a forest regressor but is
# rejected by current scikit-learn releases.
forest_parameters = {'n_estimators': [200, 250, 300],
                     'max_features': [None, 20, 30],
                     "bootstrap": [True, False],
                     "min_samples_leaf": [1, 3],
                     'max_depth': [5, 10]}

# Support vector regression grid: only the C value is varied.
SVR_parameters = {'C': [0.5, 0.6, 0.7, 1.0]}

# Gradient boosting grid.
grad_parameters = {'n_estimators': [100, 150, 200],
                   'max_depth': [3, 10],
                   "max_features": [None, 1, 5],
                   "max_leaf_nodes": [None, 5],
                   "warm_start": [True, False]
                   }

In [None]:
# Registry of candidate models: name -> (estimator, parameter grid).
# Uncomment an entry to include that model in the run.
models = {}
models['RandomForestRegressor'] = (RandomForestRegressor(), forest_parameters)
# models['GradientBoostingRegressor'] = (GradientBoostingRegressor(), grad_parameters)
# models['SVR'] = (SVR(), SVR_parameters)

In [None]:
# Run the full pipeline for the configured quant values and models.
main(
    top_quant=top_quant,
    mh_quant=mh_quant,
    ml_quant=ml_quant,
    low_quant=low_quant,
    models=models,
)