In [2]:
import pickle
from pprint import PrettyPrinter
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

try:
    # Legacy module: deprecated in scikit-learn 0.18 and removed in 0.20.
    # Guarded so the notebook still imports cleanly on current releases.
    from sklearn import cross_validation
except ImportError:
    cross_validation = None
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, cross_val_score
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
# Shared pretty-printer; grid_classifier uses it to dump the GridSearchCV results.
pp = PrettyPrinter()

## Load

In [3]:
# Read the training set: `date` is parsed into datetimes and `fullVisitorId` is read as a
# string (presumably so the long numeric ids are not coerced to numbers).
# NOTE(review): the preview below shows an `Unnamed: 0` column - presumably a row index
# written by an earlier `to_csv`; confirm, and drop it (or pass `index_col=0`) before
# using the frame as model features.
df_train = pd.read_csv('../data/train_standard.csv', parse_dates=['date'], dtype={'fullVisitorId': 'str'})

In [4]:
# Preview the first rows to sanity-check the parsed columns and dtypes.
df_train.head()

Unnamed: 0,date,fullVisitorId,visitNumber,visitStartTime,device.browser,device.isMobile,geoNetwork.continent,totals.bounces,totals.hits,totals.pageviews,totals.transactionRevenue,trafficSource.isTrueDirect,referred_channelGrouping,social_channelGrouping,device.OS_chrome,classifier_predict_probas
0,2016-09-02,1131660440785968503,1,1,1,0,0,1.0,1,1.0,0.0,0,0,0,0,0.0
1,2016-09-02,377306020877927890,1,0,0,0,0,1.0,1,1.0,0.0,0,0,0,0,0.0
2,2016-09-02,3895546263509774583,1,0,1,0,0,1.0,1,1.0,0.0,0,0,0,0,0.0
3,2016-09-02,4763447161404445595,1,0,0,0,0,1.0,1,1.0,0.0,0,0,0,0,0.0
4,2016-09-02,27294437909732085,2,1,1,1,0,1.0,1,1.0,0.0,1,0,0,0,0.0


# Funcs

In [5]:
def grid_classifier(model, train, target, parameters=None, cv=4, timed=True, verbose=True, plot=False,
                    scoring='roc_auc', n_jobs=4):
    '''
    Tune a model with an exhaustive, cross-validated grid search.

        Parameters:
        -----------
    model - sklearn estimator to tune
    train - array of training data
    target - array of target data
    parameters - dict (or list of dicts) of parameters the grid search will run over;
                 None (the default) searches nothing and just cross-validates the
                 estimator as it was instantiated
    cv - int, k-fold number
    timed - boolean, gives printout of timings
    verbose - boolean, printout the cv results
    plot - boolean, makes a scatter plot of the mean test score of every candidate
    scoring - scorer passed to GridSearchCV. The default 'roc_auc' only makes sense
              for binary targets; pass e.g. 'neg_mean_squared_error' or 'r2' when
              tuning a regressor
    n_jobs - int, number of parallel jobs used by the search

        Returns:
        --------
    gsearcher - fitted and searched GridSearchCV object
    '''
    start = time()
    if timed:
        # Nothing has run yet, so this is ~0; kept so the printed log matches earlier runs.
        print(f'Seconds to start: {time() - start:.2f}')

    gsearcher = GridSearchCV(
        estimator=model,
        # A fresh dict per call instead of one mutable default shared by every search.
        param_grid={} if parameters is None else parameters,
        scoring=scoring,
        n_jobs=n_jobs,
        cv=cv)
    gsearcher.fit(train, target)
    results = gsearcher.cv_results_

    if timed:
        print(f'Seconds to fit: {time() - start:.2f}')
    if verbose:
        print('Grid Scores: \n')
        pp.pprint(results)
        print(f'Best Model Parameters: {gsearcher.best_params_}')
        print(f'Best Model Score: {gsearcher.best_score_:.4f}')

    if plot:
        mean_scores = results['mean_test_score']
        candidates = results['params']
        positions = list(range(len(mean_scores)))
        model_name = type(model).__name__

        # Every parameter name that appears in any candidate, in first-seen order.
        param_names = list(dict.fromkeys(name for cand in candidates for name in cand))
        if len(param_names) == 1:
            # Single searched parameter: tick labels are just its values.
            axis_label = param_names[0]
            tick_labels = [str(cand.get(axis_label)) for cand in candidates]
        else:
            # Zero or several parameters: spell out each candidate in full so the
            # plot neither crashes (empty grid) nor hides all but the first parameter.
            axis_label = ', '.join(param_names) or 'parameters'
            tick_labels = [
                ', '.join(f'{name}={value}' for name, value in cand.items()) or 'defaults'
                for cand in candidates
            ]
        score_label = 'AUC Scores' if scoring == 'roc_auc' else f'{scoring} Scores'

        plt.figure(figsize=(8, 6))
        plt.scatter(x=positions, y=mean_scores)
        plt.title(f'{model_name} over {axis_label}')
        plt.xlabel(axis_label)
        plt.ylabel(score_label)
        plt.xticks(positions, tick_labels, rotation=60)

    return gsearcher

In [6]:
def model_validate(model, train, target, cv=5, timed=True, verbose=True, feature_importance=False, feature_cols=None,
                   scoring='roc_auc'):
    '''
    For validating a model's score on a single set of parameters.

    The model is first fitted on the full training data (that fitted object is what
    gets returned) and then scored with k-fold cross-validation.

        Parameters:
        -----------
    model - sklearn model, parameters set on instantiation
    train - array of training data
    target - array of target data
    cv - int, k-fold number
    timed - boolean, gives printout of timings (cumulative seconds since the call started)
    verbose - boolean, printout the mean / std / min / max of the cv scores
    feature_importance - boolean, makes a bar plot of the feature importances; only works
                         for models exposing `feature_importances_` (e.g. tree ensembles)
    feature_cols - list of strings, column labels for the feature importance plot
    scoring - scorer passed to cross_val_score. The default 'roc_auc' only makes sense
              for binary targets; pass e.g. 'neg_mean_squared_error' or 'r2' when
              validating a regressor

        Returns:
        --------
    model - the fitted sklearn model

        Raises:
        -------
    AttributeError - if feature_importance is requested for a model without
                     `feature_importances_`
    '''
    start = time()
    if timed:
        # Nothing has run yet, so this is ~0; kept so the printed log matches earlier runs.
        print(f'Seconds to start: {time() - start:.2f}')

    # Fit on the full training set
    model.fit(train, target)
    if timed:
        print(f'Seconds to fit: {time() - start:.2f}')

    # CV - uses sklearn.model_selection; the old sklearn.cross_validation module
    # was removed in scikit-learn 0.20.
    cv_score = cross_val_score(model, train, target, cv=cv, scoring=scoring)
    if verbose:
        print(f'CV Score : Mean - {np.mean(cv_score):.3f} | Std - {np.std(cv_score):.3f} | Min - {np.min(cv_score):.3f} | Max - {np.max(cv_score):.3f}')
    if timed:
        print(f'Seconds to CV: {time() - start:.2f}')

    # Plots feature importance, for models exposing `feature_importances_` only
    if feature_importance:
        if not hasattr(model, 'feature_importances_'):
            raise AttributeError(
                f'{type(model).__name__} has no feature_importances_; '
                'feature_importance=True is only supported for models that expose it '
                '(e.g. tree ensembles)')
        feat_imp = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
        plt.xticks(rotation=60)

    return model

---
# Modeling

---
## BayesianRidge

---
## LinearRegression

---
## Ridge

---
## Lasso

---
## ElasticNet