In [1]:
# test
# %run './base.ipynb'

## Preprocessing
- impute (various kinds)/dummy values/drop missing values
- outlier removal?
- Scale (standardize)/don't scale 
- feature selection (PCA, etc.)/none

## Models
- Linear Regression (Anna)
- SVR (Moritz)
- GradientBoostingRegressor (David)

## Params
- cv=10
- scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error']
- Select the best params by MSE, but also record r^2 for comparability

## AutoML
- 

## Datasets
- Bike sharing (Kaggle) ()
    - https://www.kaggle.com/c/184702-tu-ml-ws-18-bike-sharing#_=_
    - large samples (train = 8690), small dimension (15)
    - attribute characteristics: numeric, date?
- Student performance (Kaggle) (Moritz)
    - https://www.kaggle.com/c/184702-tu-ml-ws-18-student-performance
    - small samples (train = 198), medium dimension (32)
    - attribute characteristics: numeric, categorical 
- Blog feedback (David)
    - https://archive.ics.uci.edu/ml/datasets/BlogFeedback
    - very large samples (60021), large dimension (281)
    - attribute characteristics: numeric
- Forest fires (Anna)
    - https://archive.ics.uci.edu/ml/datasets/Forest+Fires
    - medium samples (513), small dimension (13) 
    - attribute characteristics: numeric
    
## Steps
- Imports for all Datasets
- functions for all Regressors

In [1]:
# Modules

# import modules
import time
from datetime import datetime
from time import strptime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# allows to output plots in the notebook
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [3]:
# Preprocessing

# scale data
def scale_data(train_data, test_data = None):
  """Standardize columns with a StandardScaler fitted on the training data only.

  The passed frames are modified in place (every column is overwritten with
  its scaled values) and are also returned.

  Parameters
  ----------
  train_data : pandas.DataFrame
      Training features; the scaler is fitted on this frame.
  test_data : pandas.DataFrame, optional
      Test features, transformed with the scaler fitted on ``train_data``.
      If omitted or empty, only the training data is scaled.

  Returns
  -------
  pandas.DataFrame or tuple
      ``train_data`` when no (or empty) test data is given, otherwise the
      tuple ``(train_data, test_data)``.
  """
  scaler = preprocessing.StandardScaler()

  # Fit on training set only.
  scaler.fit(train_data)

  # Keep the original index/columns on the scaled frame: assigning a frame
  # with a fresh RangeIndex would be aligned by label and produce NaN for
  # every row whose index is not 0..n-1 (e.g. after train_test_split).
  train_data[train_data.columns] = pd.DataFrame(
    scaler.transform(train_data[train_data.columns]),
    index = train_data.index, columns = train_data.columns)

  # `pd.DataFrame` (the class) was the historical default and is still
  # accepted as "no test data".
  if test_data is None or test_data is pd.DataFrame or test_data.empty:
    return (train_data)

  test_data[test_data.columns] = pd.DataFrame(
    scaler.transform(test_data[test_data.columns]),
    index = test_data.index, columns = test_data.columns)
  return (train_data, test_data)

# replace empty strings with nan
def fillspace_nan(data):
  """Return a copy of ``data`` in which empty strings are replaced by NaN.

  The replacement is applied column by column; only values equal to the
  empty string ``''`` are affected.
  """
  def _blank_to_nan(column):
    return column.replace('', np.nan)

  return data.apply(_blank_to_nan)

# strip whitespaces
def strip(data):
  """Strip leading/trailing whitespace from every string column of ``data``.

  Columns that do not support the pandas ``.str`` accessor (numeric,
  boolean, datetime, ...) are returned unchanged instead of raising, so the
  function can be applied to frames with mixed column types.

  Returns a new DataFrame; ``data`` itself is not modified.
  """
  def _strip_column(column):
    try:
      return column.str.strip()
    except AttributeError:
      # pandas raises AttributeError when `.str` is used on non-string data.
      return column

  return data.apply(_strip_column)

# one hot encoding
def one_hot(data, drop_first = True):
  """One-hot encode all object-dtype columns of ``data``.

  Parameters
  ----------
  data : pandas.DataFrame
      Input frame; columns of other dtypes are passed through untouched.
  drop_first : bool, default True
      Forwarded to ``pandas.get_dummies``: drop the first level of every
      encoded column.

  Returns
  -------
  pandas.DataFrame
      Frame with the object columns replaced by their dummy columns.
  """
  # get_dummies expects column labels, not the sub-frame itself.
  object_columns = list(data.select_dtypes(['object']).columns)
  return pd.get_dummies(data, columns = object_columns, drop_first = drop_first)

# PCA
def pca(train_data, test_data, n_comp):
  """Project train and test data onto principal components of the train data.

  A PCA with ``n_components = n_comp`` is fitted on ``train_data`` only and
  then applied to both inputs. Each projection is wrapped in a new
  DataFrame with default index and column labels.

  Returns the tuple ``(pca_train, pca_test)``.
  """
  reducer = PCA(n_components = n_comp).fit(train_data)
  projected_train, projected_test = (
    pd.DataFrame(reducer.transform(part)) for part in (train_data, test_data)
  )
  return (projected_train, projected_test)

In [6]:
# Linear Regression
def linear_reg(X_train, y_train, X_test, y_test, get_coef = True):
    """Fit an ordinary least-squares model and report its scores.

    Parameters
    ----------
    X_train, y_train
        Training features and target used to fit the model.
    X_test, y_test
        Held-out features and target used for the MSE.
    get_coef : bool, default True
        If true, the fitted coefficients are returned as well.

    Returns
    -------
    tuple of (dict, dict)
        ``scores``: ``'R2 Score: '`` maps to a one-element list holding the
        R^2 on the *training* data, ``'MSE: '`` to the mean squared error of
        the predictions on the test data.
        ``coef``: ``{'Coefficients: ': reg.coef_}`` when ``get_coef`` is
        true, otherwise an empty dict.
    """
    # Build model
    reg = LinearRegression().fit(X_train, y_train)
    train_r2 = reg.score(X_train, y_train)

    # Predict test data and compute MSE
    y_pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # Local names deliberately avoid `metrics` / `r2_score`, which would
    # shadow the sklearn module and function of the same name.
    scores = {'R2 Score: ': [train_r2], 'MSE: ' : mse}

    coef = {'Coefficients: ' : reg.coef_} if get_coef else {}
    return scores, coef

In [3]:
# SVR


In [4]:
# Gradient Boosting

def run_boosted_tree(train_data, train_target, test_data, test_target, param_fix, cv, param_grid):
    """Grid-search a GradientBoostingRegressor and return the fitted search.

    Parameters
    ----------
    train_data, train_target
        Features and target the grid search is fitted on.
    test_data, test_target
        Accepted for signature compatibility; not used by this function.
    param_fix : dict
        Keyword arguments passed unchanged to ``GradientBoostingRegressor``.
    cv
        Cross-validation setting forwarded to ``GridSearchCV``.
    param_grid
        Parameter grid forwarded to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object, scored with R^2, negative MAE and negative
        MSE and refitted on the candidate with the best negative MSE.
    """
    print("GridSearch initializing...")
    # return_train_score is requested explicitly so that cv_results_ always
    # contains the mean_train_* / std_train_* entries that plot_scores reads.
    clf = GridSearchCV(estimator = GradientBoostingRegressor(**param_fix), cv = cv, param_grid = param_grid,
                       scoring = ['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'],
                       refit = 'neg_mean_squared_error',
                       return_train_score = True)

    print("GradientBoostedRegressor model in training...")
    t0 = time.time()
    clf.fit(train_data, train_target)
    clf_fit = time.time() - t0
    print("GradientBoostedRegressor model selected and fitted in %.3f s\n" % clf_fit)

    best_params = clf.best_params_
    print("Best parameters selected by GridSearch: %s" % best_params)

    return clf

# Accepts a fitted GridSearchCV (its best estimator is used) or a fitted
# boosting estimator.
def plot_training_deviance(clf, X_test, y_test):
    """Plot training and test deviance per boosting iteration.

    Parameters
    ----------
    clf
        Fitted search object exposing ``best_estimator_`` or a fitted
        boosting estimator exposing ``staged_predict``, ``loss_`` and
        ``train_score_``.
    X_test, y_test
        Held-out features and target used for the test deviance curve.
    """
    # staged_predict / loss_ / train_score_ live on the boosting estimator,
    # not on the GridSearchCV wrapper, so unwrap it when present.
    model = getattr(clf, 'best_estimator_', clf)

    # Take the number of stages from the fitted model instead of a parameter
    # dict, so it is right even when n_estimators was not part of the grid.
    n_stages = len(model.train_score_)
    test_score = np.zeros((n_stages,), dtype=np.float64)

    # NOTE(review): newer scikit-learn releases deprecated/removed the public
    # `loss_` attribute - confirm against the installed version.
    for i, y_pred in enumerate(model.staged_predict(X_test)):
        test_score[i] = model.loss_(y_test, y_pred)

    iterations = np.arange(n_stages) + 1

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(iterations, model.train_score_, 'b-',
             label='Training Set Deviance')
    plt.plot(iterations, test_score, 'r-',
             label='Test Set Deviance')
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')

# Plot mean cross-validation scores (+/- one std) against n_estimators.
def plot_scores(results):
    """Plot R^2 and negative MSE from grid-search results over n_estimators.

    Parameters
    ----------
    results
        Mapping in the layout of ``GridSearchCV.cv_results_``. It must hold
        ``param_n_estimators`` plus ``mean_test_*``, ``std_test_*`` and
        ``rank_test_*`` for both scorers. ``mean_train_*`` / ``std_train_*``
        are plotted when present and skipped otherwise.

    Notes
    -----
    NOTE(review): the x axis is taken straight from ``param_n_estimators``;
    this presumably assumes n_estimators is the only varied parameter -
    with further grid parameters the same x value repeats. Confirm with the
    callers.
    """
    scoring = ['r2', 'neg_mean_squared_error']

    plt.figure(figsize=(13, 13))
    plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
          fontsize=16)

    plt.xlabel("n_estimators")
    plt.ylabel("Score")

    ax = plt.gca()

    # Get the regular numpy array from the MaskedArray
    X_axis = np.array(results['param_n_estimators'].data, dtype=float)

    # Derive the x range from the data (400 estimators gives the former
    # 0..402). The y range is left to autoscale: neg_mean_squared_error is
    # never positive, so a fixed (0, 1) window would hide those curves.
    ax.set_xlim(0, X_axis.max() + 2)

    for scorer, color in zip(sorted(scoring), ['g', 'k']):
        for sample, style in (('train', '--'), ('test', '-')):
            mean_key = 'mean_%s_%s' % (sample, scorer)
            std_key = 'std_%s_%s' % (sample, scorer)
            if mean_key not in results or std_key not in results:
                # Train scores only exist if the search recorded them.
                continue
            sample_score_mean = results[mean_key]
            sample_score_std = results[std_key]
            ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                            sample_score_mean + sample_score_std,
                            alpha=0.1 if sample == 'test' else 0, color=color)
            ax.plot(X_axis, sample_score_mean, style, color=color,
                    alpha=1 if sample == 'test' else 0.7,
                    label="%s (%s)" % (scorer, sample))

        best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
        best_score = results['mean_test_%s' % scorer][best_index]

        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score,
                    (X_axis[best_index], best_score + 0.005))

    plt.legend(loc="best")
    # The string 'off' is truthy and would switch the grid on; pass a real
    # boolean to disable it.
    plt.grid(False)
    plt.show()