In [1]:
import numpy as np
import pandas as pd

import warnings
# NOTE(review): an unqualified 'ignore' filter suppresses every warning raised
# through the warnings module for the rest of the session (deprecations,
# numerical overflow, convergence notices, ...). Consider narrowing it.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

%config InlineBackend.figure_format = 'svg'

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.graph_objs as go

init_notebook_mode(connected=True)

In [2]:
# Load the gzip-compressed CSV dataset into a DataFrame (path is relative to the notebook).
df = pd.read_csv('../data/gbm-data.csv.gz', compression='gzip')

In [3]:
# Target vector: the 'Activity' column.
y = df['Activity']

In [9]:
# Feature matrix: every column except the first one.
# NOTE(review): this assumes 'Activity' is column 0; if it is not, the target
# leaks into X. TODO confirm against the CSV header.
X = df.iloc[:, 1:]

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [80]:
def _staged_log_loss(clsf, X, y):
    """Return the log-loss of ``clsf`` on ``(X, y)`` after every boosting stage."""
    losses = []
    for scores in clsf.staged_decision_function(X):
        # Squash the raw decision scores into the unit interval with the
        # logistic sigmoid, applied to the whole array at once, so they can be
        # scored by log_loss.
        probs = 1.0 / (1.0 + np.exp(-scores))
        # ravel() copes with both (n_samples,) and (n_samples, 1) score shapes.
        losses.append(log_loss(y, probs.ravel()))
    return losses


def learn_gradient_boosting(X, y, learning_rate: float, verbose: bool=False, get_max_quality: bool=False):
    """Fit a gradient boosting classifier and plot its staged train/test log-loss.

    The data is split with ``train_test_split(test_size=0.8, random_state=241)``,
    a ``GradientBoostingClassifier`` with 250 estimators is fitted on the train
    part, and the log-loss after each boosting stage is computed for both
    parts. The two curves are rendered inline with plotly.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : target labels aligned with ``X``.
    learning_rate : learning rate passed to ``GradientBoostingClassifier``.
    verbose : forwarded to the classifier's ``verbose`` argument.
    get_max_quality : when true, additionally print the
        ``(stage, log_loss)`` pair with the lowest test log-loss; ``stage`` is
        1-based.

    Returns
    -------
    None. The function only plots and (optionally) prints.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

    clsf = GradientBoostingClassifier(random_state=241, verbose=verbose, n_estimators=250, learning_rate=learning_rate)
    clsf.fit(X_train, y_train)

    train_loss = _staged_log_loss(clsf, X_train, y_train)
    test_loss = _staged_log_loss(clsf, X_test, y_test)

    # Stage numbers start at 1 so the x axis matches the printed minimum below.
    x_plot = list(range(1, len(test_loss) + 1))

    trace1 = go.Scatter(x=x_plot, y=train_loss, name='Train prediction')
    trace2 = go.Scatter(x=x_plot, y=test_loss, name='Test prediction')

    layout = {'title': f'Quality on train in test pred with learning_rate={learning_rate}'}

    fig = go.Figure(data=[trace1, trace2], layout=layout)
    iplot(fig, show_link=False)

    if get_max_quality:
        # Lower log-loss is better, so the best quality is the minimum.
        minimum = min(enumerate(test_loss, 1), key=lambda it: it[1])

        print(f'The greatest quality is {minimum}')

In [83]:
# Train with learning_rate=0.2, plot the staged train/test log-loss curves and
# print the (stage, log_loss) pair with the lowest test log-loss.
learn_gradient_boosting(X, y, 0.2, get_max_quality=True)

The greatest quality is (37, 0.5314507963190638)


In [85]:
# Interpretation of the curve above; no interpolation is needed, so a plain string is used.
print("Seems like it's overfitting here, since quality on the test data decreasing after step=37")

Seems like it's overfitting here, since quality on the test data decreasing after step=37


In [86]:
def learn_random_forest(X, y, N_estimators: int):
    """Fit a random forest and print its log-loss on the held-out part.

    The data is split with ``train_test_split(test_size=0.8, random_state=241)``.
    A ``RandomForestClassifier`` with ``N_estimators`` trees
    (``random_state=241``, ``n_jobs=-1``) is fitted on the train part and the
    log-loss of its ``predict_proba`` output on the test part is printed.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : target labels aligned with ``X``.
    N_estimators : number of trees in the forest.

    Returns
    -------
    None. The result is only printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.8, random_state=241
    )

    # fit() returns the estimator itself, so construction and training chain.
    forest = RandomForestClassifier(
        random_state=241,
        n_estimators=N_estimators,
        n_jobs=-1,
    ).fit(X_train, y_train)

    res = log_loss(y_test, forest.predict_proba(X_test))

    print(f'The quality with RandomForestClassifier with n_estimators={N_estimators} is {res}')

In [87]:
# Use 37 trees: the stage at which gradient boosting reported its lowest test
# log-loss in the run above.
learn_random_forest(X, y, 37)

The quality with RandomForestClassifier with n_estimators=37 is 0.5410941951182815
