# Binary classification

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the raw table, index it by "date" and discard the "id" and
# "partlybad" columns, which are not used as features.
npf = (
    pd.read_csv("npf_train.csv")
    .set_index("date")
    .drop(["id", "partlybad"], axis=1)
)

# Binary target: every class4 label other than "nonevent" is mapped to "event".
class2 = np.array(["nonevent", "event"])
npf["class2"] = np.where(npf["class4"] != "nonevent", class2[1], class2[0])

# Features are all remaining columns; both label columns are excluded.
X = npf.drop(columns=["class4", "class2"])
y = npf["class2"]

# Hold out 20% of the rows as a test set (shuffled, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42, test_size=0.2
)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Base estimator shared by the hyperparameter searches. random_state is pinned
# so that repeated runs grow identical forests; without it every
# "Restart & Run All" can report a different set of best parameters.
clf = RandomForestClassifier(random_state=42)

# Candidate values for the hyperparameters explored by the randomized search.
n_estimators = [10, 100, 200, 1000]
max_depth = [5, 10, 100, None]
min_samples_split = [4, 5, 6, 8]
min_samples_leaf = [2, 4, 6]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# RandomizedSearchCV evaluates only n_iter (default 10) randomly drawn
# combinations out of the 4*4*4*3*2 = 384 in random_grid, scoring each with
# 3-fold cross-validation. Seeding the search fixes which combinations are drawn.
rscv = RandomizedSearchCV(clf, random_grid, cv=3, random_state=42)
rscv.fit(X_train, y_train)

print('Best hyperparameters found using RandomizedSearchCV:')
print(rscv.best_params_)


Best hyperparameters found using RandomizedSearchCV:
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_depth': 5, 'bootstrap': False}


In [24]:
from matplotlib import pyplot as plt
# Exhaustive 3-fold search over a small, hand-written grid: only max_depth
# actually varies (5 vs. unlimited); the other three settings are fixed.
# NOTE(review): the grid is hard-coded rather than derived from
# rscv.best_params_ and does not include `bootstrap` - confirm this is intended.
param_grid = dict(
    n_estimators=[100],
    max_depth=[5, None],
    min_samples_split=[6],
    min_samples_leaf=[6],
)
gscv = GridSearchCV(clf, param_grid, cv=3).fit(X_train, y_train)

print('Best hyperparameters found using GridSearchCV:', gscv.best_params_, sep='\n')

# Keep a handle on the winning estimator. With GridSearchCV's default
# refit=True this object has already been refit on X_train/y_train, so no
# separate training step happens here.
best_clf = gscv.best_estimator_

Best hyperparameters found using GridSearchCV:
{'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 6, 'n_estimators': 100}


In [27]:
from sklearn.feature_selection import SelectFromModel

# Fit a feature selector driven by the tuned forest (SelectFromModel's default
# threshold is used, since none is passed).
sfm = SelectFromModel(best_clf).fit(X_train, y_train)

# Positional indices of the columns the selector kept; apply the same column
# subset to both the training and the test frame.
columns = sfm.get_support(indices=True)
X_train_selected, X_test_selected = (
    frame.iloc[:, columns] for frame in (X_train, X_test)
)

X_train_selected

Unnamed: 0_level_0,CO2504.std,Glob.mean,H2O168.mean,H2O336.mean,H2O42.mean,H2O504.mean,H2O672.mean,H2O84.mean,NET.mean,PAR.mean,...,RHIRGA504.mean,RHIRGA504.std,RHIRGA672.mean,RHIRGA84.mean,RHIRGA84.std,SWS.mean,T42.std,T672.mean,UV_A.mean,CS.mean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-10-05,2.564315,165.102024,7.067619,7.035619,7.126762,7.014857,6.991048,7.106038,72.904003,337.882296,...,67.620571,11.502649,68.481524,66.927547,11.534374,879.659091,1.854969,7.610589,8.785038,0.003912
2004-05-02,2.188370,366.951684,5.070549,5.003758,5.167927,4.961212,4.913030,5.132927,244.310789,716.288590,...,36.255030,10.055706,36.043091,36.599878,12.713643,929.390625,4.079611,11.784124,19.512434,0.004426
2004-03-14,0.367730,96.643700,4.037863,4.020256,4.051282,4.016466,4.012241,4.039316,58.942343,174.445529,...,81.200948,7.933199,82.010172,79.142479,8.416402,924.744681,1.990519,-2.957100,5.593529,0.007268
2003-06-15,3.014848,273.396553,7.127041,6.902500,7.460155,6.793724,6.694490,7.291546,197.334420,542.335307,...,56.794031,12.730337,57.073316,61.583763,19.401909,922.410256,2.689364,9.409609,16.329454,0.001501
2011-08-22,5.506259,180.454576,13.616242,13.554527,13.691429,13.521486,13.487838,13.667114,136.129920,373.246467,...,77.703108,6.717325,78.934324,75.097114,7.115798,883.048387,1.194776,15.049728,11.665070,0.004782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004-04-05,2.624566,280.297093,2.936159,2.911304,2.991159,2.880435,2.859420,2.942446,176.848287,544.494789,...,30.950942,7.075854,30.741594,32.414892,9.788664,931.800000,3.600268,6.069047,15.122181,0.012347
2007-09-05,4.085051,229.782265,8.107353,7.999781,8.339191,7.962353,7.915147,8.236788,168.240949,461.919106,...,66.584044,15.434629,66.979118,66.548029,14.257865,921.410714,2.250915,9.785852,13.057172,0.001954
2009-08-27,6.033280,171.209167,13.785315,13.669510,13.992727,13.605315,13.562028,13.883357,86.448825,362.326137,...,78.801888,7.480318,79.850140,77.409301,7.409113,898.733333,1.406986,14.858750,10.752474,0.003024
2010-12-29,0.299422,9.262508,2.320000,2.310208,2.335000,2.306042,2.289792,2.327551,33.088245,21.253668,...,101.992500,0.463385,103.526458,98.427755,0.555864,912.523810,0.283294,-13.125831,0.898580,0.002181


In [40]:
# Baseline: a forest with default hyperparameters, trained on the selected
# features only. random_state is pinned so the comparison below is repeatable
# instead of changing with every execution of this cell.
default_clf = RandomForestClassifier(random_state=42)
default_clf.fit(X_train_selected, y_train)

# Refit the tuned model on the same reduced feature set, with the same seed as
# the baseline. NOTE: best_clf is the object taken from gscv.best_estimator_,
# so this refits the grid search's estimator in place.
best_clf.set_params(random_state=42)
best_clf.fit(X_train_selected, y_train)

print('Default accuracy:', default_clf.score(X_test_selected, y_test))
print('Best model accuracy:', best_clf.score(X_test_selected, y_test))

# In the recorded run the default model scored higher than the tuned model.
# Possible reasons (TODO confirm, e.g. with cross-validation): the
# hyperparameters were tuned on all features rather than on this reduced
# feature set, and a single hold-out split is a noisy estimate, so the gap may
# amount to only a few test samples.
# Accuracy around 90%

Default accuracy: 0.9139784946236559
Best model accuracy: 0.8817204301075269


# Multiclass classification

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Reload the raw table for the multiclass task: index by "date" and discard
# the "id" and "partlybad" columns.
npf = (
    pd.read_csv("npf_train.csv")
    .set_index("date")
    .drop(["id", "partlybad"], axis=1)
)

# The four-way label "class4" is the target; everything else is a feature.
X = npf.drop(columns=["class4"])
y = npf["class4"]

# Hold out 20% of the rows as a test set (shuffled, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42, test_size=0.2
)

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Base estimator shared by the hyperparameter searches. random_state is pinned
# so that repeated runs grow identical forests; without it every
# "Restart & Run All" can report a different set of best parameters.
clf = RandomForestClassifier(random_state=42)

# Candidate values for the hyperparameters explored by the randomized search.
n_estimators = [10, 100, 200, 1000]
max_depth = [5, 10, 100, None]
min_samples_split = [4, 5, 6, 8]
min_samples_leaf = [2, 4, 6]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# RandomizedSearchCV evaluates only n_iter (default 10) randomly drawn
# combinations out of the 4*4*4*3*2 = 384 in random_grid, scoring each with
# 3-fold cross-validation. Seeding the search fixes which combinations are drawn.
rscv = RandomizedSearchCV(clf, random_grid, cv=3, random_state=42)
rscv.fit(X_train, y_train)

print('Best hyperparameters found using RandomizedSearchCV:')
print(rscv.best_params_)

Best hyperparameters found using RandomizedSearchCV:
{'n_estimators': 200, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_depth': 10, 'bootstrap': True}


In [43]:
from matplotlib import pyplot as plt
# Exhaustive 3-fold search over a small, hand-written grid: only max_depth
# actually varies (5 vs. unlimited); the other three settings are fixed.
# NOTE(review): the grid is hard-coded rather than derived from
# rscv.best_params_ and does not include `bootstrap` - confirm this is intended.
param_grid = dict(
    n_estimators=[100],
    max_depth=[5, None],
    min_samples_split=[6],
    min_samples_leaf=[6],
)
gscv = GridSearchCV(clf, param_grid, cv=3).fit(X_train, y_train)

print('Best hyperparameters found using GridSearchCV:', gscv.best_params_, sep='\n')

# Keep a handle on the winning estimator. With GridSearchCV's default
# refit=True this object has already been refit on X_train/y_train, so no
# separate training step happens here.
best_clf = gscv.best_estimator_

Best hyperparameters found using GridSearchCV:
{'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 6, 'n_estimators': 100}


In [44]:
from sklearn.feature_selection import SelectFromModel

# Fit a feature selector driven by the tuned forest (SelectFromModel's default
# threshold is used, since none is passed).
sfm = SelectFromModel(best_clf).fit(X_train, y_train)

# Positional indices of the columns the selector kept; apply the same column
# subset to both the training and the test frame.
columns = sfm.get_support(indices=True)
X_train_selected, X_test_selected = (
    frame.iloc[:, columns] for frame in (X_train, X_test)
)

X_train_selected

Unnamed: 0_level_0,CO2504.std,Glob.mean,H2O336.mean,H2O42.mean,H2O504.mean,H2O672.mean,H2O84.mean,NET.mean,O342.mean,PAR.mean,...,RPAR.std,SWS.mean,T42.mean,T42.std,T504.std,T672.mean,T84.mean,UV_A.mean,UV_A.std,CS.mean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-10-05,2.564315,165.102024,7.035619,7.126762,7.014857,6.991048,7.106038,72.904003,29.121171,337.882296,...,10.769167,879.659091,8.236133,1.854969,1.723337,7.610589,8.221556,8.785038,5.339467,0.003912
2004-05-02,2.188370,366.951684,5.003758,5.167927,4.961212,4.913030,5.132927,244.310789,53.551463,716.288590,...,17.253504,929.390625,12.509656,4.079611,3.436554,11.784124,12.514843,19.512434,13.151439,0.004426
2004-03-14,0.367730,96.643700,4.020256,4.051282,4.016466,4.012241,4.039316,58.942343,42.369060,174.445529,...,10.605789,924.744681,-2.347086,1.990519,1.846654,-2.957100,-2.377429,5.593529,3.229373,0.007268
2003-06-15,3.014848,273.396553,6.902500,7.460155,6.793724,6.694490,7.291546,197.334420,33.090816,542.335307,...,16.035329,922.410256,9.843112,2.689364,1.873557,9.409609,9.837474,16.329454,12.042879,0.001501
2011-08-22,5.506259,180.454576,13.554527,13.691429,13.521486,13.487838,13.667114,136.129920,26.861634,373.246467,...,12.242396,883.048387,16.073609,1.194776,0.981140,15.049728,16.076217,11.665070,10.062797,0.004782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004-04-05,2.624566,280.297093,2.911304,2.991159,2.880435,2.859420,2.942446,176.848287,54.912101,544.494789,...,28.073285,931.800000,5.764885,3.600268,2.798021,6.069047,6.027117,15.122181,9.693823,0.012347
2007-09-05,4.085051,229.782265,7.999781,8.339191,7.962353,7.915147,8.236788,168.240949,23.951571,461.919106,...,14.553219,921.410714,10.482300,2.250915,2.262907,9.785852,10.462408,13.057172,10.041741,0.001954
2009-08-27,6.033280,171.209167,13.669510,13.992727,13.605315,13.562028,13.883357,86.448825,27.845743,362.326137,...,9.051773,898.733333,15.715405,1.406986,1.316487,14.858750,15.720304,10.752474,8.061537,0.003024
2010-12-29,0.299422,9.262508,2.310208,2.335000,2.306042,2.289792,2.327551,33.088245,25.771887,21.253668,...,2.181447,912.523810,-12.237774,0.283294,0.296621,-13.125831,-12.300815,0.898580,0.392891,0.002181


In [49]:
# Baseline: a forest with default hyperparameters, trained on the selected
# features only. random_state is pinned so the comparison below is repeatable
# instead of changing with every execution of this cell.
default_clf = RandomForestClassifier(random_state=42)
default_clf.fit(X_train_selected, y_train)

# Refit the tuned model on the same reduced feature set, with the same seed as
# the baseline. NOTE: best_clf is the object taken from gscv.best_estimator_,
# so this refits the grid search's estimator in place.
best_clf.set_params(random_state=42)
best_clf.fit(X_train_selected, y_train)

print('Default accuracy:', default_clf.score(X_test_selected, y_test))
print('Best model accuracy:', best_clf.score(X_test_selected, y_test))

# In the recorded run the tuned model scored marginally higher than the default
# model. The gap is small enough that it may be within the noise of a single
# hold-out split (TODO confirm, e.g. with cross-validation), so neither model
# is clearly better here.
# Accuracy around 70%-75%

Default accuracy: 0.7204301075268817
Best model accuracy: 0.7311827956989247
