## 1. Import lib

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

## 2. Load processed data
(after **step 3** of **asm_1.ipynb**)

In [16]:
X_train = pd.read_csv('train_processed.csv')
X_test_processed = pd.read_csv('test_processed.csv')
y_train = pd.read_csv('y_train.csv').values
y_test = pd.read_csv('y_test.csv').values



In [17]:
train_processed.head(2)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,6,4,0,2,2,5,7,8,3,5,...,0,0,1,0,0,0,0,0,0,0
1,9,7,2,3,1,2,1,4,5,6,...,0,0,0,1,0,0,0,0,0,0


In [18]:
test_processed.head(2)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,7,7,0,2,4,2,3,4,3,6,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,2,7,6,6,6,5,...,0,0,1,0,0,0,0,0,0,0


## 3. Features Selection

### 3.1 Lựa chọn đặc trưng bằng Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

# Find best estimator of RandomForestClassifier first
parameters = {'n_estimators': [10, 20, 50, 100]}
randomForestModel = RandomForestClassifier(random_state=10)
grid = GridSearchCV(randomForestModel, parameters, cv=5, scoring='accuracy')
grid.fit(train_processed, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=10),
             param_grid={'n_estimators': [10, 20, 50, 100]},
             scoring='accuracy')

In [20]:
grid.best_params_

{'n_estimators': 100}

In [21]:
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=10))
sel_.fit(train_processed, y_train)
selected_feat_random_forest = train_processed.columns[(sel_.get_support())]
selected_feat_random_forest

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'RainToday_0', 'RainToday_1'],
      dtype='object')

### 3.2 Lựa chọn đặc trưng bằng ROC-AUC

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

roc_values = []

for feature in train_processed.columns:

    # train a decision tree classifier
    clf = DecisionTreeClassifier()
    clf.fit(train_processed[feature].values.reshape(-1, 1), y_train)

    # obtain the predictions
    y_scored = clf.predict_proba(test_processed[feature].values.reshape(-1, 1))

    # calculate and store the roc-auc
    roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))

roc_values = pd.Series(roc_values)
roc_values.index = train_processed.columns
selected_feat_roc = roc_values[roc_values > 0.51].index
selected_feat_roc

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'RainToday_0', 'RainToday_1', 'WindGustDir_E',
       'WindGustDir_ENE', 'WindGustDir_ESE', 'WindGustDir_N', 'WindGustDir_W',
       'WindDir9am_E', 'WindDir9am_ESE', 'WindDir9am_N', 'WindDir9am_NNW',
       'WindDir9am_NW', 'WindDir9am_SE'],
      dtype='object')

### 3.3 Lựa chọn đặc trưng bằng đệ quy

#### Loại bỏ đặc trưng bằng đệ quy

In [23]:
# from feature_engine.selection import RecursiveFeatureElimination

# model = RandomForestClassifier(n_estimators=100, random_state=10)
# sel_ = RecursiveFeatureElimination(
#     variables=None, 
#     estimator = model, 
#     scoring = 'roc_auc', # the metric we want to evalute
#     threshold = 0.001, # the maximum performance drop allowed to remove a feature
#     cv=3, # cross-validation
# )
# sel_.fit(train_processed, y_train)

# train_recursive_ellimination_selected = sel_.transform(train_processed)
# test_recursive_ellimination_selected = sel_.transform(test_processed)


#### Thêm đặc trưng bằng đệ quy

In [24]:
# from feature_engine.selection import RecursiveFeatureAddition

# model =  RandomForestClassifier(n_estimators=100, random_state=10)
# sel_ = RecursiveFeatureAddition(
#     variables=None,
#     estimator=model,
#     scoring='roc_auc',
#     threshold=0.001,
#     cv=3,)
# sel_.fit(train_processed, y_train)
# train_recursive_addition_selected = sel_.transform(train_processed)
# test_recursive_addition_selected = sel_.transform(test_processed)


## 4. Huấn luyện mô hình

In [26]:
from sklearn.metrics import f1_score
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
# Split test set to test set and validation set
x_valid, x_test, y_valid, y_test = train_test_split(test_processed, y_test, test_size=0.5, random_state=0)

In [27]:
def evaluate_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    print('='*40)

    print('Train:')
    y_pred = model.predict(x_train)
    print('F1 score:', f1_score(y_train, y_pred))
    print('Validation:')
    y_pred = model.predict(x_test)
    print('F1 score:', f1_score(y_test, y_pred))

    print('='*40)
    


#### LogisticRegression

In [28]:
model = LogisticRegression()
print('All features:')
evaluate_model(model, train_processed, y_train, x_valid, y_valid)
print('Select features by ROC-AUC:')
evaluate_model(model, train_processed[selected_feat_roc], y_train, x_valid[selected_feat_roc], y_valid)
print('Select features by RandomForest:')
evaluate_model(model, train_processed[selected_feat_random_forest], y_train, x_valid[selected_feat_random_forest], y_valid)

print('Recursive feature elimination:')
print('Recursive feature addition:')


All features:
Train:
F1 score: 0.5878566337908361
Validation:
F1 score: 0.5898617511520738
Select features by ROC-AUC:
Train:
F1 score: 0.5711450593067798
Validation:
F1 score: 0.576980014803849
Select features by RandomForest:
Train:
F1 score: 0.5710985863438973
Validation:
F1 score: 0.5747041420118343
Recursive feature elimination:
Recursive feature addition:


#### DecisionTreeClassifier

In [29]:
model = tree.DecisionTreeClassifier()
print('All features:')
evaluate_model(model, train_processed, y_train, x_valid, y_valid)
print('Select features by ROC-AUC:')
evaluate_model(model, train_processed[selected_feat_roc], y_train, x_valid[selected_feat_roc], y_valid)
print('Select features by RandomForest:')
evaluate_model(model, train_processed[selected_feat_random_forest], y_train, x_valid[selected_feat_random_forest], y_valid)


All features:
Train:
F1 score: 0.9993330456275255
Validation:
F1 score: 0.5300187617260788
Select features by ROC-AUC:
Train:
F1 score: 0.9988226059654631
Validation:
F1 score: 0.5203376822716806
Select features by RandomForest:
Train:
F1 score: 0.9977422204770787
Validation:
F1 score: 0.5240576156910819


#### AdaBoostClassifier

In [30]:
model = AdaBoostClassifier()
print('All features:')
evaluate_model(model, train_processed, y_train, x_valid, y_valid)
print('Select features by ROC-AUC:')
evaluate_model(model, train_processed[selected_feat_roc], y_train, x_valid[selected_feat_roc], y_valid)
print('Select features by RandomForest:')
evaluate_model(model, train_processed[selected_feat_random_forest], y_train, x_valid[selected_feat_random_forest], y_valid)


All features:
Train:
F1 score: 0.5785158421345191
Validation:
F1 score: 0.5881914695474018
Select features by ROC-AUC:
Train:
F1 score: 0.5802828618968386
Validation:
F1 score: 0.5849934835226215
Select features by RandomForest:
Train:
F1 score: 0.5800356588788294
Validation:
F1 score: 0.5821508588498879


#### RandomForestClassifier

In [31]:
model = RandomForestClassifier(n_estimators=100, random_state=10)
print('All features:')
evaluate_model(model, train_processed, y_train, x_valid, y_valid)
print('Select features by ROC-AUC:')
evaluate_model(model, train_processed[selected_feat_roc], y_train, x_valid[selected_feat_roc], y_valid)
print('Select features by RandomForest:')
evaluate_model(model, train_processed[selected_feat_random_forest], y_train, x_valid[selected_feat_random_forest], y_valid)


All features:
Train:
F1 score: 0.999313496655748
Validation:
F1 score: 0.6086956521739131
Select features by ROC-AUC:
Train:
F1 score: 0.998803616608155
Validation:
F1 score: 0.6084008287813147
Select features by RandomForest:
Train:
F1 score: 0.9977047572339381
Validation:
F1 score: 0.608419083255379


#### MLPClassifier

In [32]:
model = model = MLPClassifier(hidden_layer_sizes=(100, 50, ), max_iter=300)
print('All features:')
evaluate_model(model, train_processed, y_train, x_valid, y_valid)
print('Select features by ROC-AUC:')
evaluate_model(model, train_processed[selected_feat_roc], y_train, x_valid[selected_feat_roc], y_valid)
print('Select features by RandomForest:')
evaluate_model(model, train_processed[selected_feat_random_forest], y_train, x_valid[selected_feat_random_forest], y_valid)

All features:
Train:
F1 score: 0.8577772345799723
Validation:
F1 score: 0.5858585858585859
Select features by ROC-AUC:
Train:
F1 score: 0.6973326802333348
Validation:
F1 score: 0.5885928839620955
Select features by RandomForest:
Train:
F1 score: 0.6686093792355652
Validation:
F1 score: 0.5943600867678959
