## 1. Import lib

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

## 2. Load processed data
(after **step 3** of **asm_1.ipynb**)

In [37]:
from sklearn.model_selection import train_test_split

X_train = pd.read_csv('train_processed.csv')
y_train = pd.read_csv('y_train.csv').values

X_test_processed = pd.read_csv('test_processed.csv')
y_test_processed = pd.read_csv('y_test.csv').values

X_test, X_valid, y_test, y_valid = train_test_split(X_test_processed, y_test_processed, test_size=0.5, random_state=0)



## 3. Features Selection

### 3.1 Lựa chọn đặc trưng bằng Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

# Find best estimator of RandomForestClassifier first
parameters = {'n_estimators': [10, 20, 50, 100]}
# randomForestModel = RandomForestClassifier(random_state=10)
# grid = GridSearchCV(randomForestModel, parameters, cv=5)
# grid.fit(X_train, y_train)
# grid.best_params_

# {'n_estimators': 100}

In [39]:
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=10))
sel_.fit(X_train, y_train)

random_forest_X_train_set = sel_.transform(X_train)
random_forest_X_valid_set = sel_.transform(X_valid)
random_forest_X_test_set = sel_.transform(X_test)

### 3.2 Lựa chọn đặc trưng bằng ROC-AUC

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

roc_values = []

for feature in X_train.columns:

    # train a decision tree classifier
    clf = DecisionTreeClassifier()
    clf.fit(X_train[feature].values.reshape(-1, 1), y_train)

    # obtain the predictions
    y_scored = clf.predict_proba(X_valid[feature].values.reshape(-1, 1))

    # calculate and store the roc-auc
    roc_values.append(roc_auc_score(y_valid, y_scored[:, 1]))

roc_values = pd.Series(roc_values)
roc_values.index = X_train.columns
selected_feat_roc = roc_values[roc_values > 0.51].index
selected_feat_roc

roc_X_train_set = X_train[selected_feat_roc]
roc_X_valid_set = X_valid[selected_feat_roc]
roc_X_test_set = X_test[selected_feat_roc]

### 3.3 Lựa chọn đặc trưng bằng đệ quy

#### Loại bỏ đặc trưng bằng đệ quy

In [41]:
from feature_engine.selection import RecursiveFeatureElimination
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=10, max_depth=2 ,random_state=10)
sel_ = RecursiveFeatureElimination(
    variables=None, 
    estimator = model, 
    scoring = 'roc_auc', # the metric we want to evalute
    threshold = 0.0005, # the maximum performance drop allowed to remove a feature
    cv=2, # cross-validation
)
sel_.fit(X_train, y_train)

recursive_ellimination_selected_X_train_set = sel_.transform(X_train)
recursive_ellimination_selected_X_valid_set = sel_.transform(X_valid)
recursive_ellimination_selected_X_test_set = sel_.transform(X_test)


#### Thêm đặc trưng bằng đệ quy

In [42]:
from feature_engine.selection import RecursiveFeatureAddition

model = GradientBoostingClassifier(n_estimators=10, max_depth=2 ,random_state=10)
sel_ = RecursiveFeatureAddition(
    variables=None,
    estimator=model,
    scoring='roc_auc',
    threshold=0.0005,
    cv=3,)
sel_.fit(X_train, y_train)

recursive_addition_selected_X_train_set = sel_.transform(X_train)
recursive_addition_selected_X_valid_set = sel_.transform(X_valid)
recursive_addition_selected_X_test_set = sel_.transform(X_test)

## 4. Huấn luyện mô hình

In [43]:
from sklearn.metrics import f1_score
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [44]:
def print_analysis_result(model):
    
    print("Model: ", model)
    
    print()
    print("All features:")
    print('='*40)
    print("Train")
    model.fit(X_train, y_train)
    print("F1 score:", f1_score(y_train, model.predict(X_train)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(X_valid)))
    
    print()
    print("Select features with Random Forest:")
    print('='*40)
    print("Train")
    model.fit(random_forest_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(random_forest_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(random_forest_X_valid_set)))

    print()
    print("Select features with ROC-AUC:")
    print('='*40)
    print("Train")
    model.fit(roc_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(roc_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(roc_X_valid_set)))
    
    print()
    print("Select features with RecursiveFeatureElimination:")
    print('='*40)
    print("Train")
    model.fit(recursive_ellimination_selected_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(recursive_ellimination_selected_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(recursive_ellimination_selected_X_valid_set)))
    
    print()
    print("Select features with RecursiveFeatureAddition:")
    print('='*40)
    print("Train")
    model.fit(recursive_addition_selected_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(recursive_addition_selected_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(recursive_addition_selected_X_valid_set)))
    
    print('='*40)
    

#### LogisticRegression

In [45]:
model = LogisticRegression()
print_analysis_result(model)


Model:  LogisticRegression()

All features:
Train
F1 score: 0.5878566337908361
Validation
F1 score: 0.5989792198323004

Select features with Random Forest:
Train
F1 score: 0.5710985863438973
Validation
F1 score: 0.5872262773722629

Select features with ROC-AUC:
Train
F1 score: 0.5722067071341197
Validation
F1 score: 0.5885141294439381

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.5604469480956025
Validation
F1 score: 0.5777206826940723

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.5582065280520385
Validation
F1 score: 0.5746914717259163


#### DecisionTreeClassifier

In [46]:
model = tree.DecisionTreeClassifier()
print_analysis_result(model)

Model:  DecisionTreeClassifier()

All features:
Train
F1 score: 0.9993330456275255
Validation
F1 score: 0.5452307692307692

Select features with Random Forest:
Train
F1 score: 0.9977422204770787
Validation
F1 score: 0.5111414279217826

Select features with ROC-AUC:
Train
F1 score: 0.9989600298255598
Validation
F1 score: 0.5190279688216414

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.6916912117268306
Validation
F1 score: 0.5383383739548123

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.6225577883610176
Validation
F1 score: 0.5637263891435906


#### AdaBoostClassifier

In [47]:
model = AdaBoostClassifier()
print_analysis_result(model)

Model:  AdaBoostClassifier()

All features:
Train
F1 score: 0.5785158421345191
Validation
F1 score: 0.5971966064182958

Select features with Random Forest:
Train
F1 score: 0.5800356588788294
Validation
F1 score: 0.5918854415274463

Select features with ROC-AUC:
Train
F1 score: 0.5804430933628114
Validation
F1 score: 0.594911220940875

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.5723081196185539
Validation
F1 score: 0.5860036832412523

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.5716477233107873
Validation
F1 score: 0.5855275271589027


#### RandomForestClassifier

In [48]:
model = RandomForestClassifier(n_estimators=100, random_state=10)
print_analysis_result(model)

Model:  RandomForestClassifier(random_state=10)

All features:
Train
F1 score: 0.999313496655748
Validation
F1 score: 0.6065200909780136

Select features with Random Forest:
Train
F1 score: 0.9977047572339381
Validation
F1 score: 0.6009723261032163

Select features with ROC-AUC:
Train
F1 score: 0.9989605193480692
Validation
F1 score: 0.5976577257272384

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.7055745710610651
Validation
F1 score: 0.5617348723329835

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.6339408411993962
Validation
F1 score: 0.5766215253029223


#### MLPClassifier

In [49]:
# model = model = MLPClassifier(hidden_layer_sizes=(100, 50, ), max_iter=300)
# print_analysis_result(model)

# Model:  MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300)

# All features:
# ========================================
# Train
# F1 score: 0.8530766868024983
# Validation
# F1 score: 0.5896028414594768

# Select features with Random Forest:
# ========================================
# Train
# F1 score: 0.6794928044928046
# Validation
# F1 score: 0.5996497373029772

# Select features with ROC-AUC:
# ========================================
# Train
# F1 score: 0.6902931117589002
# Validation
# F1 score: 0.5760988013076643

# Select features with RecursiveFeatureElimination:
# ========================================
# Train
# F1 score: 0.6084971521021254
# Validation
# F1 score: 0.6096237970253718

# Select features with RecursiveFeatureAddition:
# ========================================
# Train
# F1 score: 0.5634595328757489
# Validation
# F1 score: 0.5712092130518234
# ========================================

Qua phân tích cho thấy kết quả trên model **MLPClassifier** huấn luyện trên các features được chọn dung: RecursiveFeatureElimination

- F1-score trên tập train: 0.6084971521021254
- F1-score trên tập validation: 0.6096237970253718

## 6. Nâng cao

#### Load data

In [208]:
df_advance = pd.read_csv('weatherAUS_advance.csv')
df_advance['RainTomorrow'] = df_advance['RainTomorrow'].fillna('Unknown')
# df_advance.drop(['Date', 'Location'], axis=1, inplace = True)

In [209]:
df_advance.head(2)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No


#### Handle Date feature

In [210]:
df_advance['Date'].isna().sum()

0

In [211]:
df_advance['Date'].head()

0    2008-12-01
1    2008-12-02
2    2008-12-03
3    2008-12-04
4    2008-12-05
Name: Date, dtype: object

Convert date feature from string to pandas' date

In [212]:
df_advance['Date'] = pd.to_datetime(df_advance['Date'], format='%Y-%m-%d')
df_advance['Date'].head()

0   2008-12-01
1   2008-12-02
2   2008-12-03
3   2008-12-04
4   2008-12-05
Name: Date, dtype: datetime64[ns]

Cyclic Feature Encoding

In [213]:
def encode_date(df, variable, max_value):
    df[variable + '_sin'] = np.sin(2 * np.pi * df[variable]/max_value)
    df[variable + '_cos'] = np.cos(2 * np.pi * df[variable]/max_value)
    return df

df_advance['Date_year'] = df_advance['Date'].dt.year
df_advance['Date_month'] = df_advance['Date'].dt.month
df_advance = encode_date(df_advance, 'Date_month', max_value=12)

df_advance['Date_day'] = df_advance['Date'].dt.day
df_advance = encode_date(df_advance, 'Date_day', max_value=31)

df_advance.drop(['Date'], axis=1, inplace = True)

In [214]:
date_features = ['Date_year', 'Date_month', 'Date_day', 'Date_month_sin', 'Date_month_cos', 'Date_day_sin', 'Date_day_cos']
df_advance[date_features].head()

Unnamed: 0,Date_year,Date_month,Date_day,Date_month_sin,Date_month_cos,Date_day_sin,Date_day_cos
0,2008,12,1,-2.449294e-16,1.0,0.201299,0.97953
1,2008,12,2,-2.449294e-16,1.0,0.394356,0.918958
2,2008,12,3,-2.449294e-16,1.0,0.571268,0.820763
3,2008,12,4,-2.449294e-16,1.0,0.724793,0.688967
4,2008,12,5,-2.449294e-16,1.0,0.848644,0.528964


#### Split data

In [215]:
X = df_advance.drop(['RainTomorrow'], axis=1)
y = df_advance['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

#### Chia biến hạng mục và biến số

In [216]:
categorical = [col for col in X.columns if X[col].dtypes == 'O']
numerical = [col for col in X.columns if X[col].dtypes != 'O']

In [217]:
categorical

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

In [218]:
numerical

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'Date_year',
 'Date_month',
 'Date_month_sin',
 'Date_month_cos',
 'Date_day',
 'Date_day_sin',
 'Date_day_cos']

#### Fill N/A cho biến số

In [219]:
for df1 in [X_train, X_test, X_valid]:
    for col in numerical:
        col_median = X_train[col].median()
        df1[col].fillna(col_median, inplace=True)



#### Fill N/A cho biến hạng mục

In [220]:
for df2 in [X_train, X_test, X_valid]:
    for var in categorical:
        df2[var].fillna(X_train[var].mode()[0], inplace=True)

#### Xử lý outlier cho biến số

In [221]:
def max_value(df3, variable, top):
    return np.where(df3[variable]>top, top, df3[variable])

for df3 in [X_train, X_test, X_valid]:
    df3['Rainfall'] = max_value(df3, 'Rainfall', 3.2)
    df3['Evaporation'] = max_value(df3, 'Evaporation', 21.8)
    df3['WindSpeed9am'] = max_value(df3, 'WindSpeed9am', 55)
    df3['WindSpeed3pm'] = max_value(df3, 'WindSpeed3pm', 57)

#### Encode cho biến hạng mục

In [222]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_valid = le.transform(y_valid)

#### Mã hoá biến nhị phân cho RainToday

In [223]:
X_train_1, X_test_1, X_valid_1 = X_train.copy(), X_test.copy(), X_valid.copy()

In [224]:
X_train_1.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Temp9am,Temp3pm,RainToday,Date_year,Date_month,Date_month_sin,Date_month_cos,Date_day,Date_day_sin,Date_day_cos
111548,Albany,14.5,17.8,3.2,4.8,8.4,W,39.0,N,SE,...,16.2,21.1,Yes,2017,4,0.866025,-0.5,24,-0.988468,0.151428
142528,Katherine,24.6,39.5,0.0,12.0,8.4,SE,31.0,N,NW,...,30.1,39.2,No,2013,10,-0.866025,0.5,11,0.790776,-0.612106
65304,MelbourneAirport,14.7,26.6,0.2,3.2,8.0,SSE,30.0,SW,S,...,17.3,24.5,No,2012,2,0.866025,0.5,18,-0.485302,-0.874347
40651,Williamtown,18.3,26.6,0.0,4.8,8.4,SE,28.0,WSW,SSE,...,23.0,25.4,No,2012,2,0.866025,0.5,18,-0.485302,-0.874347
97487,Adelaide,11.4,19.3,0.6,2.4,8.9,W,31.0,SW,SSW,...,14.4,18.3,No,2011,10,-0.866025,0.5,11,0.790776,-0.612106


In [225]:

import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['RainToday'])

X_train_target = encoder.fit_transform(X_train_1)
col_categorical = ['Location','WindGustDir', 'WindDir9am', 'WindDir3pm']
X_train_categorical = pd.concat([X_train_target[['RainToday_0', 'RainToday_1']],
                      pd.get_dummies(X_train_1[col_categorical])], axis=1)

X_test_target = encoder.transform(X_test_1)
X_test_categorical = pd.concat([X_test_target[['RainToday_0', 'RainToday_1']],
                     pd.get_dummies(X_test_1[col_categorical])], axis=1)

X_valid_target = encoder.transform(X_valid_1)
X_valid_categorical = pd.concat([X_valid_target[['RainToday_0', 'RainToday_1']],
                     pd.get_dummies(X_valid_1[col_categorical])], axis=1)

In [226]:
X_train_categorical.head()

Unnamed: 0,RainToday_0,RainToday_1,Location_Adelaide,Location_Albany,Location_Albury,Location_AliceSprings,Location_BadgerysCreek,Location_Ballarat,Location_Bendigo,Location_Brisbane,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
111548,0,1,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
142528,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
65304,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
40651,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
97487,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


#### Rời rạc hoá biến số

In [227]:
# from feature_engine.discretisation import EqualFrequencyDiscretiser
# disc = EqualFrequencyDiscretiser(q=10, variables = numerical)

# disc.fit(X_train_1[numerical])
# X_train_numerical = disc.transform(X_train_1[numerical])
# X_test_numerical = disc.transform(X_test_1[numerical])
# X_valid_numerical = disc.transform(X_valid_1[numerical])

#### Ghép các tập đã xử lý

In [228]:
train = pd.concat([X_train_1[numerical], X_train_categorical], axis=1)
test = pd.concat([X_test_1[numerical], X_test_categorical], axis=1)
valid = pd.concat([X_valid_1[numerical], X_valid_categorical], axis=1)

In [229]:
train.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
111548,14.5,17.8,3.2,4.8,8.4,39.0,0.0,19.0,94.0,52.0,...,0,0,0,1,0,0,0,0,0,0
142528,24.6,39.5,0.0,12.0,8.4,31.0,9.0,17.0,52.0,21.0,...,0,1,0,0,0,0,0,0,0,0
65304,14.7,26.6,0.2,3.2,8.0,30.0,7.0,11.0,87.0,57.0,...,0,0,1,0,0,0,0,0,0,0
40651,18.3,26.6,0.0,4.8,8.4,28.0,11.0,22.0,79.0,67.0,...,0,0,0,0,1,0,0,0,0,0
97487,11.4,19.3,0.6,2.4,8.9,31.0,13.0,19.0,63.0,47.0,...,0,0,0,0,0,1,0,0,0,0


#### Lựa chọn Features

Lựa chọn đặc trưng bằng random forest

In [230]:
parameters = {'n_estimators': [10, 20, 50, 100]}
randomForestModel = RandomForestClassifier(random_state=10)
grid = GridSearchCV(randomForestModel, parameters, cv=5)
grid.fit(train, y_train)
grid.best_params_

# {'n_estimators': 100}

{'n_estimators': 100}

In [231]:
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=10))
sel_.fit(train, y_train)

random_forest_X_train_set = sel_.transform(train)
random_forest_X_valid_set = sel_.transform(valid)
random_forest_X_test_set = sel_.transform(test)
random_forest_X_train_set

array([[14.5       , 17.8       ,  3.2       , ...,  0.15142778,
         0.        ,  1.        ],
       [24.6       , 39.5       ,  0.        , ..., -0.61210598,
         1.        ,  0.        ],
       [14.7       , 26.6       ,  0.2       , ..., -0.87434662,
         1.        ,  0.        ],
       ...,
       [ 7.6       , 21.7       ,  0.        , ..., -0.95413926,
         1.        ,  0.        ],
       [ 8.8       , 14.8       ,  0.        , ..., -0.05064917,
         1.        ,  0.        ],
       [15.8       , 40.6       ,  0.        , ..., -0.75875812,
         1.        ,  0.        ]])

Lựa chọn đặc trưng bằng đệ quy

In [232]:
# model = GradientBoostingClassifier(n_estimators=10, max_depth=2 ,random_state=10)
# sel_ = RecursiveFeatureElimination(
#     variables=None, 
#     estimator = model, 
#     scoring = 'roc_auc', # the metric we want to evalute
#     threshold = 0.0005, # the maximum performance drop allowed to remove a feature
#     cv=2, # cross-validation
# )
# sel_.fit(train, y_train)

# recursive_ellimination_selected_X_train_set = sel_.transform(train)
# recursive_ellimination_selected_X_valid_set = sel_.transform(valid)
# recursive_ellimination_selected_X_test_set = sel_.transform(test)

In [233]:
# model = GradientBoostingClassifier(n_estimators=10, max_depth=2 ,random_state=10)
# sel_ = RecursiveFeatureAddition(
#     variables=None,
#     estimator=model,
#     scoring='roc_auc',
#     threshold=0.0005,
#     cv=3,)
# sel_.fit(train, y_train)

# recursive_addition_selected_X_train_set = sel_.transform(train)
# recursive_addition_selected_X_valid_set = sel_.transform(valid)
# recursive_addition_selected_X_test_set = sel_.transform(test)

#### Huấn luyện mô hình

In [234]:
def print_analysis_result(model):
    
    print("Model: ", model)
    
    print()
    print("All features:")
    print('='*40)
    print("Train")
    model.fit(train, y_train)
    print("F1 score:", f1_score(y_train, model.predict(train), average='weighted'))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(valid), average='weighted'))
    
    print()
    print("Select features with Random Forest:")
    print('='*40)
    print("Train")
    model.fit(random_forest_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(random_forest_X_train_set), average='weighted'))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(random_forest_X_valid_set), average='weighted'))

    # print()
    # print("Select features with RecursiveFeatureElimination:")
    # print('='*40)
    # print("Train")
    # model.fit(recursive_ellimination_selected_X_train_set, y_train)
    # print("F1 score:", f1_score(y_train, model.predict(recursive_ellimination_selected_X_train_set), average='weighted'))
    # print("Validation")
    # print("F1 score:", f1_score(y_valid, model.predict(recursive_ellimination_selected_X_valid_set), average='weighted'))
    
    # print()
    # print("Select features with RecursiveFeatureAddition:")
    # print('='*40)
    # print("Train")
    # model.fit(recursive_addition_selected_X_train_set, y_train)
    # print("F1 score:", f1_score(y_train, model.predict(recursive_addition_selected_X_train_set), average='weighted'))
    # print("Validation")
    # print("F1 score:", f1_score(y_valid, model.predict(recursive_addition_selected_X_valid_set), average='weighted'))
    
    print('='*40)

Logistic Regression

In [235]:
model = LogisticRegression()
print_analysis_result(model)

Model:  LogisticRegression()

All features:
Train
F1 score: 0.8008148180561063
Validation
F1 score: 0.8009932063693237

Select features with Random Forest:
Train
F1 score: 0.8005195964385893
Validation
F1 score: 0.8017533857107199


DecisionTreeClassifier

In [236]:
model = tree.DecisionTreeClassifier()
print_analysis_result(model)

Model:  DecisionTreeClassifier()

All features:
Train
F1 score: 1.0
Validation
F1 score: 0.7721517751785996

Select features with Random Forest:
Train
F1 score: 0.999991405772352
Validation
F1 score: 0.7640958441010445


AdaBoostClassifier

In [237]:
model = AdaBoostClassifier()
print_analysis_result(model)

Model:  AdaBoostClassifier()

All features:
Train
F1 score: 0.8075482070234644
Validation
F1 score: 0.8060829652773513

Select features with Random Forest:
Train
F1 score: 0.8038520500653998
Validation
F1 score: 0.8013647517422984


RandomForestClassifier

In [238]:
model = RandomForestClassifier(n_estimators=100, random_state=10)
print_analysis_result(model)

Model:  RandomForestClassifier(random_state=10)

All features:
Train
F1 score: 0.9999914065121894
Validation
F1 score: 0.8252951411167

Select features with Random Forest:
Train
F1 score: 0.9999398447472719
Validation
F1 score: 0.8256054509457553


MPLClassifier

In [239]:
model = model = MLPClassifier(hidden_layer_sizes=(100, 50, ), max_iter=300)
print_analysis_result(model)

Model:  MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300)

All features:
Train
F1 score: 0.8237432757977362
Validation
F1 score: 0.8234453143469092

Select features with Random Forest:
Train
F1 score: 0.8117389210618332
Validation
F1 score: 0.8085027360496212


#### Lựa chọn mô hình

Qua phân tích cho thấy kết quả cao nhất trên model: , được áp dụng trên các features:

- F1-score trên tập train: 
- F1-score trên tập validation: 

In [240]:
model = RandomForestClassifier(n_estimators=100, max_features='auto', max_depth=8, criterion='gini', random_state=42)
model.fit(train, y_train)

RandomForestClassifier(max_depth=8, random_state=42)

In [241]:
f1_score(model.predict(train), y_train, average='weighted')

0.855458978931079

In [242]:
f1_score(model.predict(test), y_test, average='weighted')

0.8472725631526058

In [243]:
f1_score(model.predict(valid), y_valid, average='weighted')

0.8492096243635924