## 1. Import lib

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

## 2. Load processed data
(after **step 3** of **asm_1.ipynb**)

In [3]:
from sklearn.model_selection import train_test_split

X_train = pd.read_csv('train_processed.csv')
y_train = pd.read_csv('y_train.csv').values

X_test_processed = pd.read_csv('test_processed.csv')
y_test_processed = pd.read_csv('y_test.csv').values

X_test, X_valid, y_test, y_valid = train_test_split(X_test_processed, y_test_processed, test_size=0.5, random_state=0)



## 3. Features Selection

### 3.1 Lựa chọn đặc trưng bằng Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV

# Find best estimator of RandomForestClassifier first
parameters = {'n_estimators': [10, 20, 50, 100]}
randomForestModel = RandomForestClassifier(random_state=10)
grid = GridSearchCV(randomForestModel, parameters, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=10),
             param_grid={'n_estimators': [10, 20, 50, 100]})

In [5]:
grid.best_params_

{'n_estimators': 100}

In [6]:
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=10))
sel_.fit(X_train, y_train)

random_forest_X_train_set = sel_.transform(X_train)
random_forest_X_valid_set = sel_.transform(X_valid)
random_forest_X_test_set = sel_.transform(X_test)

### 3.2 Lựa chọn đặc trưng bằng ROC-AUC

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

roc_values = []

for feature in X_train.columns:

    # train a decision tree classifier
    clf = DecisionTreeClassifier()
    clf.fit(X_train[feature].values.reshape(-1, 1), y_train)

    # obtain the predictions
    y_scored = clf.predict_proba(X_valid[feature].values.reshape(-1, 1))

    # calculate and store the roc-auc
    roc_values.append(roc_auc_score(y_valid, y_scored[:, 1]))

roc_values = pd.Series(roc_values)
roc_values.index = X_train.columns
selected_feat_roc = roc_values[roc_values > 0.51].index
selected_feat_roc

roc_X_train_set = X_train[selected_feat_roc]
roc_X_valid_set = X_valid[selected_feat_roc]
roc_X_test_set = X_test[selected_feat_roc]

### 3.3 Lựa chọn đặc trưng bằng đệ quy

#### Loại bỏ đặc trưng bằng đệ quy

In [9]:
from feature_engine.selection import RecursiveFeatureElimination
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=10, max_depth=2 ,random_state=10)
sel_ = RecursiveFeatureElimination(
    variables=None, 
    estimator = model, 
    scoring = 'roc_auc', # the metric we want to evalute
    threshold = 0.0005, # the maximum performance drop allowed to remove a feature
    cv=2, # cross-validation
)
sel_.fit(X_train, y_train)

recursive_ellimination_selected_X_train_set = sel_.transform(X_train)
recursive_ellimination_selected_X_valid_set = sel_.transform(X_valid)
recursive_ellimination_selected_X_test_set = sel_.transform(X_test)


#### Thêm đặc trưng bằng đệ quy

In [10]:
from feature_engine.selection import RecursiveFeatureAddition

model = GradientBoostingClassifier(n_estimators=10, max_depth=2 ,random_state=10)
sel_ = RecursiveFeatureAddition(
    variables=None,
    estimator=model,
    scoring='roc_auc',
    threshold=0.0005,
    cv=3,)
sel_.fit(X_train, y_train)

recursive_addition_selected_X_train_set = sel_.transform(X_train)
recursive_addition_selected_X_valid_set = sel_.transform(X_valid)
recursive_addition_selected_X_test_set = sel_.transform(X_test)

## 4. Huấn luyện mô hình

In [12]:
from sklearn.metrics import f1_score
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [21]:
def print_analysis_result(model):
    
    print("Model: ", model)
    
    print()
    print("All features:")
    print('='*40)
    print("Train")
    model.fit(X_train, y_train)
    print("F1 score:", f1_score(y_train, model.predict(X_train)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(X_valid)))
    
    print()
    print("Select features with Random Forest:")
    print('='*40)
    print("Train")
    model.fit(random_forest_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(random_forest_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(random_forest_X_valid_set)))

    print()
    print("Select features with ROC-AUC:")
    print('='*40)
    print("Train")
    model.fit(roc_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(roc_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(roc_X_valid_set)))
    
    print()
    print("Select features with RecursiveFeatureElimination:")
    print('='*40)
    print("Train")
    model.fit(recursive_ellimination_selected_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(recursive_ellimination_selected_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(recursive_ellimination_selected_X_valid_set)))
    
    print()
    print("Select features with RecursiveFeatureAddition:")
    print('='*40)
    print("Train")
    model.fit(recursive_addition_selected_X_train_set, y_train)
    print("F1 score:", f1_score(y_train, model.predict(recursive_addition_selected_X_train_set)))
    print("Validation")
    print("F1 score:", f1_score(y_valid, model.predict(recursive_addition_selected_X_valid_set)))
    
    print('='*40)
    

#### LogisticRegression

In [22]:
model = LogisticRegression()
print_analysis_result(model)


Model:  LogisticRegression()

All features:
Train
F1 score: 0.5878566337908361
Validation
F1 score: 0.5989792198323004

Select features with Random Forest:
Train
F1 score: 0.5710985863438973
Validation
F1 score: 0.5872262773722629

Select features with ROC-AUC:
Train
F1 score: 0.5722067071341197
Validation
F1 score: 0.5885141294439381

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.5604469480956025
Validation
F1 score: 0.5777206826940723

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.5582065280520385
Validation
F1 score: 0.5746914717259163


#### DecisionTreeClassifier

In [23]:
model = tree.DecisionTreeClassifier()
print_analysis_result(model)

Model:  DecisionTreeClassifier()

All features:
Train
F1 score: 0.9993330456275255
Validation
F1 score: 0.5468750000000001

Select features with Random Forest:
Train
F1 score: 0.9977422204770787
Validation
F1 score: 0.5148994100741188

Select features with ROC-AUC:
Train
F1 score: 0.9989600298255598
Validation
F1 score: 0.5205437605009929

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.6916912117268306
Validation
F1 score: 0.5371753824261828

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.6225577883610176
Validation
F1 score: 0.5639897323065641


#### AdaBoostClassifier

In [24]:
model = AdaBoostClassifier()
print_analysis_result(model)

Model:  AdaBoostClassifier()

All features:
Train
F1 score: 0.5785158421345191
Validation
F1 score: 0.5971966064182958

Select features with Random Forest:
Train
F1 score: 0.5800356588788294
Validation
F1 score: 0.5918854415274463

Select features with ROC-AUC:
Train
F1 score: 0.5804430933628114
Validation
F1 score: 0.594911220940875

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.5723081196185539
Validation
F1 score: 0.5860036832412523

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.5716477233107873
Validation
F1 score: 0.5855275271589027


#### RandomForestClassifier

In [25]:
model = RandomForestClassifier(n_estimators=100, random_state=10)
print_analysis_result(model)

Model:  RandomForestClassifier(random_state=10)

All features:
Train
F1 score: 0.999313496655748
Validation
F1 score: 0.6065200909780136

Select features with Random Forest:
Train
F1 score: 0.9977047572339381
Validation
F1 score: 0.6009723261032163

Select features with ROC-AUC:
Train
F1 score: 0.9989605193480692
Validation
F1 score: 0.5976577257272384

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.7055745710610651
Validation
F1 score: 0.5617348723329835

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.6339408411993962
Validation
F1 score: 0.5766215253029223


#### MLPClassifier

In [26]:
model = model = MLPClassifier(hidden_layer_sizes=(100, 50, ), max_iter=300)
print_analysis_result(model)

Model:  MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300)

All features:
Train
F1 score: 0.8547236190892648
Validation
F1 score: 0.5978736710444028

Select features with Random Forest:
Train
F1 score: 0.6875748246964255
Validation
F1 score: 0.6059888343765861

Select features with ROC-AUC:
Train
F1 score: 0.6937045728998155
Validation
F1 score: 0.5764492753623189

Select features with RecursiveFeatureElimination:
Train
F1 score: 0.5781929990539262
Validation
F1 score: 0.5851003940701821

Select features with RecursiveFeatureAddition:
Train
F1 score: 0.5512639229986852
Validation
F1 score: 0.555511811023622


Qua phân tích cho thấy kết quả trên model **RandomForestClassifier** huấn luyện trên tất cả features cho kết quả tốt nhất

## 5. Tối ưu hoá mô hình 

In [27]:
model=RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [50, 100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [50, 100, 200, 500]})

In [29]:
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 100}

In [34]:
CV_rfc.best_score_

0.8340893624770025

In [30]:
model = RandomForestClassifier(n_estimators=100, max_features='auto', max_depth=8, criterion='gini', random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, random_state=42)

In [36]:
print("Test")
print('Accuracy:', model.score(X_test, y_test))

Test
Accuracy: 0.8357127786764189


## 6. Nâng cao

#### Load data

In [45]:
df_advance = pd.read_csv('weatherAUS_advance.csv')
df_advance['RainTomorrow'] = df_advance['RainTomorrow'].fillna('Unknown')

X = df_advance.drop(['RainTomorrow'], axis=1)
y = df_advance['RainTomorrow']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

Chia biến hạng mục và biến số

In [46]:
categorical = [col for col in X.columns if X[col].dtypes == 'O']
numerical = [col for col in X.columns if X[col].dtypes != 'O']

Fill N/A cho biến số

In [47]:
for df1 in [X_train, X_test, X_valid]:
    for col in numerical:
        col_median = X_train[col].median()
        df1[col].fillna(col_median, inplace=True)



Fill N/A cho biến hạng mục

In [48]:
for df2 in [X_train, X_test, X_valid]:
    for var in categorical:
        df2[var].fillna(X_train[var].mode()[0], inplace=True)

Xử lý outlier cho biến số

In [49]:
def max_value(df3, variable, top):
    return np.where(df3[variable]>top, top, df3[variable])

for df3 in [X_train, X_test]:
    df3['Rainfall'] = max_value(df3, 'Rainfall', 3.2)
    df3['Evaporation'] = max_value(df3, 'Evaporation', 21.8)
    df3['WindSpeed9am'] = max_value(df3, 'WindSpeed9am', 55)
    df3['WindSpeed3pm'] = max_value(df3, 'WindSpeed3pm', 57)

Encode cho biến hạng mục

In [50]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_valid = le.transform(y_valid)

Mã hoá biến nhị phân cho RainToday

In [51]:
from pyexpat.model import XML_CQUANT_NONE
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['RainToday'])

X_train_target = encoder.fit_transform(X_train)
col_categorical = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
X_train_categorical = pd.concat([X_train_target[['RainToday_0', 'RainToday_1']],
                      pd.get_dummies(X_train[col_categorical])], axis=1)

X_test_target = encoder.transform(X_test)
X_test_categorical = pd.concat([X_test_target[['RainToday_0', 'RainToday_1']],
                     pd.get_dummies(X_test[col_categorical])], axis=1)

X_valid_target = encoder.transform(X_valid)
X_valid_categorical = pd.concat([X_valid_target[['RainToday_0', 'RainToday_1']],
                     pd.get_dummies(X_valid[col_categorical])], axis=1)

Rời rạc hoá biến số

In [52]:
from feature_engine.discretisation import EqualFrequencyDiscretiser
disc = EqualFrequencyDiscretiser(q=10, variables = numerical)

disc.fit(X_train[numerical])
X_train_numerical = disc.transform(X_train[numerical])
X_test_numerical = disc.transform(X_test[numerical])
X_valid_numerical = disc.transform(X_valid[numerical])

Ghép các tập đã xử lý

In [53]:
train = pd.concat([X_train_numerical, X_train_categorical], axis=1)
test = pd.concat([X_test_numerical, X_test_categorical], axis=1)
valid = pd.concat([X_valid_numerical, X_valid_categorical], axis=1)

Sử dụng **RandomForestClassifier** cho tất cả feature

In [56]:
model = RandomForestClassifier(n_estimators=100, max_features='auto', max_depth=8, criterion='gini', random_state=42)
model.fit(train, y_train)

RandomForestClassifier(max_depth=8, random_state=42)

In [58]:
f1_score(model.predict(train), y_train, average='weighted')

0.8530070292753242

In [59]:
f1_score(model.predict(test), y_test, average='weighted')

0.8475378363710365

In [60]:
f1_score(model.predict(valid), y_valid, average='weighted')

0.8502635799926123