In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import pickle
import sys
from itertools import product

import numpy as np  # np.hstack is used below; no explicit numpy import was present
import pandas as pd
import xgboost
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm_notebook

from utils.metrics_utils import *
from utils.competition_utils import load_data, load_save_vectors
# Show every column of wide report frames, but cap the number of printed rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 40

Using TensorFlow backend.


## Train models

In [3]:
def search_best_params(data, param_grid, model_kwargs=None):
    """Grid-search XGBoost hyper-parameters and collect dev/test metrics.

    One ``xgboost.XGBClassifier`` is fitted on the training split for every
    combination in the Cartesian product of the ``param_grid`` values, then
    scored on the dev and test splits with ``get_model_result_adv``.

    Parameters
    ----------
    data : sequence of six items
        ``(x_train, y_train, x_dev, y_dev, x_test, y_test)``.
    param_grid : dict
        Maps an ``XGBClassifier`` keyword name to an iterable of candidate
        values. Names are visited in sorted order, so the row order of the
        report does not depend on dict insertion order.
    model_kwargs : dict, optional
        Overrides for the settings shared by every fit. Defaults are
        ``tree_method='gpu_hist'``, ``subsample=0.9``, ``gpu_id=3`` and
        ``n_jobs=10``. Use this to run on another GPU or on CPU. If a name
        appears both here and in ``param_grid``, the grid value wins.

    Returns
    -------
    pandas.DataFrame
        One row per combination, in product order. Each row holds the grid
        values, the entries returned by ``get_model_result_adv`` for the dev
        and test splits, plus ``y_true`` (test labels) and ``y_pred`` (test
        predictions).

    Raises
    ------
    ValueError
        If ``data`` does not unpack into exactly six items.
    """
    # The splits never change between configurations: unpack once, up front.
    x_train, y_train, x_dev, y_dev, x_test, y_test = data

    fixed_params = dict(
        tree_method='gpu_hist', subsample=0.9, gpu_id=3, n_jobs=10)
    if model_kwargs:
        fixed_params.update(model_kwargs)

    grid_items = sorted(param_grid.items())
    param_name_list = [name for name, _ in grid_items]
    # Materialise the product so the progress bar knows its total and can
    # display an ETA; a bare itertools.product has no length.
    candidates = list(product(*(values for _, values in grid_items)))

    all_model_report = []
    for candidate in tqdm_notebook(candidates, total=len(candidates)):
        model_config = dict(zip(param_name_list, candidate))

        # Grid values are applied last so they take precedence over the
        # shared settings instead of raising a duplicate-keyword TypeError.
        model_params = dict(fixed_params)
        model_params.update(model_config)

        model = xgboost.XGBClassifier(**model_params)
        model.fit(x_train, y_train)
        y_dev_pred = model.predict(x_dev)
        y_test_pred = model.predict(x_test)

        model_report = {}
        model_report.update(model_config)
        model_report.update(get_model_result_adv(
            y_dev, y_dev_pred, data_set='dev'))
        model_report.update(get_model_result_adv(
            y_test, y_test_pred, data_set='test'))
        # Keep the raw test labels/predictions alongside the metrics so they
        # can be inspected later without refitting.
        model_report['y_true'] = y_test
        model_report['y_pred'] = y_test_pred
        all_model_report.append(model_report)

    return pd.DataFrame(all_model_report)

In [4]:
# Columns shown in the result tables, in display order: the embedding label,
# the dev-split metrics, the test-split metrics, then the tuned parameters.
keep_cols = [
    'emb_name',
    # dev split
    'dev_acc',
    'dev_rmse',
    'dev_abs_1_score',
    'dev_macro_f1-score',
    'dev_micro_f1-score',
    # test split (micro precedes macro here, unlike the dev group)
    'test_acc',
    'test_rmse',
    'test_abs_1_score',
    'test_micro_f1-score',
    'test_macro_f1-score',
    # tuned hyper-parameters
    'max_depth',
    'n_estimators',
]

In [5]:
# Root path of the saved feature sets; 'raw' is loaded here and each embedding
# name is joined onto the same root further down.
data_dir = 'data/features/wide_features/'
# The loaded object is later passed to search_best_params, which unpacks it as
# (x_train, y_train, x_dev, y_dev, x_test, y_test).
data_basic = load_save_vectors(os.path.join(data_dir,'raw'))

Start load data form data/features/wide_features/raw+filter
Finish load data


In [6]:
# Sanity check: shape of the first loaded item (x_train, going by the
# unpacking order used in search_best_params).
data_basic[0].shape

(3726, 25)

In [7]:
# Search space: 9 tree depths x 9 ensemble sizes = 81 configurations.
param_grid = dict(
    max_depth=range(1, 10),
    n_estimators=range(10, 100, 10),
)

### Wide features only

In [None]:
# Fit one model per combination in param_grid (9 x 9 = 81 fits) on the wide
# features alone and collect the dev/test metrics of each.
df_report = search_best_params(data_basic, param_grid)

In [9]:
# Show the single configuration with the highest test accuracy
# (keep_cols[1:] drops 'emb_name', which this report does not have).
# NOTE(review): ranking by test_acc picks hyper-parameters on the test split;
# confirm whether dev_acc should be the selection criterion instead.
(
    df_report
    .sort_values('test_acc', ascending=False)
    .loc[:, keep_cols[1:]]
    .head(1)
)

Unnamed: 0,dev_acc,dev_rmse,dev_abs_1_score,dev_macro_f1-score,dev_micro_f1-score,test_acc,test_rmse,test_abs_1_score,test_micro_f1-score,test_macro_f1-score,max_depth,n_estimators
36,0.7147,0.7412,0.912,0.616006,0.696213,0.7053,0.723,0.924,0.675702,0.60617,5,10


### Wide features + text embeddings

In [10]:
# Text-embedding feature sets to combine with the wide features.
emb_list = [
    'word2vec_max',
    'edu_roberta_max',
    'edu_roberta_cls',
]
# Every embedding is searched over the same grid (the same dict object is
# referenced once per embedding).
param_grid_list = [param_grid for _ in emb_list]
param_grid_list

[{'max_depth': range(1, 10), 'n_estimators': range(10, 100, 10)},
 {'max_depth': range(1, 10), 'n_estimators': range(10, 100, 10)},
 {'max_depth': range(1, 10), 'n_estimators': range(10, 100, 10)}]

In [11]:
# Display labels for each embedding name.
model_dict = {
    "edu_roberta_cls": "Wide + EduRoBERTa(CLS)",
    "edu_roberta_max": "Wide + EduRoBERTa(max)",
    "word2vec_max": "Wide + Tencent",
}
# Any embedding without a label falls back to its raw name.
for emb_name in emb_list:
    model_dict.setdefault(emb_name, emb_name)
model_dict

{'edu_roberta_cls': 'Wide + EduRoBERTa(CLS)',
 'edu_roberta_max': 'Wide + EduRoBERTa(max)',
 'word2vec_max': 'Wide + Tencent'}

In [None]:
# Run the grid search once per embedding on [wide features | embedding].
# NOTE(review): the loop rebinds the notebook-level names `param_grid` and
# `df_report`, so re-running earlier cells afterwards sees the last
# embedding's values; `model_name` is assigned but not used in this cell.
df_report_list = []
for emb_name, param_grid in zip(emb_list, param_grid_list):
    emb_path = os.path.join(data_dir, emb_name)
    data_new = list(load_save_vectors(emb_path))
    # Items 0, 2 and 4 are the train/dev/test feature matrices; the label
    # items in between are left untouched.
    for i in (0, 2, 4):
        data_new[i] = np.hstack([data_basic[i], data_new[i]])
    df_report = search_best_params(data_new, param_grid)
    df_report = df_report.sort_values('test_acc', ascending=False)
    df_report['emb_name'] = emb_name
    df_report_list.append(df_report)
    model_name = model_dict[emb_name]

In [13]:
# Stack the top row (highest test_acc) of every embedding's report into one
# comparison table restricted to the display columns.
df = pd.concat(
    [
        report.sort_values('test_acc', ascending=False).head(1)
        for report in df_report_list
    ]
)[keep_cols]
df

Unnamed: 0,emb_name,dev_acc,dev_rmse,dev_abs_1_score,dev_macro_f1-score,dev_micro_f1-score,test_acc,test_rmse,test_abs_1_score,test_micro_f1-score,test_macro_f1-score,max_depth,n_estimators
4,word2vec_max,0.7347,0.7193,0.916,0.642749,0.716693,0.7053,0.7033,0.9333,0.671696,0.596735,1,50
80,edu_roberta_max,0.7147,0.722,0.9213,0.609157,0.69488,0.7147,0.7052,0.9293,0.684525,0.617096,9,90
37,edu_roberta_cls,0.7347,0.6792,0.9347,0.64423,0.72098,0.716,0.7014,0.9307,0.693871,0.631453,5,20
