In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from word2vec_selection.classes import Word2VecModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from word2vec_selection.functions import *
model_w2v_settings = return_best_model()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
model_w2v = Word2VecModel(model_w2v_settings)

In [4]:
# read preprocessed data from pickle file
df = pd.read_pickle('data/preprocessed_titles_labels.pkl')
df.head()

Unnamed: 0,title,is_clickbait,text,dataset,sample
0,"[seventeen, time, game, throne, matched, perfe...",1,,clickbait-dataset,train
1,"[facing, texas, tar, heel, territory, duke, fi...",0,,clickbait-dataset,train
2,"[several, gop, state, move, block, funding, sa...",0,A number of GOP states have moved to introduce...,clickbait-news-detection,train
3,"[hillary, clinton, john, kerry, divergent, pat...",0,"WASHINGTON — Early in 2011, after a hectic ...",fake-news,train
4,"[wikinews, interview, kevin, baugh, president,...",0,,clickbait-dataset,train


In [5]:
df['sample'].value_counts()


sample
train    66664
test      3703
val1      1852
val2      1852
Name: count, dtype: int64

In [6]:
# split data into train and test stratified by y
from sklearn.preprocessing import StandardScaler
import os

df['title_vector'] = [get_word_vectors(model_w2v, title, aggregation='mean') for title in df['title']]

X_train = np.vstack(df[df['sample']=='train']['title_vector'])
X_test = np.vstack(df[df['sample']=='val2']['title_vector'])

y_train = df[df['sample']=='train']['is_clickbait']
y_test = df[df['sample']=='val2']['is_clickbait']

# scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [84]:
import json

# if file does not exist write empty dict
if not os.path.exists('grid_search_results.json'):
    with open('grid_search_results.json', 'w') as f:
        json.dump({}, f)

with open('grid_search_results.json', 'r') as f:
    model_settings = json.load(f)

model_settings
grids = model_settings['grid_search_grids']
grids

{'decision_tree_grid': {'max_depth': [None, 3, 4, 5, 6, 7, 8],
  'max_features': ['sqrt', 'log2', None]},
 'catboost_grid': {'iterations': [100, 250, 500, 750, 1000],
  'learning_rate': [None, 0.001, 0.01, 0.1, 0.2, 0.3],
  'depth': [3, 4, 5, 6, 7, 8],
  'l2_leaf_reg': [1, 3, 5, 7, 9]},
 'lightgbm_grid': {'n_estimators': [10, 25, 50, 100, 250],
  'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
  'max_depth': [-1, 3, 4, 5, 6, 7, 8],
  'num_leaves': [15, 31, 62, 93, 124]},
 'xgboost_grid': {'n_estimators': [10, 25, 50, 100, 250],
  'learning_rate': [None, 0.001, 0.01, 0.1, 0.2, 0.3],
  'max_depth': [None, 3, 4, 5, 6, 7, 8],
  'gamma': [None, 0, 0.001, 0.01, 0.1, 0.5, 1, 2, 3, 4, 5]},
 'random_forest_grid': {'n_estimators': [50, 100, 250, 750, 1000],
  'max_depth': [None, 3, 4, 5, 6, 7, 8],
  'max_features': ['sqrt', 'log2', None]}}

In [85]:
models_to_CV = list()

for model_name, grid in grids.items():
    # check if model has already been trained - if 'best_params' exists in grid
    if 'best_params' in grid.keys():
        print(f'Model {model_name} already trained')
        continue
    models_to_CV.append((model_name))
models_to_CV

# remove _grid from model names
models_to_CV = [model_name.replace('_grid', '') for model_name in models_to_CV]
models_to_CV

['decision_tree', 'catboost', 'lightgbm', 'xgboost', 'random_forest']

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [21]:
def return_model_with_param(model_name, param):
    if model_name == 'decision_tree':
        model = DecisionTreeClassifier(**param)
    elif model_name == 'random_forest':
        model = RandomForestClassifier(**param)
    elif model_name == 'xgboost':
        model = XGBClassifier(**param)
    elif model_name == 'lightgbm':
        model = LGBMClassifier(**param, verbose=-1)
    elif model_name == 'catboost':
        model = CatBoostClassifier(**param,verbose=0)
    return model

In [101]:
models_to_CV = ['lightgbm', 'decision_tree',   'xgboost', 'random_forest','catboost',]

In [102]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from tqdm.notebook import tqdm

grid_search_results = dict()
for model_name in models_to_CV:
    grid_search_results[model_name] = dict()

for model_name in tqdm(models_to_CV, desc = 'Models'):

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # get grid
    grid = grids[model_name+'_grid']

    # generate all combinations of parameters
    import itertools

    keys, values = zip(*grid.items())
    combinations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # do grid search
    
    for params in combinations_dicts:
        grid_search_results[model_name][str(params)] = dict()

    for params in tqdm(combinations_dicts, desc = 'Grid combinations search for model {}'.format(model_name)):
        model = return_model_with_param(model_name, params)
        scores_auc_cv_val = list()
        scores_auc_val2 = list()
        scores_auc_train = list()

        scores_f1_cv_val = list()
        scores_f1_val2 = list()
        scores_f1_train = list()

        for train_index, val_index in kf.split(X_train_scaled, y_train):
            X_train_kf = X_train_scaled[train_index]
            X_val_kf = X_train_scaled[val_index]
            y_train_kf = y_train.iloc[train_index]
            y_val_kf = y_train.iloc[val_index]
            model.fit(X_train_kf, y_train_kf)
        
            scores_auc_cv_val.append(roc_auc_score(y_val_kf, model.predict_proba(X_val_kf)[:,1]))
            scores_auc_val2.append(roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:,1]))
            scores_auc_train.append(roc_auc_score(y_train_kf, model.predict_proba(X_train_kf)[:,1]))

            scores_f1_cv_val.append(f1_score(y_val_kf, model.predict(X_val_kf)))
            scores_f1_val2.append(f1_score(y_test, model.predict(X_test_scaled)))
            scores_f1_train.append(f1_score(y_train_kf, model.predict(X_train_kf)))

        
        grid_search_results[model_name][str(params)]['mean_train_f1'] = np.mean(scores_f1_train)
        grid_search_results[model_name][str(params)]['mean_cv_val_f1'] = np.mean(scores_f1_cv_val)
        grid_search_results[model_name][str(params)]['mean_val2_f1'] = np.mean(scores_f1_val2)

        grid_search_results[model_name][str(params)]['mean_train_auc'] = np.mean(scores_auc_train)
        grid_search_results[model_name][str(params)]['mean_cv_val_auc'] = np.mean(scores_auc_cv_val)
        grid_search_results[model_name][str(params)]['mean_val2_auc'] = np.mean(scores_auc_val2)

        grid_search_results[model_name][str(params)]['f1_diff_cv_val'] = np.mean(scores_f1_train) - np.mean(scores_f1_cv_val)
        grid_search_results[model_name][str(params)]['auc_diff_cv_val'] = np.mean(scores_auc_train) - np.mean(scores_auc_cv_val)

        grid_search_results[model_name][str(params)]['f1_diff_val2'] = np.mean(scores_f1_train) - np.mean(scores_f1_val2)
        grid_search_results[model_name][str(params)]['auc_diff_val2'] = np.mean(scores_auc_train) - np.mean(scores_auc_val2)

        grid_search_results[model_name][str(params)]['params'] = params


    print(grid_search_results[model_name])
grid_search_results

Models:   0%|          | 0/5 [00:00<?, ?it/s]

Grid combinations search for model lightgbm:   0%|          | 0/875 [00:00<?, ?it/s]

{"{'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': -1, 'num_leaves': 15}": {'mean_train_f1': 0.0, 'mean_cv_val_f1': 0.0, 'mean_val2_f1': 0.0, 'mean_train_auc': 0.8129092798469089, 'mean_cv_val_auc': 0.8092741866932569, 'mean_val2_auc': 0.8128050843627508, 'f1_diff_cv_val': 0.0, 'auc_diff_cv_val': 0.003635093153651958, 'f1_diff_val2': 0.0, 'auc_diff_val2': 0.00010419548415807611, 'params': {'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': -1, 'num_leaves': 15}}, "{'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': -1, 'num_leaves': 31}": {'mean_train_f1': 0.0, 'mean_cv_val_f1': 0.0, 'mean_val2_f1': 0.0, 'mean_train_auc': 0.8302107463160441, 'mean_cv_val_auc': 0.823960625512225, 'mean_val2_auc': 0.8246250306901342, 'f1_diff_cv_val': 0.0, 'auc_diff_cv_val': 0.006250120803819037, 'f1_diff_val2': 0.0, 'auc_diff_val2': 0.005585715625909926, 'params': {'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': -1, 'num_leaves': 31}}, "{'n_estimators': 10, 'learning_rate':

Grid combinations search for model decision_tree:   0%|          | 0/21 [00:00<?, ?it/s]

{"{'max_depth': None, 'max_features': 'sqrt'}": {'mean_train_f1': 0.9997140462751333, 'mean_cv_val_f1': 0.6601292194294268, 'mean_val2_f1': 0.6777029975759017, 'mean_train_auc': 0.9999998876477576, 'mean_cv_val_auc': 0.7162565458996841, 'mean_val2_auc': 0.7343132189278424, 'f1_diff_cv_val': 0.33958482684570646, 'auc_diff_cv_val': 0.2837433417480735, 'f1_diff_val2': 0.3220110486992316, 'auc_diff_val2': 0.2656866687199152, 'params': {'max_depth': None, 'max_features': 'sqrt'}}, "{'max_depth': None, 'max_features': 'log2'}": {'mean_train_f1': 0.9997140462751333, 'mean_cv_val_f1': 0.6690942990240053, 'mean_val2_f1': 0.676455492174538, 'mean_train_auc': 0.9999998876477576, 'mean_cv_val_auc': 0.7233464342670329, 'mean_val2_auc': 0.7329541803932489, 'f1_diff_cv_val': 0.33061974725112797, 'auc_diff_cv_val': 0.27665345338072467, 'f1_diff_val2': 0.32325855410059523, 'auc_diff_val2': 0.2670457072545087, 'params': {'max_depth': None, 'max_features': 'log2'}}, "{'max_depth': None, 'max_features': N

Grid combinations search for model xgboost:   0%|          | 0/2310 [00:00<?, ?it/s]

{"{'n_estimators': 10, 'learning_rate': None, 'max_depth': None, 'gamma': None}": {'mean_train_f1': 0.7685867117503109, 'mean_cv_val_f1': 0.7416950009850698, 'mean_val2_f1': 0.7411630264062211, 'mean_train_auc': 0.899791880780384, 'mean_cv_val_auc': 0.8798086750116798, 'mean_val2_auc': 0.8778372647242406, 'f1_diff_cv_val': 0.026891710765241106, 'auc_diff_cv_val': 0.01998320576870416, 'f1_diff_val2': 0.027423685344089788, 'auc_diff_val2': 0.02195461605614335, 'params': {'n_estimators': 10, 'learning_rate': None, 'max_depth': None, 'gamma': None}}, "{'n_estimators': 10, 'learning_rate': None, 'max_depth': None, 'gamma': 0}": {'mean_train_f1': 0.7685867117503109, 'mean_cv_val_f1': 0.7416950009850698, 'mean_val2_f1': 0.7411630264062211, 'mean_train_auc': 0.899791880780384, 'mean_cv_val_auc': 0.8798086750116798, 'mean_val2_auc': 0.8778372647242406, 'f1_diff_cv_val': 0.026891710765241106, 'auc_diff_cv_val': 0.01998320576870416, 'f1_diff_val2': 0.027423685344089788, 'auc_diff_val2': 0.0219546

Grid combinations search for model random_forest:   0%|          | 0/105 [00:00<?, ?it/s]

{"{'n_estimators': 50, 'max_depth': None, 'max_features': 'sqrt'}": {'mean_train_f1': 0.9993905563500679, 'mean_cv_val_f1': 0.7573190594036245, 'mean_val2_f1': 0.7528265576114976, 'mean_train_auc': 0.9999987500601758, 'mean_cv_val_auc': 0.8888737551963034, 'mean_val2_auc': 0.887738237891597, 'f1_diff_cv_val': 0.24207149694644337, 'auc_diff_cv_val': 0.1111249948638724, 'f1_diff_val2': 0.24656399873857027, 'auc_diff_val2': 0.1122605121685788, 'params': {'n_estimators': 50, 'max_depth': None, 'max_features': 'sqrt'}}, "{'n_estimators': 50, 'max_depth': None, 'max_features': 'log2'}": {'mean_train_f1': 0.9994515949005965, 'mean_cv_val_f1': 0.7566649916185493, 'mean_val2_f1': 0.7540198162879863, 'mean_train_auc': 0.9999984585823493, 'mean_cv_val_auc': 0.8889122857427075, 'mean_val2_auc': 0.887987912057039, 'f1_diff_cv_val': 0.24278660328204726, 'auc_diff_cv_val': 0.11108617283964173, 'f1_diff_val2': 0.24543177861261023, 'auc_diff_val2': 0.11201054652531028, 'params': {'n_estimators': 50, 'm

Grid combinations search for model catboost:   0%|          | 0/900 [00:00<?, ?it/s]

{"{'iterations': 100, 'learning_rate': None, 'depth': 3, 'l2_leaf_reg': 1}": {'mean_train_f1': 0.7102179365282928, 'mean_cv_val_f1': 0.7095095589994688, 'mean_val2_f1': 0.7109339407878643, 'mean_train_auc': 0.8621906505332481, 'mean_cv_val_auc': 0.860850924516605, 'mean_val2_auc': 0.8595074676272526, 'f1_diff_cv_val': 0.0007083775288240224, 'auc_diff_cv_val': 0.0013397260166430058, 'f1_diff_val2': -0.0007160042595715232, 'auc_diff_val2': 0.0026831829059954515, 'params': {'iterations': 100, 'learning_rate': None, 'depth': 3, 'l2_leaf_reg': 1}}, "{'iterations': 100, 'learning_rate': None, 'depth': 3, 'l2_leaf_reg': 3}": {'mean_train_f1': 0.7100272169288193, 'mean_cv_val_f1': 0.70890278896031, 'mean_val2_f1': 0.7116607567401874, 'mean_train_auc': 0.8617908483687181, 'mean_cv_val_auc': 0.8605520150304871, 'mean_val2_auc': 0.8592067348506781, 'f1_diff_cv_val': 0.0011244279685093117, 'auc_diff_cv_val': 0.001238833338231049, 'f1_diff_val2': -0.0016335398113680455, 'auc_diff_val2': 0.002584113

{'lightgbm': {"{'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': -1, 'num_leaves': 15}": {'mean_train_f1': 0.0,
   'mean_cv_val_f1': 0.0,
   'mean_val2_f1': 0.0,
   'mean_train_auc': 0.8129092798469089,
   'mean_cv_val_auc': 0.8092741866932569,
   'mean_val2_auc': 0.8128050843627508,
   'f1_diff_cv_val': 0.0,
   'auc_diff_cv_val': 0.003635093153651958,
   'f1_diff_val2': 0.0,
   'auc_diff_val2': 0.00010419548415807611,
   'params': {'n_estimators': 10,
    'learning_rate': 0.001,
    'max_depth': -1,
    'num_leaves': 15}},
  "{'n_estimators': 10, 'learning_rate': 0.001, 'max_depth': -1, 'num_leaves': 31}": {'mean_train_f1': 0.0,
   'mean_cv_val_f1': 0.0,
   'mean_val2_f1': 0.0,
   'mean_train_auc': 0.8302107463160441,
   'mean_cv_val_auc': 0.823960625512225,
   'mean_val2_auc': 0.8246250306901342,
   'f1_diff_cv_val': 0.0,
   'auc_diff_cv_val': 0.006250120803819037,
   'f1_diff_val2': 0.0,
   'auc_diff_val2': 0.005585715625909926,
   'params': {'n_estimators': 10,
    'learnin

In [103]:
# save results
with open('grid_search_results_new.json', 'w') as f:
    json.dump(grid_search_results, f)

In [7]:
# load results
with open('grid_search_results_new.json', 'r') as f:
    grid_search_results = json.load(f)

In [48]:
df_res= pd.DataFrame()

for model in grid_search_results:
    temp = grid_search_results[model]
    temp = pd.DataFrame.from_dict(temp)
    temp = temp.transpose()
    temp['model'] = model
    df_res = pd.concat([df_res, temp])

In [49]:
df_res = df_res.reset_index()
df_res

Unnamed: 0,index,mean_train_f1,mean_cv_val_f1,mean_val2_f1,mean_train_auc,mean_cv_val_auc,mean_val2_auc,f1_diff_cv_val,auc_diff_cv_val,f1_diff_val2,auc_diff_val2,params,model
0,"{'n_estimators': 10, 'learning_rate': 0.001, '...",0.0,0.0,0.0,0.812909,0.809274,0.812805,0.0,0.003635,0.0,0.000104,"{'n_estimators': 10, 'learning_rate': 0.001, '...",lightgbm
1,"{'n_estimators': 10, 'learning_rate': 0.001, '...",0.0,0.0,0.0,0.830211,0.823961,0.824625,0.0,0.00625,0.0,0.005586,"{'n_estimators': 10, 'learning_rate': 0.001, '...",lightgbm
2,"{'n_estimators': 10, 'learning_rate': 0.001, '...",0.0,0.0,0.0,0.846949,0.836175,0.836586,0.0,0.010774,0.0,0.010363,"{'n_estimators': 10, 'learning_rate': 0.001, '...",lightgbm
3,"{'n_estimators': 10, 'learning_rate': 0.001, '...",0.0,0.0,0.0,0.855941,0.84082,0.839018,0.0,0.01512,0.0,0.016923,"{'n_estimators': 10, 'learning_rate': 0.001, '...",lightgbm
4,"{'n_estimators': 10, 'learning_rate': 0.001, '...",0.0,0.0,0.0,0.861494,0.843364,0.841387,0.0,0.01813,0.0,0.020106,"{'n_estimators': 10, 'learning_rate': 0.001, '...",lightgbm
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4206,"{'iterations': 1000, 'learning_rate': 0.3, 'de...",0.999714,0.76464,0.763285,1.0,0.8873,0.889363,0.235074,0.1127,0.236429,0.110637,"{'iterations': 1000, 'learning_rate': 0.3, 'de...",catboost
4207,"{'iterations': 1000, 'learning_rate': 0.3, 'de...",0.999709,0.765761,0.765442,1.0,0.889546,0.890658,0.233948,0.110454,0.234267,0.109342,"{'iterations': 1000, 'learning_rate': 0.3, 'de...",catboost
4208,"{'iterations': 1000, 'learning_rate': 0.3, 'de...",0.999709,0.764302,0.757077,1.0,0.889051,0.88988,0.235407,0.110949,0.242632,0.11012,"{'iterations': 1000, 'learning_rate': 0.3, 'de...",catboost
4209,"{'iterations': 1000, 'learning_rate': 0.3, 'de...",0.999649,0.766239,0.761497,1.0,0.888903,0.892334,0.23341,0.111096,0.238152,0.107666,"{'iterations': 1000, 'learning_rate': 0.3, 'de...",catboost


In [100]:
top_auc = df_res[(df_res['auc_diff_val2']<=0.05)].sort_values(by='mean_val2_auc', ascending=False).head(500)
top_auc

Unnamed: 0,index,mean_train_f1,mean_cv_val_f1,mean_val2_f1,mean_train_auc,mean_cv_val_auc,mean_val2_auc,f1_diff_cv_val,auc_diff_cv_val,f1_diff_val2,auc_diff_val2,params,model
4052,"{'iterations': 1000, 'learning_rate': None, 'd...",0.829995,0.773603,0.759469,0.94081,0.900736,0.898911,0.056392,0.040074,0.070525,0.041899,"{'iterations': 1000, 'learning_rate': None, 'd...",catboost
3880,"{'iterations': 750, 'learning_rate': None, 'de...",0.831743,0.773363,0.758881,0.942406,0.900729,0.898754,0.058379,0.041677,0.072862,0.043651,"{'iterations': 750, 'learning_rate': None, 'de...",catboost
3877,"{'iterations': 750, 'learning_rate': None, 'de...",0.839949,0.774278,0.75399,0.947142,0.900998,0.898716,0.065671,0.046144,0.085959,0.048426,"{'iterations': 750, 'learning_rate': None, 'de...",catboost
3878,"{'iterations': 750, 'learning_rate': None, 'de...",0.836801,0.773748,0.757124,0.945203,0.90099,0.89848,0.063054,0.044213,0.079678,0.046723,"{'iterations': 750, 'learning_rate': None, 'de...",catboost
3879,"{'iterations': 750, 'learning_rate': None, 'de...",0.83389,0.774072,0.757938,0.943614,0.90092,0.898404,0.059817,0.042695,0.075952,0.045211,"{'iterations': 750, 'learning_rate': None, 'de...",catboost
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3140,"{'n_estimators': 250, 'learning_rate': 0.3, 'm...",0.802256,0.766123,0.754422,0.921554,0.892449,0.890813,0.036133,0.029105,0.047833,0.030741,"{'n_estimators': 250, 'learning_rate': 0.3, 'm...",xgboost
2757,"{'n_estimators': 250, 'learning_rate': None, '...",0.802256,0.766123,0.754422,0.921554,0.892449,0.890813,0.036133,0.029105,0.047833,0.030741,"{'n_estimators': 250, 'learning_rate': None, '...",xgboost
3142,"{'n_estimators': 250, 'learning_rate': 0.3, 'm...",0.802256,0.766123,0.754422,0.921554,0.892449,0.890813,0.036133,0.029105,0.047833,0.030741,"{'n_estimators': 250, 'learning_rate': 0.3, 'm...",xgboost
3686,"{'iterations': 500, 'learning_rate': None, 'de...",0.782312,0.764931,0.745099,0.909788,0.894859,0.890811,0.01738,0.014929,0.037212,0.018977,"{'iterations': 500, 'learning_rate': None, 'de...",catboost


In [101]:
topf1 = df_res[(df_res['f1_diff_val2']<=0.05)].sort_values(by='mean_val2_f1', ascending=False).head(500)
topf1

Unnamed: 0,index,mean_train_f1,mean_cv_val_f1,mean_val2_f1,mean_train_auc,mean_cv_val_auc,mean_val2_auc,f1_diff_cv_val,auc_diff_cv_val,f1_diff_val2,auc_diff_val2,params,model
3647,"{'iterations': 250, 'learning_rate': 0.3, 'dep...",0.802857,0.768232,0.761145,0.921848,0.894732,0.894043,0.034625,0.027116,0.041712,0.027805,"{'iterations': 250, 'learning_rate': 0.3, 'dep...",catboost
3822,"{'iterations': 500, 'learning_rate': 0.3, 'dep...",0.8057,0.769789,0.758422,0.923521,0.894439,0.89364,0.035912,0.029082,0.047278,0.029881,"{'iterations': 500, 'learning_rate': 0.3, 'dep...",catboost
596,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",0.797419,0.764862,0.758201,0.921306,0.894626,0.891613,0.032557,0.02668,0.039217,0.029694,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",lightgbm
422,"{'n_estimators': 50, 'learning_rate': 0.1, 'ma...",0.799671,0.763178,0.758008,0.922192,0.892836,0.891035,0.036493,0.029356,0.041663,0.031157,"{'n_estimators': 50, 'learning_rate': 0.1, 'ma...",lightgbm
3478,"{'iterations': 100, 'learning_rate': 0.3, 'dep...",0.8026,0.765728,0.757797,0.922085,0.893807,0.894511,0.036873,0.028278,0.044803,0.027574,"{'iterations': 100, 'learning_rate': 0.3, 'dep...",catboost
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,"{'n_estimators': 50, 'learning_rate': 0.3, 'ma...",0.783172,0.760772,0.749915,0.908794,0.88974,0.887904,0.022401,0.019054,0.033257,0.02089,"{'n_estimators': 50, 'learning_rate': 0.3, 'ma...",lightgbm
1793,"{'n_estimators': 25, 'learning_rate': 0.3, 'ma...",0.797743,0.759356,0.749896,0.919888,0.889547,0.887524,0.038388,0.030341,0.047847,0.032364,"{'n_estimators': 25, 'learning_rate': 0.3, 'ma...",xgboost
1749,"{'n_estimators': 25, 'learning_rate': 0.3, 'ma...",0.797743,0.759356,0.749896,0.919888,0.889547,0.887524,0.038388,0.030341,0.047847,0.032364,"{'n_estimators': 25, 'learning_rate': 0.3, 'ma...",xgboost
1408,"{'n_estimators': 25, 'learning_rate': None, 'm...",0.797743,0.759356,0.749896,0.919888,0.889547,0.887524,0.038388,0.030341,0.047847,0.032364,"{'n_estimators': 25, 'learning_rate': None, 'm...",xgboost


In [102]:
top_both = pd.merge(top_auc, topf1, on=['index'], how='inner')

# keep _x columns
top_both = top_both[[col for col in top_both.columns if '_x' in col]]
top_both.columns = [col.replace('_x', '') for col in top_both.columns]

In [107]:
top_both['sum_diff_val2'] = top_both['f1_diff_val2'] + top_both['auc_diff_val2']
top_both['sum_auc_f1_val2'] = top_both['mean_val2_f1'] + top_both['mean_val2_auc']

top_both = top_both.sort_values(by='sum_auc_f1_val2', ascending=False).head(1000).reset_index(drop=True)
top_both

Unnamed: 0,mean_train_f1,mean_cv_val_f1,mean_val2_f1,mean_train_auc,mean_cv_val_auc,mean_val2_auc,f1_diff_cv_val,auc_diff_cv_val,f1_diff_val2,auc_diff_val2,params,model,sum_diff_val2,sum_auc_f1_val2
0,0.802857,0.768232,0.761145,0.921848,0.894732,0.894043,0.034625,0.027116,0.041712,0.027805,"{'iterations': 250, 'learning_rate': 0.3, 'dep...",catboost,0.069517,1.655188
1,0.8026,0.765728,0.757797,0.922085,0.893807,0.894511,0.036873,0.028278,0.044803,0.027574,"{'iterations': 100, 'learning_rate': 0.3, 'dep...",catboost,0.072377,1.652309
2,0.8057,0.769789,0.758422,0.923521,0.894439,0.89364,0.035912,0.029082,0.047278,0.029881,"{'iterations': 500, 'learning_rate': 0.3, 'dep...",catboost,0.077159,1.652062
3,0.805877,0.767542,0.757205,0.92497,0.895469,0.894144,0.038335,0.029501,0.048672,0.030826,"{'iterations': 100, 'learning_rate': 0.2, 'dep...",catboost,0.079498,1.651349
4,0.807355,0.771077,0.757561,0.924299,0.895608,0.893656,0.036278,0.02869,0.049793,0.030642,"{'iterations': 750, 'learning_rate': 0.2, 'dep...",catboost,0.080436,1.651217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,0.791896,0.765415,0.751325,0.913977,0.893022,0.891201,0.026481,0.020956,0.040571,0.022776,"{'n_estimators': 250, 'learning_rate': 0.2, 'm...",xgboost,0.063347,1.642526
118,0.789679,0.764834,0.75111,0.913707,0.894245,0.891345,0.024845,0.019462,0.038569,0.022362,"{'iterations': 100, 'learning_rate': 0.2, 'dep...",catboost,0.060931,1.642455
119,0.788645,0.768804,0.750743,0.91145,0.894827,0.891666,0.019841,0.016623,0.037902,0.019784,"{'iterations': 750, 'learning_rate': 0.1, 'dep...",catboost,0.057686,1.642409
120,0.784959,0.765296,0.749987,0.910561,0.895226,0.891362,0.019663,0.015335,0.034971,0.019199,"{'iterations': 250, 'learning_rate': 0.1, 'dep...",catboost,0.05417,1.64135


In [108]:
params, model_name = top_both['params'][0], top_both['model'][0]
print('Winner model: |{}|'.format(model_name), 'with params: {}'.format(params))

Winner model: |catboost| with params: {'iterations': 250, 'learning_rate': 0.3, 'depth': 4, 'l2_leaf_reg': 3}


In [109]:
# train model with best params
model = return_model_with_param(model_name, params)
model.fit(X_train_scaled, y_train)

# save model
import pickle
with open('predictive_models/{}.pkl'.format(model_name), 'wb') as f:
    pickle.dump(model, f)


# save scaler
with open('predictive_models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [110]:
# calculate metrics on test set
from sklearn.metrics import roc_auc_score, f1_score

X_test = np.vstack(df[df['sample']=='test']['title_vector'])
y_test = df[df['sample']=='test']['is_clickbait']

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:,1]

print('F1 score: {}'.format(f1_score(y_test, y_pred)))
print('AUC score: {}'.format(roc_auc_score(y_test, y_pred_proba)))

F1 score: 0.7622803872355683
AUC score: 0.8899570071794486


In [111]:
# find best threshold for f1 score
import numpy as np
from tqdm.auto import tqdm
# find threshold for best f1 score
thresholds = np.linspace(0, 1, 10000)
f1s = dict()
for threshold in tqdm(thresholds):
    y_pred = (model.predict_proba(X_test_scaled)[:, 1] >= threshold).astype(int)
    f1 = f1_score(y_test, y_pred)
    f1s[threshold] = f1

# get threshold for best f1 score
threshold = max(f1s, key=f1s.get)
threshold

  0%|          | 0/10000 [00:00<?, ?it/s]

0.43534353435343537

In [112]:
# find auc and f1 for best threshold
y_pred = (model.predict_proba(X_test_scaled)[:, 1] >= threshold).astype(int)
y_proba = model.predict_proba(X_test_scaled)[:, 1]

roc_auc = roc_auc_score(y_test, y_proba)
f1 = f1_score(y_test, y_pred)

print(f'ROC AUC test: {roc_auc}')
print(f'F1 test: {f1}')

ROC AUC test: 0.8899570071794486
F1 test: 0.7718097844680124


In [113]:
# find auc and f1 for best threshold on train
y_pred = (model.predict_proba(X_train_scaled)[:, 1] >= threshold).astype(int)
y_proba = model.predict_proba(X_train_scaled)[:, 1]

roc_auc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

print(f'ROC AUC train: {roc_auc}')
print(f'F1 train: {f1}')

ROC AUC train: 0.917700831372996
F1 train: 0.8035177218563215


In [114]:
# save threshold to txt file
with open('predictive_models/threshold.txt', 'w') as f:
    f.write(str(threshold))