In [1]:
import ssl

# NOTE(review): this replaces the stdlib's default HTTPS context factory with the
# unverified one, i.e. certificate verification is switched off for every later
# HTTPS request made through the default context in this kernel. Presumably a
# workaround so the nltk.download calls below succeed — confirm it is still needed
# and prefer fixing the local certificate store instead.
ssl._create_default_https_context = ssl._create_unverified_context


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

%run '../functions.py'
%run '../classes.py'

In [3]:
# save_path: JSON file the grid-search results are written to (and re-read from).
# grids_path: JSON file holding the parameter grids that are loaded further down.
save_path = 'grid_search_results_new.json'
grids_path = '../grid_search_grids.json'

In [4]:
# return_best_model is provided by the helper files %run above; presumably it returns
# the settings dict of the previously selected Word2Vec model — TODO confirm.
# The only key used in this notebook is 'model_path'.
model_w2v_settings = return_best_model()

In [5]:
# Point the stored model path at the shared datasets directory.
# The guard keeps this cell idempotent: re-running it must not prepend the
# directory a second time and produce a path that does not exist.
w2v_dir = '../all_datasets/'
if not model_w2v_settings['model_path'].startswith(w2v_dir):
    model_w2v_settings['model_path'] = w2v_dir + model_w2v_settings['model_path']

# Word2VecModel comes from the helper files %run at the top of the notebook.
model_w2v = Word2VecModel(model_w2v_settings)

In [None]:
# read preprocessed data from pickle file
# NOTE(review): unpickling can execute arbitrary code — only load files that were
# produced by this project.
# Columns used later in the notebook: 'title', 'sample', 'is_clickbait'.
df = pd.read_pickle('data/preprocessed_titles_labels.pkl')
df.head()

In [None]:
# Row count per split label; the labels used below are 'train', 'val2' and 'test'.
df['sample'].value_counts()


In [None]:
# Embed every title with the Word2Vec model (get_word_vectors is called with
# aggregation='mean') and store the result in a new 'title_vector' column.
# No splitting happens in this cell; the split by 'sample' is done further down.
# NOTE(review): StandardScaler and pickle are used by later cells; os is not used
# in the visible code.
from sklearn.preprocessing import StandardScaler
import os
import pickle

df['title_vector'] = [get_word_vectors(model_w2v, title, aggregation='mean') for title in df['title']]
# Renumber rows 0..n-1 so the label-based lookups below (e.g. [0]) hit the first row.
df = df.reset_index(drop=True)
df.head()

In [9]:
# Inspect the embedding of the first title (before any dimensions are dropped).
df['title_vector'][0]

array([-0.63768053, -0.16730867,  1.2560811 ,  0.3338902 , -0.56454986,
        0.44497627,  0.784385  ,  0.3551108 , -0.8828402 ,  0.59592795,
        0.1280393 , -0.64912486,  0.81945324, -0.3996215 ,  0.32769346,
        1.2330511 ,  0.67513835, -0.43883395,  0.24185432, -0.9126102 ],
      dtype=float32)

In [10]:
# Positions of the embedding dimensions to discard. get_dimensions_to_drop comes
# from the helper files %run above; how the positions were chosen is not visible here.
variables_to_drop = get_dimensions_to_drop()
variables_to_drop

[10, 6, 4, 12, 13, 19, 18, 16, 14, 11, 8, 9]

In [11]:
# Apply drop_dimensions_from_vector to every title vector, overwriting the column.
# NOTE(review): because the column is overwritten in place, running this cell a
# second time applies the drop to already-shortened vectors — presumably not
# intended; re-run from the embedding cell instead.
df['title_vector'] = [drop_dimensions_from_vector(vector, variables_to_drop) for vector in df['title_vector']]

In [12]:
# Inspect the first title's vector again, now with the selected dimensions removed.
df['title_vector'][0]

array([-0.63768053, -0.16730867,  1.2560811 ,  0.3338902 ,  0.44497627,
        0.3551108 ,  1.2330511 , -0.43883395], dtype=float32)

In [13]:
# Select each split once, then derive the feature matrix and labels from it.
# Note that X_test / y_test hold the 'val2' split here, not the 'test' split.
train_rows = df.loc[df['sample'] == 'train']
val2_rows = df.loc[df['sample'] == 'val2']

X_train = np.vstack(train_rows['title_vector'])
y_train = train_rows['is_clickbait']

X_test = np.vstack(val2_rows['title_vector'])
y_test = val2_rows['is_clickbait']

# Standardise features: statistics are learned on train only and reused for val2.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
import json

# Read the grid definitions; the per-model grids live under 'grid_search_grids'.
with open(grids_path, 'r') as grids_file:
    model_settings = json.load(grids_file)

grids = model_settings['grid_search_grids']
grids

In [15]:
# Collect the models that still need a search: a grid that already carries
# 'best_params' counts as trained and is only reported. Grid keys end in
# '_grid'; that marker is stripped to obtain the model name.
models_to_CV = []
for grid_key, grid in grids.items():
    if 'best_params' not in grid:
        models_to_CV.append(grid_key.replace('_grid', ''))
    else:
        print(f'Model {grid_key} already trained')

models_to_CV

['decision_tree', 'catboost', 'lightgbm', 'xgboost', 'random_forest']

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [17]:
def return_model_with_param(model_name, param=None):
    """Build an unfitted classifier for ``model_name`` configured with ``param``.

    Args:
        model_name: One of 'decision_tree', 'random_forest', 'xgboost',
            'lightgbm' or 'catboost'.
        param: Keyword arguments forwarded to the classifier constructor.
            ``None`` (the default) means no extra arguments.

    Returns:
        A new, unfitted classifier instance.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Work on a private copy: avoids a shared mutable default and never
    # touches the caller's dict.
    param = dict(param) if param else {}

    if model_name == 'decision_tree':
        return DecisionTreeClassifier(**param)
    if model_name == 'random_forest':
        return RandomForestClassifier(**param)
    if model_name == 'xgboost':
        return XGBClassifier(**param)
    if model_name == 'lightgbm':
        # Quiet by default; an explicit 'verbose' in param wins instead of
        # raising a duplicate-keyword TypeError.
        return LGBMClassifier(**{'verbose': -1, **param})
    if model_name == 'catboost':
        return CatBoostClassifier(**{'verbose': 0, **param})

    # Previously an unknown name fell through to `return model` and failed with
    # an UnboundLocalError that did not mention the offending name.
    raise ValueError(
        "Unknown model name {!r}; expected one of 'decision_tree', 'random_forest', "
        "'xgboost', 'lightgbm', 'catboost'".format(model_name)
    )

In [18]:
# Explicit list (and order) of the models to search; replaces any value
# models_to_CV had before this cell ran.
models_to_CV = ['catboost', 'lightgbm', 'xgboost', 'decision_tree', 'random_forest']

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from tqdm.notebook import tqdm

import itertools

# Resume from a previous run if a results file exists; otherwise start empty.
try:
    with open(save_path, 'r') as f:
        grid_search_results = json.load(f)
    print('Loaded existing results')
except (FileNotFoundError, json.JSONDecodeError):
    # Only a missing or unparsable file means "start over"; anything else
    # (KeyboardInterrupt, permission problems, ...) must surface.
    grid_search_results = dict()
    print('No existing results found - creating new dict')

# Models searched in this run start from a clean slate; results loaded for any
# other model are kept as they are.
for model_name in models_to_CV:
    grid_search_results[model_name] = dict()

# The fold loop used to `break` after the first split, so every "mean_*" value
# stored below is really a single-fold score. That behaviour is kept (1 fold)
# but made explicit; raise the value (up to n_splits) to average over more folds.
n_folds_to_run = 1

# NOTE(review): X_test_scaled / y_test are expected to hold the 'val2' split here.
# A later cell rebinds both names to the 'test' split, so re-running this cell
# after that one would silently score on 'test' instead.
for model_name in tqdm(models_to_CV, desc = 'Models'):

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Expand this model's grid into every combination of parameter values.
    grid = grids[model_name+'_grid']
    keys, values = zip(*grid.items())
    combinations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # Pre-create one empty entry per combination, keyed by the dict's string form.
    for params in combinations_dicts:
        grid_search_results[model_name][str(params)] = dict()

    for params in tqdm(combinations_dicts, desc = 'Grid combinations search for model {}'.format(model_name)):

        model = return_model_with_param(model_name, params)
        scores_auc_cv_val = list()
        scores_auc_val2 = list()
        scores_auc_train = list()

        scores_f1_cv_val = list()
        scores_f1_val2 = list()
        scores_f1_train = list()

        folds = itertools.islice(kf.split(X_train_scaled, y_train), n_folds_to_run)
        for train_index, val_index in folds:
            X_train_kf = X_train_scaled[train_index]
            X_val_kf = X_train_scaled[val_index]
            y_train_kf = y_train.iloc[train_index]
            y_val_kf = y_train.iloc[val_index]

            model.fit(X_train_kf, y_train_kf)

            # AUC from the positive-class probability; each set is scored once.
            scores_auc_cv_val.append(roc_auc_score(y_val_kf, model.predict_proba(X_val_kf)[:,1]))
            scores_auc_val2.append(roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:,1]))
            scores_auc_train.append(roc_auc_score(y_train_kf, model.predict_proba(X_train_kf)[:,1]))

            # F1 from the model's own hard predictions.
            scores_f1_cv_val.append(f1_score(y_val_kf, model.predict(X_val_kf)))
            scores_f1_val2.append(f1_score(y_test, model.predict(X_test_scaled)))
            scores_f1_train.append(f1_score(y_train_kf, model.predict(X_train_kf)))

        mean_train_f1 = np.mean(scores_f1_train)
        mean_cv_val_f1 = np.mean(scores_f1_cv_val)
        mean_val2_f1 = np.mean(scores_f1_val2)

        mean_train_auc = np.mean(scores_auc_train)
        mean_cv_val_auc = np.mean(scores_auc_cv_val)
        mean_val2_auc = np.mean(scores_auc_val2)

        # Fill the pre-created entry; key order determines the column order of
        # the results table built later.
        result = grid_search_results[model_name][str(params)]
        result['mean_train_f1'] = mean_train_f1
        result['mean_cv_val_f1'] = mean_cv_val_f1
        result['mean_val2_f1'] = mean_val2_f1

        result['mean_train_auc'] = mean_train_auc
        result['mean_cv_val_auc'] = mean_cv_val_auc
        result['mean_val2_auc'] = mean_val2_auc

        # Train-minus-validation gaps: larger values mean more overfitting.
        result['f1_diff_cv_val'] = mean_train_f1 - mean_cv_val_f1
        result['auc_diff_cv_val'] = mean_train_auc - mean_cv_val_auc

        result['f1_diff_val2'] = mean_train_f1 - mean_val2_f1
        result['auc_diff_val2'] = mean_train_auc - mean_val2_auc

        result['params'] = params

        # Checkpoint after every combination so an interrupted run loses little.
        with open(save_path, 'w') as f:
            json.dump(grid_search_results, f)

    print(grid_search_results[model_name])
    
# grid_search_results

In [20]:
# Persist the complete results dictionary as JSON.
with open(save_path, 'w') as results_file:
    json.dump(grid_search_results, results_file)

In [21]:
# Read the results back from disk, so the analysis below works from the saved file.
with open(save_path, 'r') as results_file:
    grid_search_results = json.load(results_file)

In [22]:
# One frame per model: rows are parameter combinations (labelled by their
# string form), columns are the recorded metrics plus the model's name.
# The frames are concatenated once instead of growing df_res inside a loop,
# and no loop variable overwrites the notebook-level `model`.
per_model_frames = [
    pd.DataFrame.from_dict(model_results).transpose().assign(model=model_key)
    for model_key, model_results in grid_search_results.items()
]

# pd.concat rejects an empty list, so keep the old "empty frame" result for that case.
df_res = pd.concat(per_model_frames) if per_model_frames else pd.DataFrame()

In [23]:
# Move the row labels (the string form of each parameter combination) into a regular
# 'index' column and number the rows 0..n-1.
# NOTE(review): not idempotent — running this cell again adds yet another column.
df_res = df_res.reset_index()

In [None]:
# Shortlist by AUC: keep configurations whose train-minus-val2 AUC gap is at
# most 0.05, then take the 500 with the highest val2 AUC.
small_auc_gap = df_res['auc_diff_val2'] <= 0.05
top_auc = (
    df_res.loc[small_auc_gap]
    .sort_values(by='mean_val2_auc', ascending=False)
    .head(500)
)
top_auc

In [None]:
# Shortlist by F1: keep configurations whose train-minus-val2 F1 gap is at
# most 0.05, then take the 500 with the highest val2 F1.
small_f1_gap = df_res['f1_diff_val2'] <= 0.05
topf1 = (
    df_res.loc[small_f1_gap]
    .sort_values(by='mean_val2_f1', ascending=False)
    .head(500)
)
topf1

In [26]:
# Configurations that made both shortlists, in top_auc order.
# top_auc and topf1 are row subsets of df_res, so a shared row label identifies
# exactly one (model, params) entry. Intersecting on row labels avoids two
# pitfalls of merging on the 'index' string alone: different models with the
# same params string would be cross-matched, and the "_x"/"_y" suffix clean-up
# matched the substring anywhere in a column name.
in_both_shortlists = top_auc.index.isin(topf1.index)

# The 'index' column (string form of the params) is dropped, as before.
top_both = (
    top_auc.loc[in_both_shortlists]
    .drop(columns='index')
    .reset_index(drop=True)
)

In [27]:
# Add two summary columns — the combined train-vs-val2 gap and the combined
# val2 score — then rank by the combined score (at most 1000 rows kept).
top_both = (
    top_both
    .assign(
        sum_diff_val2=lambda frame: frame['f1_diff_val2'] + frame['auc_diff_val2'],
        sum_auc_f1_val2=lambda frame: frame['mean_val2_f1'] + frame['mean_val2_auc'],
    )
    .sort_values(by='sum_auc_f1_val2', ascending=False)
    .head(1000)
    .reset_index(drop=True)
)
top_both

Unnamed: 0,mean_train_f1,mean_cv_val_f1,mean_val2_f1,mean_train_auc,mean_cv_val_auc,mean_val2_auc,f1_diff_cv_val,auc_diff_cv_val,f1_diff_val2,auc_diff_val2,params,model,sum_diff_val2,sum_auc_f1_val2
0,1.0,0.952613,0.95778,1.0,0.988746,0.989432,0.047387,0.011254,0.04222,0.010568,"{'n_estimators': 250, 'learning_rate': 0.2, 'm...",lightgbm,0.052787,1.947213
1,0.988469,0.951635,0.956522,0.999448,0.989098,0.989551,0.036833,0.01035,0.031947,0.009897,"{'n_estimators': 50, 'learning_rate': 0.2, 'ma...",lightgbm,0.041844,1.946073
2,1.0,0.952066,0.954106,1.0,0.988705,0.990702,0.047934,0.011295,0.045894,0.009298,"{'n_estimators': 500, 'learning_rate': None, '...",lightgbm,0.055192,1.944808
3,1.0,0.952066,0.954106,1.0,0.988705,0.990702,0.047934,0.011295,0.045894,0.009298,"{'n_estimators': 500, 'learning_rate': 0.1, 'm...",lightgbm,0.055192,1.944808
4,1.0,0.951767,0.955368,1.0,0.987997,0.989354,0.048233,0.012003,0.044632,0.010646,"{'n_estimators': 500, 'learning_rate': 0.2, 'm...",lightgbm,0.055278,1.944722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,0.98004,0.953076,0.951574,0.998091,0.989728,0.988913,0.026964,0.008363,0.028466,0.009178,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",xgboost,0.037644,1.940487
188,0.965586,0.953375,0.951574,0.994584,0.989774,0.9889,0.012211,0.00481,0.014012,0.005684,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",xgboost,0.019697,1.940474
189,0.965586,0.953375,0.951574,0.994584,0.989774,0.9889,0.012211,0.00481,0.014012,0.005684,"{'n_estimators': 100, 'learning_rate': 0.1, 'm...",xgboost,0.019697,1.940474
190,0.981665,0.953424,0.951691,0.998486,0.988779,0.988775,0.028241,0.009706,0.029974,0.009711,"{'n_estimators': 100, 'learning_rate': 0.2, 'm...",xgboost,0.039684,1.940466


In [28]:
# The row labelled 0 is the best-ranked configuration after the sort above.
winner = top_both.loc[0]
params = winner['params']
model_name = winner['model']
print('Winner model: |{}|'.format(model_name), 'with params: {}'.format(params))

Winner model: |lightgbm| with params: {'n_estimators': 250, 'learning_rate': 0.2, 'max_depth': 8, 'num_leaves': 93}


In [29]:
# train model with best params
# A fresh estimator is built for the winning model name / parameters and fitted on
# the complete scaled 'train' split (no folds held out this time).
model = return_model_with_param(model_name, params)
model.fit(X_train_scaled, y_train)

In [30]:
# Pickle the fitted model; the file is named after the winning model.
import pickle

model_file_path = 'predictive_models/{}.pkl'.format(model_name)
with open(model_file_path, 'wb') as model_file:
    pickle.dump(model, model_file)


In [31]:
# Pickle the fitted scaler next to the model so inference can reuse the same scaling.
with open('predictive_models/scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [32]:
# Evaluate the final model on the held-out 'test' split.
from sklearn.metrics import roc_auc_score, f1_score

# From this cell on, X_test / y_test / X_test_scaled refer to the 'test' split
# (they previously held 'val2').
test_rows = df.loc[df['sample'] == 'test']
X_test = np.vstack(test_rows['title_vector'])
y_test = test_rows['is_clickbait']

# Reuse the scaler fitted on the training split.
X_test_scaled = scaler.transform(X_test)

y_pred_proba = model.predict_proba(X_test_scaled)[:,1]
y_pred = model.predict(X_test_scaled)

print('F1 score: {}'.format(f1_score(y_test, y_pred)))
print('AUC score: {}'.format(roc_auc_score(y_test, y_pred_proba)))

F1 score: 0.9509011808576756
AUC score: 0.988110506000508


In [None]:
# find best threshold for f1 score
import numpy as np
from tqdm.auto import tqdm

# NOTE(review): the threshold is tuned on X_test_scaled / y_test, which at this
# point hold the 'test' split, and the following cell reports metrics on that
# same split — those numbers are therefore optimistic. Consider tuning on 'val2'.

# The predicted probabilities do not depend on the threshold, so compute them
# once instead of once per candidate threshold.
proba_positive = model.predict_proba(X_test_scaled)[:, 1]

# Evaluate F1 on an evenly spaced grid of 10000 thresholds in [0, 1].
thresholds = np.linspace(0, 1, 10000)
f1s = dict()
for threshold in tqdm(thresholds):
    y_pred = (proba_positive >= threshold).astype(int)
    f1s[threshold] = f1_score(y_test, y_pred)

# get threshold for best f1 score (on ties max() keeps the first, i.e. lowest, one)
threshold = max(f1s, key=f1s.get)
threshold

In [34]:
# AUC, F1 and accuracy on X_test_scaled / y_test using the tuned threshold.
from sklearn.metrics import accuracy_score

# Score once; the hard labels are derived from the same probabilities.
y_proba = model.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

roc_auc = roc_auc_score(y_test, y_proba)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'ROC AUC test: {roc_auc}')
print(f'F1 test: {f1}')
print(f'Accuracy test: {accuracy}')

ROC AUC test: 0.988110506000508
F1 test: 0.9556527170518425
Accuracy test: 0.955818294959552


In [35]:
# Same three metrics on the training split, using the tuned threshold.
y_proba = model.predict_proba(X_train_scaled)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

roc_auc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)
accuracy = accuracy_score(y_train, y_pred)

print(f'ROC AUC train: {roc_auc}')
print(f'F1 train: {f1}')
print(f'Accuracy train: {accuracy}')

ROC AUC train: 1.0
F1 train: 1.0
Accuracy train: 1.0


In [36]:
# Store the tuned decision threshold as plain text alongside the model files.
with open('predictive_models/threshold.txt', 'w') as threshold_file:
    threshold_file.write(str(threshold))

In [37]:
def get_undropped_dimensions(dropped_dimensions, n_dim):
    """Return the positions in ``range(n_dim)`` not listed in ``dropped_dimensions``.

    Args:
        dropped_dimensions: Collection of dimension positions that were removed.
        n_dim: Total number of dimensions before anything was removed.

    Returns:
        List of the remaining positions, in ascending order.
    """
    kept_dimensions = []
    for position in range(n_dim):
        if position in dropped_dimensions:
            continue
        kept_dimensions.append(position)
    return kept_dimensions

# Labels ('dim_<position>') for the dimensions that survived the drop.
# 20 is the length of the title vectors before any dimensions were removed.
undropped_dimensions = np.array(
    ['dim_' + str(position) for position in get_undropped_dimensions(variables_to_drop, 20)]
)
undropped_dimensions

array(['dim_0', 'dim_1', 'dim_2', 'dim_3', 'dim_5', 'dim_7', 'dim_15',
       'dim_17'], dtype='<U6')

In [38]:
# Wrap the scaled training matrix in a DataFrame whose columns carry the 'dim_<position>'
# labels, so the explainer below reports features by name.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=undropped_dimensions)

In [39]:
import dalex
# Create an explainer object
# It wraps the fitted model together with the scaled training features and labels.
exp = dalex.Explainer(model, X_train_scaled_df, y_train, label='Champion Model')

# Calculate feature importance
# NOTE(review): model_parts() presumably computes permutation-based importance with
# dalex defaults — confirm against the dalex docs. max_vars=27 exceeds the 8 feature
# columns, so it should not hide any of them.
feature_importance = exp.model_parts()
feature_importance.plot(max_vars=27)


Preparation of a new explainer is initiated

  -> data              : 28836 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 28836 values
  -> model_class       : lightgbm.sklearn.LGBMClassifier (default)
  -> label             : Champion Model
  -> predict function  : <function yhat_proba_default at 0x2b9d737e0> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 2.81e-10, mean = 0.498, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.0711, mean = -2.77e-07, max = 0.0999
  -> model_info        : package lightgbm

A new explainer has been created!
