In [None]:
%run -i "../classes.py"
%run -i "../functions.py"

import pickle
import pandas as pd

In [2]:
def _best_word2vec(vector_size, window_size, is_skipgram):
    """Build the settings dict for one Word2Vec model.

    The model file name encodes the hyper-parameters as
    ``word2vec_vs<vector_size>_win<window_size>_sg<0|1>.model``.

    Returns a dict with the single key ``"best_word2vec"`` mapping to the
    model path and the three hyper-parameters.
    """
    file_name = f"word2vec_vs{vector_size}_win{window_size}_sg{int(is_skipgram)}.model"
    return {
        "best_word2vec": {
            "model_path": "../all_datasets/word2vec_models/" + file_name,
            "vector_size": vector_size,
            "window_size": window_size,
            "is_skipgram": is_skipgram,
        }
    }


# NOTE(review): all three model files are read from ../all_datasets/word2vec_models/,
# including the clickbait and fake-news ones — confirm that this is intended.
model_settings_all_datasets = _best_word2vec(vector_size=50, window_size=5, is_skipgram=False)

model_settings_clickbait_dataset = _best_word2vec(vector_size=20, window_size=4, is_skipgram=True)

model_settings_fake_news = _best_word2vec(vector_size=100, window_size=3, is_skipgram=False)


# One Word2VecModel wrapper per settings dict, instantiated in the same order as before.
# Word2VecModel is expected to come from ../classes.py (loaded via %run at the top).
(
    model_w2v_all_datasets,
    model_w2v_clickbait_dataset,
    model_w2v_fake_news,
) = (
    Word2VecModel(settings["best_word2vec"])
    for settings in (
        model_settings_all_datasets,
        model_settings_clickbait_dataset,
        model_settings_fake_news,
    )
)

In [3]:
# Collect, per experiment folder, the vector dimensions to remove before prediction.
# get_dimensions_to_drop is expected to come from ../functions.py; the result is an
# iterable (it is passed to sorted() and to drop_dimensions_from_vector further down).
(
    variables_to_drop_all_datasets,
    variables_to_drop_clickbait_dataset_small,
    variables_to_drop_clickbait_dataset_large,
    variables_to_drop_fake_news,
) = map(
    get_dimensions_to_drop,
    (
        "../all_datasets/",
        "../clickbait_dataset/",
        "../clickbait_dataset_more_variables/",
        "../fake_news/",
    ),
)

In [None]:
# Display, in sorted order, the dimensions to drop that were loaded from ../clickbait_dataset/.
sorted(variables_to_drop_clickbait_dataset_small)

In [None]:
# Display, in sorted order, the dimensions to drop that were loaded from
# ../clickbait_dataset_more_variables/.
sorted(variables_to_drop_clickbait_dataset_large)

In [None]:
# Load the preprocessed titles and labels.
# NOTE: pickle executes arbitrary code on load; only use files produced by this project.
with open('../all_datasets/data/preprocessed_titles_labels.pkl', 'rb') as fh:
    data = pickle.load(fh)

# Keep only the rows marked as the test sample and renumber them from 0.
df = pd.DataFrame(data)
is_test_row = df['sample'] == 'test'
df = df[is_test_row].reset_index(drop=True)

# Target column -> Word2Vec model used to embed every title (mean aggregation).
# Both clickbait columns use the same Word2Vec model.
w2v_for_column = {
    'title_vector_all-datasets': model_w2v_all_datasets,
    'title_vector_clickbait-dataset-small': model_w2v_clickbait_dataset,
    'title_vector_clickbait-dataset-large': model_w2v_clickbait_dataset,
    'title_vector_fake-news': model_w2v_fake_news,
}
for column_name, w2v_model in w2v_for_column.items():
    df[column_name] = [
        get_word_vectors(w2v_model, title, aggregation='mean')
        for title in df['title']
    ]
df.head()

In [None]:
# Vector column -> dimensions to remove from each vector in that column.
# NOTE(review): each column is overwritten with its reduced vectors, so running this
# cell a second time would apply the drop to already-reduced vectors — presumably not
# intended; confirm against drop_dimensions_from_vector and re-run from the cell above.
dims_to_drop_for_column = {
    'title_vector_all-datasets': variables_to_drop_all_datasets,
    'title_vector_clickbait-dataset-small': variables_to_drop_clickbait_dataset_small,
    'title_vector_clickbait-dataset-large': variables_to_drop_clickbait_dataset_large,
    'title_vector_fake-news': variables_to_drop_fake_news,
}
for column_name, dims_to_drop in dims_to_drop_for_column.items():
    df[column_name] = [
        drop_dimensions_from_vector(vector, dims_to_drop)
        for vector in df[column_name]
    ]

df.head()

In [8]:
import copy as cp

# Full test set: an independent copy, so later work on it cannot touch `df`.
df_all_datasets = cp.deepcopy(df)

# Per-source subsets, selected by the 'dataset' column and re-indexed from 0.
from_clickbait = df['dataset'] == 'clickbait-dataset'
from_fake_news = df['dataset'] == 'fake-news'
df_clickbait_dataset = df.loc[from_clickbait].reset_index(drop=True)
df_fake_news = df.loc[from_fake_news].reset_index(drop=True)

# Show the three shapes as a sanity check.
tuple(frame.shape for frame in (df_all_datasets, df_clickbait_dataset, df_fake_news))


((2590, 9), (1595, 9), (995, 9))

In [None]:
# Preview the first rows of the full test frame.
df_all_datasets.head()

In [10]:
# Axes of the evaluation grid. The names are reused by the cells below
# (including the spelling of `tresholds`), so they must stay as they are.
model_combinations = [
    'mod_all-datasets',
    'mod_fake-news',
    'mod_clickbait-dataset-small',
    'mod_clickbait-dataset-large',
]
dataset_combinations = [
    'data_all_datasets',
    'data_fake-news',
    'data_clickbait-dataset',
]
metrics = ['accuracy', 'precision', 'recall', 'f1-score', 'roc-auc']
tresholds = ['defaultThresh', 'optimizedF1Thresh']

# results[model][dataset][threshold][metric] -> score, pre-filled with None.
# dict.fromkeys runs once per (model, dataset, threshold), so every leaf dict is distinct.
results = {
    model_key: {
        dataset_key: {
            threshold_key: dict.fromkeys(metrics)
            for threshold_key in tresholds
        }
        for dataset_key in dataset_combinations
    }
    for model_key in model_combinations
}




In [11]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE: pickle executes arbitrary code on load; only use files produced by this project.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# One fitted classifier per experiment folder (predict_proba is called on them further down).
model_all_datasets = _load_pickle('../all_datasets/predictive_models/catboost.pkl')
model_clickbait_dataset_small = _load_pickle('../clickbait_dataset/predictive_models/catboost.pkl')
model_clickbait_dataset_large = _load_pickle('../clickbait_dataset_more_variables/predictive_models/catboost.pkl')
model_fake_news = _load_pickle('../fake_news/predictive_models/catboost.pkl')

In [12]:
def _read_threshold(path):
    """Return the decision threshold stored as a plain-text number in `path`.

    The file is opened in text mode with an explicit encoding and surrounding
    whitespace is stripped, instead of passing raw bytes to float().

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the content is not a valid floating-point number.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        return float(fh.read().strip())


# One stored threshold per experiment folder; used further down as the
# 'optimizedF1Thresh' alternative to the default cut-off of 0.5.
threshold_all_datasets = _read_threshold('../all_datasets/predictive_models/threshold.txt')
threshold_clickbait_dataset_small = _read_threshold('../clickbait_dataset/predictive_models/threshold.txt')
threshold_clickbait_dataset_large = _read_threshold('../clickbait_dataset_more_variables/predictive_models/threshold.txt')
threshold_fake_news = _read_threshold('../fake_news/predictive_models/threshold.txt')

In [None]:
# Display the four loaded thresholds side by side.
threshold_all_datasets, threshold_clickbait_dataset_small, threshold_clickbait_dataset_large, threshold_fake_news

In [14]:
# Fitted scalers, one per experiment folder, loaded in the order of the names below.
# NOTE: pickle executes arbitrary code on load; only use files produced by this project.
scaler_files = (
    '../all_datasets/predictive_models/scaler.pkl',
    '../clickbait_dataset/predictive_models/scaler.pkl',
    '../clickbait_dataset_more_variables/predictive_models/scaler.pkl',
    '../fake_news/predictive_models/scaler.pkl',
)

loaded_scalers = []
for scaler_file in scaler_files:
    with open(scaler_file, 'rb') as fh:
        loaded_scalers.append(pickle.load(fh))

(
    scaler_all_datasets,
    scaler_clickbait_dataset_small,
    scaler_clickbait_dataset_large,
    scaler_fake_news,
) = loaded_scalers


# Evaluation on the all-datasets test data

In [None]:
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Model name -> (fitted scaler, classifier, stored optimised threshold).
# A dict lookup raises KeyError for an unknown name; an if/elif chain without an else
# branch would instead silently reuse the artifacts of the previous iteration.
model_artifacts = {
    'mod_all-datasets': (scaler_all_datasets, model_all_datasets, threshold_all_datasets),
    'mod_fake-news': (scaler_fake_news, model_fake_news, threshold_fake_news),
    'mod_clickbait-dataset-small': (
        scaler_clickbait_dataset_small,
        model_clickbait_dataset_small,
        threshold_clickbait_dataset_small,
    ),
    'mod_clickbait-dataset-large': (
        scaler_clickbait_dataset_large,
        model_clickbait_dataset_large,
        threshold_clickbait_dataset_large,
    ),
}

# Dataset name -> evaluation frame. This cell only reads from the frames
# (np.vstack builds a new array), so no defensive copies are taken.
evaluation_frames = {
    'data_all_datasets': df_all_datasets,
    'data_fake-news': df_fake_news,
    'data_clickbait-dataset': df_clickbait_dataset,
}

for model in tqdm(model_combinations, desc='models'):
    print(model)
    scaler, predictor, opt_threshold = model_artifacts[model]

    # 'mod_<suffix>' -> 'title_vector_<suffix>': each model scores the vectors
    # built for it in the cells above.
    vector_column = 'title_vector_' + model.split('_')[1]

    for dataset in tqdm(dataset_combinations, desc='datasets'):
        df_temp = evaluation_frames[dataset]

        # Features, labels and predicted probabilities do not depend on the
        # decision threshold, so they are computed once per (model, dataset).
        X = np.vstack(df_temp[vector_column].values)
        y_true = df_temp['is_clickbait']
        X_scaled = scaler.transform(X)
        y_pred_proba = predictor.predict_proba(X_scaled)[:, 1]

        # ROC-AUC is computed from the probabilities, so it is the same for both thresholds.
        roc_auc = roc_auc_score(y_true, y_pred_proba)

        for threshold_name, threshold in (('defaultThresh', 0.5), ('optimizedF1Thresh', opt_threshold)):
            # Strictly greater than the threshold counts as the positive class.
            y_pred = (y_pred_proba > threshold).astype(int)

            results[model][dataset][threshold_name].update({
                'accuracy': accuracy_score(y_true, y_pred),
                'precision': precision_score(y_true, y_pred),
                'recall': recall_score(y_true, y_pred),
                'f1-score': f1_score(y_true, y_pred),
                'roc-auc': roc_auc,
            })



    


In [None]:
# Display the nested results dict: results[model][dataset][threshold][metric].
results

In [None]:
# unpack the results to a dataframe
# Long format: one row per (model, dataset, threshold, metric) combination, in the
# same nesting order as the results dict.
results_list = [
    [model, dataset, threshold, metric, results[model][dataset][threshold][metric]]
    for model in model_combinations
    for dataset in dataset_combinations
    for threshold in tresholds
    for metric in metrics
]

# Built once from the complete list; no empty placeholder frame is needed.
results_df = pd.DataFrame(results_list, columns=['model', 'dataset', 'threshold', 'metric', 'value'])
results_df.head()

In [18]:
# Write the long-format results to nlp_results.csv (path relative to the current
# working directory), without the index column.
results_df.to_csv('nlp_results.csv', index=False)

In [19]:
# Short alias for the pivot cells below; this is the same object, not a copy.
res = results_df

In [22]:
# Scores of the model trained on all datasets.
# After reset_index() and the transpose, the columns are positional, the first row
# holds the dataset names and the remaining rows are indexed by (threshold, metric).
res_pivot = (
    res.loc[res['model'] == 'mod_all-datasets']
    .pivot_table(index=['dataset'], columns=['threshold', 'metric'], values='value', aggfunc='mean')
    .reset_index()
    .T
)

res_pivot




Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
threshold,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dataset,,data_all_datasets,data_clickbait-dataset,data_fake-news
defaultThresh,accuracy,0.867954,0.922884,0.779899
defaultThresh,f1-score,0.860294,0.924954,0.729295
defaultThresh,precision,0.894647,0.89917,0.883234
defaultThresh,recall,0.828482,0.952261,0.621053
defaultThresh,roc-auc,0.943361,0.983805,0.890883
optimizedF1Thresh,accuracy,0.867954,0.91348,0.794975
optimizedF1Thresh,f1-score,0.865142,0.917759,0.762238
optimizedF1Thresh,precision,0.867194,0.873016,0.853786
optimizedF1Thresh,recall,0.8631,0.967337,0.688421


In [23]:
# Scores of the small clickbait-dataset model.
# After reset_index() and the transpose, the columns are positional, the first row
# holds the dataset names and the remaining rows are indexed by (threshold, metric).
res_pivot = (
    res.loc[res['model'] == 'mod_clickbait-dataset-small']
    .pivot_table(index=['dataset'], columns=['threshold', 'metric'], values='value', aggfunc='mean')
    .reset_index()
    .T
)

res_pivot




Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
threshold,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dataset,,data_all_datasets,data_clickbait-dataset,data_fake-news
defaultThresh,accuracy,0.791892,0.928527,0.572864
defaultThresh,f1-score,0.754665,0.928482,0.295191
defaultThresh,precision,0.895248,0.927318,0.695312
defaultThresh,recall,0.652242,0.929648,0.187368
defaultThresh,roc-auc,0.857894,0.977534,0.608065
optimizedF1Thresh,accuracy,0.793436,0.930408,0.573869
optimizedF1Thresh,f1-score,0.75357,0.92988,0.278912
optimizedF1Thresh,precision,0.908889,0.935197,0.725664
optimizedF1Thresh,recall,0.643588,0.924623,0.172632


In [24]:
# Scores of the large clickbait-dataset model.
# After reset_index() and the transpose, the columns are positional, the first row
# holds the dataset names and the remaining rows are indexed by (threshold, metric).
res_pivot = (
    res.loc[res['model'] == 'mod_clickbait-dataset-large']
    .pivot_table(index=['dataset'], columns=['threshold', 'metric'], values='value', aggfunc='mean')
    .reset_index()
    .T
)

res_pivot




Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
threshold,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dataset,,data_all_datasets,data_clickbait-dataset,data_fake-news
defaultThresh,accuracy,0.808494,0.95674,0.570854
defaultThresh,f1-score,0.777778,0.956412,0.342065
defaultThresh,precision,0.903226,0.961881,0.637931
defaultThresh,recall,0.682927,0.951005,0.233684
defaultThresh,roc-auc,0.87683,0.989942,0.595903
optimizedF1Thresh,accuracy,0.811197,0.954859,0.580905
optimizedF1Thresh,f1-score,0.791115,0.955556,0.421637
optimizedF1Thresh,precision,0.865421,0.93932,0.617886
optimizedF1Thresh,recall,0.72856,0.972362,0.32


In [25]:
# Scores of the fake-news model.
# After reset_index() and the transpose, the columns are positional, the first row
# holds the dataset names and the remaining rows are indexed by (threshold, metric).
res_pivot = (
    res.loc[res['model'] == 'mod_fake-news']
    .pivot_table(index=['dataset'], columns=['threshold', 'metric'], values='value', aggfunc='mean')
    .reset_index()
    .T
)

res_pivot




Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
threshold,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dataset,,data_all_datasets,data_clickbait-dataset,data_fake-news
defaultThresh,accuracy,0.692664,0.573668,0.883417
defaultThresh,f1-score,0.747462,0.690628,0.878407
defaultThresh,precision,0.626263,0.541369,0.874739
defaultThresh,recall,0.926829,0.953518,0.882105
defaultThresh,roc-auc,0.808678,0.774368,0.948907
optimizedF1Thresh,accuracy,0.678378,0.55047,0.883417
optimizedF1Thresh,f1-score,0.745804,0.684835,0.884232
optimizedF1Thresh,precision,0.609172,0.526707,0.840607
optimizedF1Thresh,recall,0.961448,0.978643,0.932632
