In [20]:
import pandas as pd
import numpy as np
import os

from gensim.models import Word2Vec

%run '../../functions.py'
%run '../../classes.py'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [21]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score, roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler


In [22]:
# Cache of the preprocessed titles. One variable is used for the existence
# check, the read and the write, so a cache written by this cell is found
# again on the next run.
preprocessed_path = '../data/preprocessed_titles_labels.pkl'

if os.path.exists(preprocessed_path):
    df = pd.read_pickle(preprocessed_path)
else:
    # NOTE(review): this branch expects `df` to already hold the raw titles;
    # nothing visible in this notebook defines it before this cell - confirm
    # that the %run scripts (or an earlier cell) provide it.
    df = preprocess_title(df, verbose=True)
    df.to_pickle(preprocessed_path)
df.head()

Unnamed: 0,title,is_clickbait,text,dataset,sample
0,"[seventeen, time, game, throne, matched, perfe...",1,,clickbait-dataset,train
1,"[facing, texas, tar, heel, territory, duke, fi...",0,,clickbait-dataset,train
4,"[wikinews, interview, kevin, baugh, president,...",0,,clickbait-dataset,train
6,"[identify, justin, bieber, music, video, youtu...",1,,clickbait-dataset,train
9,"[uganda, introduces, antihomosexual, legislation]",0,,clickbait-dataset,train


In [23]:
# Results of previous runs live in one JSON file keyed by word2vec model name
# (the model file name without its ".model" suffix).
save_path = 'results/word2vec_results_500_1000.json'
import json

# First run: create the target directory and an empty results mapping so that
# the read below always finds a valid JSON document.
if not os.path.exists(save_path):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'w') as f:
        json.dump({}, f)
with open(save_path, 'r') as f:
    word2vec_results = json.load(f)


# File names of the word2vec models that already have an entry in the results.
models_already_trained = [f'{model_name}.model' for model_name in word2vec_results]
models_already_trained

['word2vec_vs500_win7_sg0.model',
 'word2vec_vs500_win6_sg1.model',
 'word2vec_vs500_win6_sg0.model',
 'word2vec_vs500_win7_sg1.model',
 'word2vec_vs500_win3_sg0.model',
 'word2vec_vs500_win5_sg0.model',
 'word2vec_vs500_win8_sg0.model',
 'word2vec_vs500_win4_sg1.model',
 'word2vec_vs500_win3_sg1.model',
 'word2vec_vs500_win4_sg0.model',
 'word2vec_vs500_win8_sg1.model',
 'word2vec_vs500_win5_sg1.model',
 'word2vec_vs1000_win5_sg1.model',
 'word2vec_vs1000_win4_sg0.model']

In [24]:
from tqdm import tqdm
import copy as cp


# Every saved word2vec model (*.model) found in the models folder.
files = list(
    filter(
        lambda name: name.endswith('.model'),
        os.listdir('../../all_datasets/word2vec_models'),
    )
)

# Group the models by the `vs500` / `vs1000` prefix of their file name.
files500 = [name for name in files if name.startswith('word2vec_vs500_')]
files1000 = [name for name in files if name.startswith('word2vec_vs1000_')]

# vs500 models first, then vs1000, skipping those that already have results.
files = [
    name
    for name in files500 + files1000
    if name not in models_already_trained
]

print(files)


['word2vec_vs1000_win8_sg1.model', 'word2vec_vs1000_win3_sg1.model', 'word2vec_vs1000_win8_sg0.model', 'word2vec_vs1000_win4_sg1.model', 'word2vec_vs1000_win5_sg0.model', 'word2vec_vs1000_win3_sg0.model', 'word2vec_vs1000_win7_sg1.model', 'word2vec_vs1000_win6_sg0.model', 'word2vec_vs1000_win6_sg1.model', 'word2vec_vs1000_win7_sg0.model']


In [25]:
import copy as cp
from tqdm.auto import tqdm

# For every word2vec model in `files`: embed the titles (mean of word vectors),
# fit several classifier families on the 'train' split, score them on the
# 'val1' split, and checkpoint the scores to `save_path` after each model.

# The split masks, titles and labels do not depend on the embedding, so they
# are computed once instead of on a deep copy of `df` in every iteration.
train_mask = (df['sample'] == 'train').to_numpy()
val_mask = (df['sample'] == 'val1').to_numpy()

titles_train = df.loc[train_mask, 'title']
titles_val = df.loc[val_mask, 'title']
y_train = df.loc[train_mask, 'is_clickbait'].values
y_test = df.loc[val_mask, 'is_clickbait'].values

for file in tqdm(files):
    # `files` is expected to contain only .model files; skip anything else.
    if not file.endswith('.model'):
        continue

    model_path = os.path.join('../../all_datasets/word2vec_models', file)

    # File names follow word2vec_vs<vector size>_win<window>_sg<0|1>.model;
    # parse the hyper-parameters from the name without the extension.
    model_name = file.split('.')[0]
    properties = model_name.split('_')

    vector_size = int(properties[1][2:])
    window_size = int(properties[2][3:])
    is_skipgram = bool(int(properties[3][2:]))

    settings = {
        'model_path': model_path,
        'is_skipgram': is_skipgram,
        'window_size': window_size,
        'vector_size': vector_size,
    }

    model_w2v = Word2VecModel(settings)

    # Only the rows that are actually used (train / val1) are embedded.
    X_train = np.vstack(
        [get_word_vectors(model_w2v, title, aggregation='mean') for title in titles_train]
    )
    X_test = np.vstack(
        [get_word_vectors(model_w2v, title, aggregation='mean') for title in titles_val]
    )

    # Scale with statistics from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fresh, unfitted classifiers for every embedding: three variants per family.
    models = {
        'catboost': [CatBoostClassifier(verbose=False, random_state=seed) for seed in (1, 2, 3)],
        'lightgbm': [LGBMClassifier(random_state=seed, verbose=-1) for seed in (1, 2, 3)],
        'knn': [KNeighborsClassifier(n_neighbors=k) for k in (3, 7, 5)],
        'randomforest': [RandomForestClassifier(random_state=seed) for seed in (1, 2, 3)],
        'decisiontree': [DecisionTreeClassifier(random_state=seed) for seed in (1, 2, 3)],
    }

    results = {}

    for model_type in tqdm(models, desc=f'Predictive Models for {model_name}'):
        f1_temp = []
        auc_temp = []
        for submodel in tqdm(models[model_type], desc=f'{model_type} submodels'):
            submodel.fit(X_train_scaled, y_train)

            # F1 is computed from the predicted labels.
            y_pred = submodel.predict(X_test_scaled)
            # ROC AUC is a ranking metric, so it is computed from the predicted
            # probability of the positive class. Feeding hard 0/1 labels would
            # collapse the curve to a single operating point.
            # NOTE: 'auc' values written by earlier runs of this cell were
            # computed from hard labels; re-run those models before comparing
            # 'auc' across entries of the results file.
            y_score = submodel.predict_proba(X_test_scaled)[:, 1]

            f1_temp.append(float(f1_score(y_test, y_pred)))
            auc_temp.append(float(roc_auc_score(y_test, y_score)))

        results[model_type] = {
            'f1': float(np.mean(f1_temp)),
            'auc': float(np.mean(auc_temp)),
            'f1_list': list(f1_temp),
            'auc_list': list(auc_temp),
        }

    word2vec_results[model_name] = results

    # Checkpoint after every embedding. Write to a temporary file and swap it
    # in, so an interrupted run cannot leave a truncated results file behind.
    tmp_path = save_path + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(word2vec_results, f, indent=4)
    os.replace(tmp_path, save_path)
    print(f'{model_name} done and updated json')





  0%|          | 0/10 [00:00<?, ?it/s]

{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win8_sg1.model', 'is_skipgram': True, 'window_size': 8, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win8_sg1:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win8_sg1 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win3_sg1.model', 'is_skipgram': True, 'window_size': 3, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win3_sg1:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win3_sg1 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win8_sg0.model', 'is_skipgram': False, 'window_size': 8, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win8_sg0:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win8_sg0 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win4_sg1.model', 'is_skipgram': True, 'window_size': 4, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win4_sg1:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win4_sg1 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win5_sg0.model', 'is_skipgram': False, 'window_size': 5, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win5_sg0:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win5_sg0 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win3_sg0.model', 'is_skipgram': False, 'window_size': 3, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win3_sg0:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win3_sg0 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win7_sg1.model', 'is_skipgram': True, 'window_size': 7, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win7_sg1:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win7_sg1 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win6_sg0.model', 'is_skipgram': False, 'window_size': 6, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win6_sg0:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win6_sg0 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win6_sg1.model', 'is_skipgram': True, 'window_size': 6, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win6_sg1:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win6_sg1 done and updated json
{'model_path': '../../all_datasets/word2vec_models/word2vec_vs1000_win7_sg0.model', 'is_skipgram': False, 'window_size': 7, 'vector_size': 1000}


Predictive Models for word2vec_vs1000_win7_sg0:   0%|          | 0/5 [00:00<?, ?it/s]

catboost submodels:   0%|          | 0/3 [00:00<?, ?it/s]

lightgbm submodels:   0%|          | 0/3 [00:00<?, ?it/s]

knn submodels:   0%|          | 0/3 [00:00<?, ?it/s]

randomforest submodels:   0%|          | 0/3 [00:00<?, ?it/s]

decisiontree submodels:   0%|          | 0/3 [00:00<?, ?it/s]

word2vec_vs1000_win7_sg0 done and updated json


In [26]:
# Number of rows in each value of the `sample` column of `df`.
df['sample'].value_counts() 

sample
train    28836
test      1607
val2       781
val1       776
Name: count, dtype: int64