In [3]:
import pandas as pd
import numpy as np
import os

from gensim.models import Word2Vec

from classes import *

%run functions.py

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [4]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score, roc_auc_score

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler


In [18]:
# Load the merged title/label dataset.
# On the first run the three labelled CSVs are concatenated and cached as a
# single CSV; every later run simply reads that cached file.
merged_csv_path = 'data/merged_titles_labels.csv'
if not os.path.exists(merged_csv_path):
    labeled_paths = [
        '../eda/small1/labeled.csv',
        '../eda/small2/labeled.csv',
        '../eda/small3/labeled.csv',
    ]
    # ignore_index=True already produces a fresh 0..n-1 index, so no extra
    # reset_index() is needed.
    df = pd.concat([pd.read_csv(p) for p in labeled_paths], ignore_index=True)
    # to_csv() does not create missing directories, so make sure it exists.
    os.makedirs(os.path.dirname(merged_csv_path), exist_ok=True)
    df.to_csv(merged_csv_path, index=False)
else:
    df = pd.read_csv(merged_csv_path)

# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# that was written into the source CSVs -- confirm and consider dropping it.
df.head()


Unnamed: 0,title,is_clickbait
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [19]:
%run functions.py
if not os.path.exists('data/preprocessed_titles_labels.pkl'):
    df = preprocess_title(df, verbose=True)
    df.to_pickle('data/preprocessed_titles_labels.pkl') 

else:
    df = pd.read_pickle('data/preprocessed_titles_labels.pkl')
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package words is already up-to-date!


0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2                        Why the Truth Might Get You Fired
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
75017    To Make Female Hearts Flutter in Iraq, Throw a...
75018    British Liberal Democrat Patsy Calton, 56, die...
75019    Drone smartphone app to help heart attack vict...
75020    Netanyahu Urges Pope Benedict, in Israel, to D...
75021    Computer Makers Prepare to Stake Bigger Claim ...
Name: title, Length: 75022, dtype: object
Removing numbers and replacing with words...
0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2                        Why the Truth Might Get You Fired
3        fifteen Civilians Killed In Single US Airstrik...
4        Iranian woman jaile

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title'] = df['title'].apply(lambda x: [word for word in x if word not in stop_words])


Unnamed: 0,title,is_clickbait
0,"[house, dem, aide, , even, see, comey, letter...",1
1,"[flynn, hillary, clinton, big, woman, campus, ...",0
2,"[truth, might, get, fired]",1
3,"[fifteen, civilian, killed, single, usa, airst...",1
4,"[iranian, woman, jailed, fictional, unpublishe...",1


In [22]:
from tqdm.notebook import tqdm


def _positive_class_scores(estimator, features):
    """Return a continuous positive-class score for every row of `features`.

    ROC AUC has to be computed from a ranking score, not from hard 0/1
    predictions. Estimators exposing `predict_proba` contribute the
    probability of the second class (`classes_[1]`); the others (e.g. `SVC()`
    without `probability=True`) fall back to `decision_function`.
    """
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba(features)[:, 1]
    return estimator.decision_function(features)


# Maps word2vec model name -> {classifier name -> {'f1': ..., 'auc': ...}}.
word2vec_results = {}

# The labels do not depend on the embedding, so build them once.
# Assumes `is_clickbait` is a binary 0/1 column, as the preview above shows.
y = df['is_clickbait'].values

# Keep only the saved models and sort them: the folder can hold other entries,
# and os.listdir() returns names in arbitrary order. Filtering up front also
# makes the progress bar count models rather than directory entries.
model_files = sorted(
    name for name in os.listdir('word2vec_models') if name.endswith('.model')
)

for file in tqdm(model_files):
    path = os.path.join('word2vec_models', file)

    # File names look like 'word2vec_vs3000_win8_sg0.model'; the
    # hyper-parameters are encoded in the underscore-separated parts.
    model_name = os.path.splitext(file)[0]
    properties = model_name.split('_')

    vector_size = int(properties[1][2:])        # 'vs3000' -> 3000
    window_size = int(properties[2][3:])        # 'win8'   -> 8
    is_skipgram = bool(int(properties[3][2:]))  # 'sg0'    -> False

    settings = {
        'model_path': path,
        'is_skipgram': is_skipgram,
        'window_size': window_size,
        'vector_size': vector_size,
    }

    model_w2v = Word2VecModel(settings)

    # One aggregated (mean) vector per title, stacked into a 2-D feature matrix.
    title_vectors = [
        get_word_vectors(model_w2v, title, aggregation='mean')
        for title in df['title']
    ]
    X = np.vstack(title_vectors)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
    )

    # Fit the scaler on the training split only to avoid leaking test statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fresh, untrained estimators for every embedding. The two sklearn tree
    # models are seeded so repeated runs give the same scores.
    models = {
        'catboost': CatBoostClassifier(verbose=False),
        'xgboost': XGBClassifier(),
        'lightgbm': LGBMClassifier(),
        'knn': KNeighborsClassifier(),
        'svm': SVC(),
        'logreg': LogisticRegression(),
        'randomforest': RandomForestClassifier(random_state=42),
        'decisiontree': DecisionTreeClassifier(random_state=42),
    }

    results = {}

    for model, estimator in tqdm(models.items(), desc=f'Predictive Models for {model_name}'):
        estimator.fit(X_train_scaled, y_train)
        y_pred = estimator.predict(X_test_scaled)
        f1 = f1_score(y_test, y_pred)
        # AUC is computed from continuous scores, not from the hard labels.
        auc = roc_auc_score(y_test, _positive_class_scores(estimator, X_test_scaled))
        results[model] = {
            'f1': f1,
            'auc': auc
        }
        print(f'F1: {f1}')
        print(f'AUC: {auc}')
        print()

    # Stored once per embedding, inside the per-model iteration, so that
    # `model_name` and `results` always belong to the model just evaluated.
    word2vec_results[model_name] = results





  0%|          | 0/236 [00:00<?, ?it/s]

Predictive Models for word2vec_vs3000_win8_sg0:   0%|          | 0/8 [00:00<?, ?it/s]

KeyboardInterrupt: 