In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from classes import Word2VecModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [141]:
from functions import *
# Fetch the settings used to build the Word2Vec model. return_best_model is not
# imported explicitly, so it presumably comes from the star import of `functions`;
# what it returns is not visible in this notebook -- TODO confirm.
model_w2v_settings = return_best_model()

In [142]:
# Instantiate the project's Word2VecModel (imported from `classes`) with the
# settings loaded above. This object is later handed to get_word_vectors and
# predict_on_text as the embedding model.
model_w2v = Word2VecModel(model_w2v_settings)

In [150]:
# Load the merged titles/labels table and run the project's title preprocessing on it.
df = pd.read_csv('data/merged_titles_labels.csv')
df = preprocess_title(df)

# Preview the frame as the cell's last expression so the notebook actually renders it
# (a bare `df.head()` in the middle of a cell is evaluated and then thrown away).
df.head()

In [151]:
# import models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier


# Turn every title into one fixed-size vector by calling get_word_vectors with
# mean aggregation, keeping the per-title results in a plain list.
title_vectors = list(
    map(
        lambda title: get_word_vectors(model_w2v, title, aggregation='mean'),
        df['title'],
    )
)

# Labels come straight from the frame; features are the per-title vectors
# stacked row-wise into a single 2-D array.
y = df['is_clickbait'].values
X = np.vstack(title_vectors)

In [152]:
# split data into train and test stratified by y
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# equal in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so test statistics never leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Export the scaled splits as CSV. Checking only one of the four files would
# leave an incomplete export unrepaired forever, so the whole set is
# (re)written whenever any file is missing; this also keeps the four files
# consistent with each other.
split_exports = {
    'data/X_train.csv': X_train_scaled,
    'data/X_test.csv': X_test_scaled,
    'data/y_train.csv': y_train,
    'data/y_test.csv': y_test,
}
if not all(os.path.exists(export_path) for export_path in split_exports):
    for export_path, export_array in split_exports.items():
        np.savetxt(export_path, export_array, delimiter=',')

In [167]:
# train sample model for now
classifier = CatBoostClassifier()
classifier.fit(X_train_scaled, y_train)

# Predict on the test data: hard class labels for F1, and the probability of
# the positive class (second column of predict_proba) for ROC AUC.
y_pred = classifier.predict(X_test_scaled)
y_score = classifier.predict_proba(X_test_scaled)[:, 1]

# calculate auc and f1
# ROC AUC is a ranking metric and must be computed from continuous scores;
# feeding it thresholded 0/1 predictions collapses the curve to one operating
# point and misreports the model's AUC.
from sklearn.metrics import roc_auc_score, f1_score
print('AUC: ', roc_auc_score(y_test, y_score))
print('F1: ', f1_score(y_test, y_pred))


Learning rate set to 0.059191
0:	learn: 0.6734308	total: 9.75ms	remaining: 9.74s
1:	learn: 0.6566068	total: 18.4ms	remaining: 9.16s
2:	learn: 0.6409506	total: 26.6ms	remaining: 8.84s
3:	learn: 0.6268798	total: 35.3ms	remaining: 8.78s
4:	learn: 0.6146574	total: 43.6ms	remaining: 8.67s
5:	learn: 0.6037985	total: 51.9ms	remaining: 8.59s
6:	learn: 0.5933120	total: 60.9ms	remaining: 8.64s
7:	learn: 0.5841801	total: 69.1ms	remaining: 8.57s
8:	learn: 0.5757187	total: 77.5ms	remaining: 8.54s
9:	learn: 0.5684327	total: 85.8ms	remaining: 8.5s
10:	learn: 0.5616526	total: 93.9ms	remaining: 8.44s
11:	learn: 0.5551241	total: 102ms	remaining: 8.42s
12:	learn: 0.5490737	total: 111ms	remaining: 8.41s
13:	learn: 0.5432880	total: 119ms	remaining: 8.38s
14:	learn: 0.5382647	total: 127ms	remaining: 8.36s
15:	learn: 0.5334073	total: 136ms	remaining: 8.35s
16:	learn: 0.5289240	total: 144ms	remaining: 8.32s
17:	learn: 0.5245616	total: 153ms	remaining: 8.36s
18:	learn: 0.5204169	total: 162ms	remaining: 8.35s
1

In [168]:
# One-row frame with a hand-written headline in the 'title' column, pushed
# through the same preprocess_title step that the training data went through.
temp_df = pd.DataFrame(
    data=[["You won't believe what this celebrity did!"]],
    columns=['title'],
)
temp_df = preprocess_title(temp_df)

In [169]:
%run functions.py

# preprocess_title(temp_df)

get_word_vectors(model_w2v, temp_df, aggregation='mean')



array([-0.30085495,  0.36355594, -0.6865807 ,  0.61914337,  0.32182485,
       -0.9802122 , -0.5465201 ,  0.14574091, -0.15582961, -0.21524619,
        0.09403447, -0.7262607 ,  0.16441579,  0.5090908 , -0.9060351 ,
       -0.2035516 ,  0.9606277 ,  0.30512366, -0.08067381, -0.5957421 ,
        0.10018655,  1.1386993 ,  0.15839522, -0.92647195,  0.23036008,
        0.19822645,  0.02281234,  0.28668594, -0.16425857,  0.7232782 ,
        0.5734054 , -0.09418639, -0.53382355, -0.96784043, -0.17374772,
       -0.4143497 ,  0.73393583, -0.25609112, -0.18307258,  0.420023  ,
       -0.2489742 , -0.7896616 , -0.4599563 ,  1.2660378 , -0.78162867,
        0.08888236,  0.39010015,  1.3087475 , -0.44112504,  0.68226105,
        0.1432785 , -0.1919261 ,  0.149201  , -0.05038412,  0.15784372,
        0.49006367, -0.04158034, -0.13773376, -0.05109983,  0.6793417 ,
        0.78785515, -0.04802269,  0.6953093 ,  0.43364698, -0.36705896,
       -0.318304  , -0.39732498,  0.17011891, -0.6768475 ,  0.89

In [188]:
# Hand-written headlines that are expected to look like clickbait.
chat_clickbait_examples = [
    "You won't believe what this celebrity did!",
    "Shocking secrets revealed about the latest trend!",
    "This simple trick will change your life forever!",
    "Is this the craziest video on the internet?",
    "10 unbelievable facts that will blow your mind!",
    "You'll never guess what happened next!",
    "Exclusive insider information on the hottest topic!",
]

# Score each headline and print the value at [0][1] of predict_on_text's result,
# which the message labels as the chance of clickbait.
# NOTE(review): the classifier was fitted on StandardScaler-transformed features,
# but `scaler` is not passed to predict_on_text here -- confirm the helper applies
# equivalent scaling.
for example in chat_clickbait_examples:
    clickbait_chance = predict_on_text(classifier, model_w2v, example)[0][1]
    print(f"{example} | CHANCE OF CLICKBAIT: {clickbait_chance:>10}")


You won't believe what this celebrity did! | CHANCE OF CLICKBAIT: 0.5355404774242387
Shocking secrets revealed about the latest trend! | CHANCE OF CLICKBAIT: 0.48276431179723095
This simple trick will change your life forever! | CHANCE OF CLICKBAIT: 0.5998583163842371
Is this the craziest video on the internet? | CHANCE OF CLICKBAIT: 0.5696578035636994
10 unbelievable facts that will blow your mind! | CHANCE OF CLICKBAIT: 0.5495353291359033
You'll never guess what happened next! | CHANCE OF CLICKBAIT: 0.6346704769389464
Exclusive insider information on the hottest topic! | CHANCE OF CLICKBAIT: 0.4282170577094954


In [183]:
# Real news headlines that are expected to score low.
non_clickbait_titles = [
    "Wikinews interviews Tatton Spiller, founder of political news service Simple Politics",
    "NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth",
]

# Same report as for the clickbait examples: print the [0][1] entry of
# predict_on_text's result next to each headline.
# NOTE(review): `scaler` is not passed to predict_on_text although the classifier
# was trained on scaled features -- confirm the helper handles scaling.
for example in non_clickbait_titles:
    clickbait_chance = predict_on_text(classifier, model_w2v, example)[0][1]
    print(f"{example} | CHANCE OF CLICKBAIT: {clickbait_chance}")


Wikinews interviews Tatton Spiller, founder of political news service Simple Politics | CHANCE OF CLICKBAIT: 0.2590838370370218
NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth | CHANCE OF CLICKBAIT: 0.18068577547683976
