In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from classes import Word2VecModel

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from functions import *
model_w2v_settings = return_best_model()

In [11]:
model_w2v = Word2VecModel(model_w2v_settings)

In [12]:
# read preprocessed data from pickle file
df = pd.read_pickle('data/preprocessed_titles_labels.pkl')
df.head()

Unnamed: 0,title,is_clickbait
0,"[hous, dem, aid, even, see, comey, letter, jas...",1
1,"[flynn, hillari, clinton, big, woman, campu, b...",0
2,"[truth, might, get, fire]",1
3,"[fifteen, civilian, kill, singl, us, airstrik,...",1
4,"[iranian, woman, jail, fiction, unpublish, sto...",1


In [13]:
# import models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier


title_vectors = [get_word_vectors(model_w2v, title, aggregation='mean') for title in df['title']]

X = np.vstack(title_vectors)
y = df['is_clickbait'].values

In [14]:
# split data into train and test stratified by y
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)

# scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# save scaled data as csv if data does not exist yet
if not os.path.exists('data/X_train.csv'):
    np.savetxt('data/X_train.csv', X_train_scaled, delimiter=',')
    np.savetxt('data/X_test.csv', X_test_scaled, delimiter=',')
    np.savetxt('data/y_train.csv', y_train, delimiter=',')
    np.savetxt('data/y_test.csv', y_test, delimiter=',')

In [15]:
# train sample model for now
classifier = CatBoostClassifier()
classifier.fit(X_train_scaled, y_train)

# predict on test data
y_pred = classifier.predict(X_test_scaled)

# calculate auc and f1
from sklearn.metrics import roc_auc_score, f1_score
print('AUC: ', roc_auc_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))


Learning rate set to 0.059191
0:	learn: 0.6713391	total: 154ms	remaining: 2m 33s
1:	learn: 0.6515687	total: 168ms	remaining: 1m 23s
2:	learn: 0.6336494	total: 183ms	remaining: 1m
3:	learn: 0.6181264	total: 197ms	remaining: 49.2s
4:	learn: 0.6037058	total: 221ms	remaining: 44s
5:	learn: 0.5915987	total: 236ms	remaining: 39.2s
6:	learn: 0.5803540	total: 256ms	remaining: 36.3s
7:	learn: 0.5706372	total: 270ms	remaining: 33.5s
8:	learn: 0.5614186	total: 283ms	remaining: 31.2s
9:	learn: 0.5531621	total: 297ms	remaining: 29.4s
10:	learn: 0.5455857	total: 313ms	remaining: 28.1s
11:	learn: 0.5387007	total: 326ms	remaining: 26.9s
12:	learn: 0.5324410	total: 339ms	remaining: 25.7s
13:	learn: 0.5266751	total: 351ms	remaining: 24.7s
14:	learn: 0.5213809	total: 363ms	remaining: 23.9s
15:	learn: 0.5169901	total: 375ms	remaining: 23.1s
16:	learn: 0.5123873	total: 387ms	remaining: 22.4s
17:	learn: 0.5084956	total: 398ms	remaining: 21.7s
18:	learn: 0.5047643	total: 411ms	remaining: 21.2s
19:	learn: 0.5

In [16]:
chat_clickbait_examples = [
    "You won't believe what this celebrity did!",
    "Shocking secrets revealed about the latest trend!",
    "This simple trick will change your life forever!",
    "Is this the craziest video on the internet?",
    "10 unbelievable facts that will blow your mind!",
    "You'll never guess what happened next!",
    "Exclusive insider information on the hottest topic!",

]


for example in chat_clickbait_examples:
    print(f"{example} \nCHANCE OF CLICKBAIT: {predict_on_text(classifier, model_w2v, example)[0][1]} \n")


You won't believe what this celebrity did! 
CHANCE OF CLICKBAIT: 0.8788382035438796 

Shocking secrets revealed about the latest trend! 
CHANCE OF CLICKBAIT: 0.8148511766970515 

This simple trick will change your life forever! 
CHANCE OF CLICKBAIT: 0.9259097828955541 

Is this the craziest video on the internet? 
CHANCE OF CLICKBAIT: 0.7835009307565174 

10 unbelievable facts that will blow your mind! 
CHANCE OF CLICKBAIT: 0.882670025328093 

You'll never guess what happened next! 
CHANCE OF CLICKBAIT: 0.9941102642474525 

Exclusive insider information on the hottest topic! 
CHANCE OF CLICKBAIT: 0.42633586156773534 



In [17]:
non_clickbait_titles = [
    "Wikinews interviews Tatton Spiller, founder of political news service Simple Politics",
    "NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth"]

for example in non_clickbait_titles:
    print(f"{example} \nCHANCE OF CLICKBAIT: {predict_on_text(classifier, model_w2v, example)[0][1]} \n")



Wikinews interviews Tatton Spiller, founder of political news service Simple Politics 
CHANCE OF CLICKBAIT: 0.21305801953155498 

NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth 
CHANCE OF CLICKBAIT: 0.12641096060390578 

