In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from classes import Word2VecModel

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
from functions import *
model_w2v_settings = return_best_model()

In [56]:
model_w2v = Word2VecModel(model_w2v_settings)

In [57]:
# read preprocessed data from pickle file
df = pd.read_pickle('data/preprocessed_titles_labels.pkl')
df.head()

Unnamed: 0,title,is_clickbait
0,"[house, aide, even, see, letter, jason]",1
1,"[hillary, clinton, big, woman, campus]",0
2,"[truth, might, get, fired]",1
3,"[fifteen, civilian, single, u]",1
4,"[iranian, woman, fictional, unpublished, story...",1


In [58]:
# import models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier


title_vectors = [get_word_vectors(model_w2v, title, aggregation='mean') for title in df['title']]

X = np.vstack(title_vectors)
y = df['is_clickbait'].values

In [59]:
# split data into train and test stratified by y
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)

# scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# save scaled data as csv if data does not exist yet
if not os.path.exists('data/X_train.csv'):
    np.savetxt('data/X_train.csv', X_train_scaled, delimiter=',')
    np.savetxt('data/X_test.csv', X_test_scaled, delimiter=',')
    np.savetxt('data/y_train.csv', y_train, delimiter=',')
    np.savetxt('data/y_test.csv', y_test, delimiter=',')

In [60]:
# train sample model for now
classifier = CatBoostClassifier()
classifier.fit(X_train_scaled, y_train)

# predict on test data
y_pred = classifier.predict(X_test_scaled)

# calculate auc and f1
from sklearn.metrics import roc_auc_score, f1_score
print('AUC: ', roc_auc_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))


Learning rate set to 0.059064
0:	learn: 0.6743747	total: 38.5ms	remaining: 38.5s
1:	learn: 0.6579048	total: 73.7ms	remaining: 36.8s
2:	learn: 0.6423524	total: 107ms	remaining: 35.6s
3:	learn: 0.6284902	total: 141ms	remaining: 35.2s
4:	learn: 0.6169786	total: 175ms	remaining: 34.9s
5:	learn: 0.6060915	total: 212ms	remaining: 35.2s
6:	learn: 0.5962311	total: 247ms	remaining: 35s
7:	learn: 0.5872800	total: 284ms	remaining: 35.2s
8:	learn: 0.5790541	total: 321ms	remaining: 35.3s
9:	learn: 0.5718316	total: 358ms	remaining: 35.5s
10:	learn: 0.5651025	total: 395ms	remaining: 35.6s
11:	learn: 0.5587306	total: 433ms	remaining: 35.7s
12:	learn: 0.5532016	total: 472ms	remaining: 35.9s
13:	learn: 0.5480215	total: 510ms	remaining: 35.9s
14:	learn: 0.5434179	total: 548ms	remaining: 36s
15:	learn: 0.5392705	total: 584ms	remaining: 35.9s
16:	learn: 0.5352789	total: 621ms	remaining: 35.9s
17:	learn: 0.5314158	total: 660ms	remaining: 36s
18:	learn: 0.5274901	total: 698ms	remaining: 36s
19:	learn: 0.5243

In [61]:
chat_clickbait_examples = [
    "You won't believe what this celebrity did!",
    "Shocking secrets revealed about the latest trend!",
    "This simple trick will change your life forever!",
    "Is this the craziest video on the internet?",
    "10 unbelievable facts that will blow your mind!",
    "You'll never guess what happened next!",
    "Exclusive insider information on the hottest topic!",

]


for example in chat_clickbait_examples:
    print(f"{example} \nCHANCE OF CLICKBAIT: {predict_on_text(classifier, model_w2v, example)[0][1]} \n")


You won't believe what this celebrity did! 
CHANCE OF CLICKBAIT: 0.4598793849583289 

Shocking secrets revealed about the latest trend! 
CHANCE OF CLICKBAIT: 0.4864181004582857 

This simple trick will change your life forever! 
CHANCE OF CLICKBAIT: 0.8138118893254427 

Is this the craziest video on the internet? 
CHANCE OF CLICKBAIT: 0.5576875567526149 

10 unbelievable facts that will blow your mind! 
CHANCE OF CLICKBAIT: 0.8548075365328303 

You'll never guess what happened next! 
CHANCE OF CLICKBAIT: 0.9272564868323602 

Exclusive insider information on the hottest topic! 
CHANCE OF CLICKBAIT: 0.27006833551454773 



In [62]:
non_clickbait_titles = [
    "Wikinews interviews Tatton Spiller, founder of political news service Simple Politics",
    "NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth"]

for example in non_clickbait_titles:
    print(f"{example} \nCHANCE OF CLICKBAIT: {predict_on_text(classifier, model_w2v, example)[0][1]} \n")



Wikinews interviews Tatton Spiller, founder of political news service Simple Politics 
CHANCE OF CLICKBAIT: 0.5250863434157014 

NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth 
CHANCE OF CLICKBAIT: 0.20394055385527224 

