In [7]:
# Third-party libraries used throughout the notebook.
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the NLTK resources are present before any project code is imported.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Project-local Word2Vec wrapper (imported after the NLTK downloads, as in the original order).
from classes import Word2VecModel

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Import only the helpers this notebook actually calls, instead of `import *`,
# so it is obvious which names come from functions.py and nothing else is
# silently pulled into (or shadowed in) the notebook namespace.
from functions import get_word_vectors, predict_on_text, return_best_model

# Settings object consumed by Word2VecModel in the next cell.
# NOTE(review): the exact structure returned by return_best_model() is defined
# in functions.py and is not visible here.
model_w2v_settings = return_best_model()

In [9]:
# Build the project's Word2VecModel from the selected settings; this object is
# passed to get_word_vectors / predict_on_text below to vectorize titles.
model_w2v = Word2VecModel(model_w2v_settings)

In [10]:
# Load the preprocessed titles together with their `is_clickbait` labels.
# NOTE: pickle files can execute code on load; only read files you produced yourself.
preprocessed_titles_path = 'data/preprocessed_titles_labels.pkl'
df = pd.read_pickle(preprocessed_titles_path)

# Preview the first rows.
df.head()

Unnamed: 0,title,is_clickbait
0,"[house, dem, aide, even, see, comey, letter, j...",1
1,"[flynn, hillary, clinton, big, woman, campus, ...",0
2,"[truth, might, get, fired]",1
3,"[fifteen, civilian, killed, single, usa, airst...",1
4,"[iranian, woman, jailed, fictional, unpublishe...",1


In [11]:
# Candidate classifier implementations.
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# One vector per title, obtained by mean-aggregating the title's word vectors.
title_vectors = [
    get_word_vectors(model_w2v, tokens, aggregation='mean')
    for tokens in df['title']
]

# Feature matrix (one row per title) and the matching label array.
y = df['is_clickbait'].values
X = np.vstack(title_vectors)

In [12]:
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Stratified 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits (avoids leaking test statistics).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optional export of the scaled splits as CSV (disabled):
# np.savetxt('data/X_train.csv', X_train_scaled, delimiter=',')
# np.savetxt('data/X_test.csv', X_test_scaled, delimiter=',')
# np.savetxt('data/y_train.csv', y_train, delimiter=',')
# np.savetxt('data/y_test.csv', y_test, delimiter=',')

In [13]:
from sklearn.metrics import roc_auc_score, f1_score

# Train a baseline CatBoost model on the scaled training vectors.
classifier = CatBoostClassifier(iterations=1000)
classifier.fit(X_train_scaled, y_train)

# Hard class labels are what F1 needs.
y_pred = classifier.predict(X_test_scaled)

# ROC AUC is a ranking metric and must be computed from continuous scores.
# Feeding it the 0/1 labels from predict() collapses the ROC curve to a single
# threshold and under-reports the model's AUC. Column 1 of predict_proba is the
# probability of the positive class (is_clickbait == 1).
y_score = classifier.predict_proba(X_test_scaled)[:, 1]

print('AUC: ', roc_auc_score(y_test, y_score))
print('F1: ', f1_score(y_test, y_pred))


Learning rate set to 0.059137
0:	learn: 0.6689670	total: 212ms	remaining: 3m 32s
1:	learn: 0.6493651	total: 252ms	remaining: 2m 5s
2:	learn: 0.6312804	total: 288ms	remaining: 1m 35s
3:	learn: 0.6151675	total: 325ms	remaining: 1m 20s
4:	learn: 0.6011406	total: 362ms	remaining: 1m 12s
5:	learn: 0.5886360	total: 400ms	remaining: 1m 6s
6:	learn: 0.5777125	total: 439ms	remaining: 1m 2s
7:	learn: 0.5675744	total: 477ms	remaining: 59.2s
8:	learn: 0.5585814	total: 518ms	remaining: 57s
9:	learn: 0.5502079	total: 564ms	remaining: 55.8s
10:	learn: 0.5424343	total: 602ms	remaining: 54.2s
11:	learn: 0.5357130	total: 640ms	remaining: 52.7s
12:	learn: 0.5294601	total: 679ms	remaining: 51.5s
13:	learn: 0.5234236	total: 716ms	remaining: 50.4s
14:	learn: 0.5181510	total: 753ms	remaining: 49.5s
15:	learn: 0.5135000	total: 791ms	remaining: 48.7s
16:	learn: 0.5089819	total: 831ms	remaining: 48.1s
17:	learn: 0.5046616	total: 868ms	remaining: 47.3s
18:	learn: 0.5010901	total: 907ms	remaining: 46.8s
19:	learn

In [14]:
# Execute functions.py inside the notebook namespace so that every name it
# defines is (re)bound here; the cells below call predict_on_text.
# NOTE(review): presumably needed to pick up edits made to functions.py after
# the earlier `from functions import *` cell — confirm, and consider
# `%autoreload` instead.
%run functions.py

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\adamm\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [15]:
# Sample headlines to score; the list name marks them as clickbait examples.
chat_clickbait_examples = [
    "You won't believe what this celebrity did!",
    "Shocking secrets revealed about the latest trend!",
    "This simple trick will change your life forever!",
    "Is this the craziest video on the internet?",
    "10 unbelievable facts that will blow your mind!",
    "You'll never guess what happened next!",
    "Exclusive insider information on the hottest topic!",
]

# Print each headline with the value at [0][1] of the prediction, which this
# notebook reports as the clickbait probability.
# NOTE(review): the classifier was fit on StandardScaler-transformed vectors,
# but the scaler is not passed to predict_on_text — confirm that the helper
# applies equivalent scaling.
for headline in chat_clickbait_examples:
    class_probabilities = predict_on_text(classifier, model_w2v, headline)
    clickbait_chance = class_probabilities[0][1]
    print(f"{headline} \nCHANCE OF CLICKBAIT: {clickbait_chance} \n")


You won't believe what this celebrity did! 
CHANCE OF CLICKBAIT: 0.8778930045265337 

Shocking secrets revealed about the latest trend! 
CHANCE OF CLICKBAIT: 0.6570578627532526 

This simple trick will change your life forever! 
CHANCE OF CLICKBAIT: 0.8085739429999638 

Is this the craziest video on the internet? 
CHANCE OF CLICKBAIT: 0.5789976327936314 

10 unbelievable facts that will blow your mind! 
CHANCE OF CLICKBAIT: 0.8749395654458958 

You'll never guess what happened next! 
CHANCE OF CLICKBAIT: 0.9456191845322321 

Exclusive insider information on the hottest topic! 
CHANCE OF CLICKBAIT: 0.3343016599455578 



In [16]:
# Sample titles to score; the list name marks them as non-clickbait examples.
non_clickbait_titles = [
    "Wikinews interviews Tatton Spiller, founder of political news service Simple Politics",
    "NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth",
    "Paedo teacher Jeremy Forrest who preyed on schoolgirl, 15, & fled to France is fired from new job after bosses find out",
]

# Print each title with the value at [0][1] of the prediction, which this
# notebook reports as the clickbait probability.
for headline in non_clickbait_titles:
    class_probabilities = predict_on_text(classifier, model_w2v, headline)
    clickbait_chance = class_probabilities[0][1]
    print(f"{headline} \nCHANCE OF CLICKBAIT: {clickbait_chance} \n")



Wikinews interviews Tatton Spiller, founder of political news service Simple Politics 
CHANCE OF CLICKBAIT: 0.2349896113969257 

NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth 
CHANCE OF CLICKBAIT: 0.18885209344828935 

Paedo teacher Jeremy Forrest who preyed on schoolgirl, 15, & fled to France is fired from new job after bosses find out 
CHANCE OF CLICKBAIT: 0.3240676334509383 



In [18]:
# Persist the trained classifier to disk.
import pickle

# Use a context manager so the file handle is flushed and closed
# deterministically; the original `open(...)` was never closed, which can
# leave a truncated or locked file behind.
# NOTE(review): the StandardScaler fitted earlier is not saved here — confirm
# whether inference code needs it alongside the model.
with open('predictive_models/catboost_model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)



In [19]:
# Reload the classifier from the pickle written in the previous cell.
# Unpickling can execute arbitrary code: only load model files you created
# yourself or otherwise trust.
# The context manager closes the file handle that the original `open(...)`
# call leaked.
with open('predictive_models/catboost_model.pkl', 'rb') as model_file:
    classifier = pickle.load(model_file)

In [20]:
# Sanity check for the save/load round trip: score a headline that was already
# scored before pickling. The second value of the returned row is the one
# reported as "CHANCE OF CLICKBAIT" earlier, so it should match that output.
predict_on_text(classifier, model_w2v, "You won't believe what this celebrity did!")

array([[0.122107, 0.877893]])