In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from classes import Word2VecModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
from functions import *
model_w2v_settings = return_best_model()

In [13]:
model_w2v = Word2VecModel(model_w2v_settings)

In [14]:
# read preprocessed data from pickle file
df = pd.read_pickle('data/preprocessed_titles_labels.pkl')
df.head()

Unnamed: 0,title,is_clickbait
0,"[house, dem, aide, even, see, comey, letter, j...",1
1,"[flynn, hillary, clinton, big, woman, campus, ...",0
2,"[truth, might, get, fired]",1
3,"[fifteen, civilian, killed, single, usa, airst...",1
4,"[iranian, woman, jailed, fictional, unpublishe...",1


In [15]:
# import models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier


title_vectors = [get_word_vectors(model_w2v, title, aggregation='mean') for title in df['title']]

X = np.vstack(title_vectors)
y = df['is_clickbait'].values

In [16]:
# split data into train and test stratified by y
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)

# scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# save scaled data as csv

np.savetxt('data/X_train.csv', X_train_scaled, delimiter=',')
np.savetxt('data/X_test.csv', X_test_scaled, delimiter=',')
np.savetxt('data/y_train.csv', y_train, delimiter=',')
np.savetxt('data/y_test.csv', y_test, delimiter=',')

In [22]:
# train sample model for now
classifier = CatBoostClassifier(iterations=10000, learning_rate=0.03)
classifier.fit(X_train_scaled, y_train)

# predict on test data
y_pred = classifier.predict(X_test_scaled)

# calculate auc and f1
from sklearn.metrics import roc_auc_score, f1_score
print('AUC: ', roc_auc_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))


0:	learn: 0.6806298	total: 29ms	remaining: 4m 50s
1:	learn: 0.6697879	total: 52.2ms	remaining: 4m 20s
2:	learn: 0.6591337	total: 74.9ms	remaining: 4m 9s
3:	learn: 0.6491597	total: 98.4ms	remaining: 4m 5s
4:	learn: 0.6402076	total: 121ms	remaining: 4m 1s
5:	learn: 0.6316135	total: 145ms	remaining: 4m 1s
6:	learn: 0.6232906	total: 169ms	remaining: 4m 1s
7:	learn: 0.6154516	total: 194ms	remaining: 4m 2s
8:	learn: 0.6083351	total: 218ms	remaining: 4m 1s
9:	learn: 0.6013210	total: 242ms	remaining: 4m 1s
10:	learn: 0.5947134	total: 267ms	remaining: 4m 2s
11:	learn: 0.5883291	total: 290ms	remaining: 4m 1s
12:	learn: 0.5824129	total: 316ms	remaining: 4m 2s
13:	learn: 0.5767385	total: 340ms	remaining: 4m 2s
14:	learn: 0.5713119	total: 366ms	remaining: 4m 3s
15:	learn: 0.5665060	total: 391ms	remaining: 4m 4s
16:	learn: 0.5621962	total: 417ms	remaining: 4m 5s
17:	learn: 0.5580453	total: 443ms	remaining: 4m 5s
18:	learn: 0.5537047	total: 468ms	remaining: 4m 5s
19:	learn: 0.5495482	total: 493ms	rem

In [None]:
%run functions.py

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
chat_clickbait_examples = [
    "You won't believe what this celebrity did!",
    "Shocking secrets revealed about the latest trend!",
    "This simple trick will change your life forever!",
    "Is this the craziest video on the internet?",
    "10 unbelievable facts that will blow your mind!",
    "You'll never guess what happened next!",
    "Exclusive insider information on the hottest topic!",

]


for example in chat_clickbait_examples:
    print(f"{example} \nCHANCE OF CLICKBAIT: {predict_on_text(classifier, model_w2v, example)[0][1]} \n")


You won't believe what this celebrity did! 
CHANCE OF CLICKBAIT: 0.9062393918071735 

Shocking secrets revealed about the latest trend! 
CHANCE OF CLICKBAIT: 0.6543984370721739 

This simple trick will change your life forever! 
CHANCE OF CLICKBAIT: 0.7741490266896761 

Is this the craziest video on the internet? 
CHANCE OF CLICKBAIT: 0.617405762957752 

10 unbelievable facts that will blow your mind! 
CHANCE OF CLICKBAIT: 0.8687403647110127 

You'll never guess what happened next! 
CHANCE OF CLICKBAIT: 0.9460445517373255 

Exclusive insider information on the hottest topic! 
CHANCE OF CLICKBAIT: 0.3405405316071899 



In [None]:
non_clickbait_titles = [
    "Wikinews interviews Tatton Spiller, founder of political news service Simple Politics",
    "NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth"]

for example in non_clickbait_titles:
    print(f"{example} \nCHANCE OF CLICKBAIT: {predict_on_text(classifier, model_w2v, example)[0][1]} \n")



Wikinews interviews Tatton Spiller, founder of political news service Simple Politics 
CHANCE OF CLICKBAIT: 0.19511630878526762 

NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth 
CHANCE OF CLICKBAIT: 0.1425134386333806 

