In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from classes import Word2VecModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [88]:
from functions import *
model_w2v_settings = return_best_model()

In [89]:
model_w2v = Word2VecModel(model_w2v_settings)

In [90]:
# read merged data
df = pd.read_csv('data/merged_titles_labels.csv')
df.head()

Unnamed: 0,title,is_clickbait
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [91]:
# import models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier


title_vectors = [get_word_vectors(model_w2v, title, aggregation='mean') for title in df['title']]

X = np.vstack(title_vectors)
y = df['is_clickbait'].values

In [92]:
# split data into train and test stratified by y
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)

# scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# save scaled data as csv if data does not exist yet
if not os.path.exists('data/X_train.csv'):
    np.savetxt('data/X_train.csv', X_train_scaled, delimiter=',')
    np.savetxt('data/X_test.csv', X_test_scaled, delimiter=',')
    np.savetxt('data/y_train.csv', y_train, delimiter=',')
    np.savetxt('data/y_test.csv', y_test, delimiter=',')

In [97]:
# train sample model for now
classifier = KNeighborsClassifier()
classifier.fit(X_train_scaled, y_train)

# predict on test data
y_pred = classifier.predict(X_test_scaled)

# calculate auc and f1
from sklearn.metrics import roc_auc_score, f1_score
print('AUC: ', roc_auc_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))


AUC:  0.7832694468565332
F1:  0.7392190152801358


TypeError: list indices must be integers or slices, not str

In [129]:
temp_df = pd.DataFrame({'title': ['You won\'t believe what this celebrity did!']})

preprocess_title(temp_df)

Unnamed: 0,title
0,"[wont, believe, celebrity]"


In [125]:
%run functions.py

# preprocess_title(temp_df)

get_word_vectors(model_w2v, preprocess_title(temp_df), aggregation='mean')



array([0., 0., 0., ..., 0., 0., 0.])

In [139]:
# list words in model by .key_to_index
model_w2v.model.wv.key_to_index
 

{' ': 0,
 'e': 1,
 'a': 2,
 'o': 3,
 'i': 4,
 'r': 5,
 'n': 6,
 't': 7,
 's': 8,
 'l': 9,
 'h': 10,
 'd': 11,
 'u': 12,
 'c': 13,
 'm': 14,
 'T': 15,
 'g': 16,
 'p': 17,
 'y': 18,
 'f': 19,
 'S': 20,
 'A': 21,
 'w': 22,
 'k': 23,
 'b': 24,
 'C': 25,
 'v': 26,
 'W': 27,
 'B': 28,
 'P': 29,
 'I': 30,
 'M': 31,
 'F': 32,
 'H': 33,
 'N': 34,
 'D': 35,
 'R': 36,
 'O': 37,
 'Y': 38,
 '-': 39,
 'L': 40,
 'E': 41,
 "'": 42,
 'G': 43,
 ',': 44,
 '’': 45,
 ':': 46,
 'U': 47,
 '1': 48,
 '.': 49,
 '0': 50,
 '2': 51,
 'K': 52,
 'x': 53,
 'V': 54,
 'J': 55,
 '"': 56,
 'z': 57,
 '5': 58,
 '‘': 59,
 'j': 60,
 '?': 61,
 '7': 62,
 '3': 63,
 '6': 64,
 '9': 65,
 'q': 66,
 '4': 67,
 '8': 68,
 'Q': 69,
 '\xa0': 70,
 'Z': 71,
 '!': 72,
 '$': 73,
 'о': 74,
 '–': 75,
 '”': 76,
 '“': 77,
 '(': 78,
 ')': 79,
 'е': 80,
 '&': 81,
 'и': 82,
 'а': 83,
 'н': 84,
 '|': 85,
 '/': 86,
 'X': 87,
 'т': 88,
 '—': 89,
 'р': 90,
 'с': 91,
 ';': 92,
 'в': 93,
 '%': 94,
 'л': 95,
 '*': 96,
 'к': 97,
 '…': 98,
 'é': 99,
 ']': 1

In [99]:
chat_clickbait_examples = [
    "You won't believe what this celebrity did!",
    "Shocking secrets revealed about the latest trend!",
    "This simple trick will change your life forever!",
    "Is this the craziest video on the internet?",
    "10 unbelievable facts that will blow your mind!",
    "You'll never guess what happened next!",
    "Exclusive insider information on the hottest topic!",
    "This product guarantees instant results – find out how!",
    "What experts don't want you to know about...",
    "Uncovered: The hidden truth behind a viral sensation!",
    "The secret to getting rich overnight – revealed!",
    "You'll be shocked by these before-and-after pictures!",
    "The shocking reason why [common practice] is actually dangerous!",
    "What [professionals/experts] don't want you to know about [subject] – it's revelatory!",
    "Exposed: The truth behind [popular myth] – it will leave you flabbergasted!",
    "You won't believe the results of this [year] experiment – it's mind-blowing!",
    "The secret to [achievement] that [professionals/experts] keep quiet!",
    "The surprising reason why [common belief] is actually a deception!",
    "What [number]% of people regret about [life decision] – learn from their errors!",
    "The ultimate hack for [common problem] – it's a game-changer!",
    "Exposed: The dark side of [popular product] – it will leave you stunned!",
    "This [year] trend is changing the way we [do something] – don't miss out!",
    "What [professionals/experts] say about [hot topic] will blow your mind!",
    "You won't believe the transformation of [ordinary person] – it's unbelievable!",
    "The surprising link between [two seemingly unrelated things] – it's astonishing!",
    "This simple trick will make you a [skill] expert in minutes!",
    "Can you pass this impossible [topic] quiz? The results will astound you!",
    "The shocking reason why [common practice] is actually dangerous!",
    "What [professionals/experts] don't want you to know about [subject] – it's revelatory!",
    "Exposed: The truth behind [popular myth] – it will leave you flabbergasted!",
    "You won't believe the results of this [year] experiment – it's mind-blowing!",
    "The secret to [achievement] that [professionals/experts] keep quiet!",
    "The surprising reason why [common belief] is actually a deception!",
    "What [number]% of people regret about [life decision] – learn from their errors!",
    "The ultimate hack for [common problem] – it's a game-changer!",
    "Exposed: The dark side of [popular product] – it will leave you stunned!",
    "This [year] trend is changing the way we [do something] – don't miss out!",
    "What [professionals/experts] say about [hot topic] will blow your mind!",
    "You won't believe the transformation of [ordinary person] – it's unbelievable!",]


# get 10 random examples

chat_clickbait_examples = np.random.choice(chat_clickbait_examples, 10)

for example in chat_clickbait_examples:
    print(predict_on_text(classifier, model_w2v , example))
    print('\n')


[[0.4 0.6]]


[[0.4 0.6]]


[[0.4 0.6]]


[[0.4 0.6]]


[[0.4 0.6]]


[[0.4 0.6]]


[[0.4 0.6]]


[[0.4 0.6]]


[[0.4 0.6]]


[[0.4 0.6]]




In [100]:
non_clickbait_titles = [
    "Wikinews interviews Tatton Spiller, founder of political news service Simple Politics",
    "NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth"]

for example in non_clickbait_titles:
    print(predict_on_text(classifier, model_w2v , example))
    print('\n')

[[0.4 0.6]]


[[0.4 0.6]]


