In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from classes import Word2VecModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
from functions import *
model_w2v_settings = return_best_model()

In [20]:
model_w2v = Word2VecModel(model_w2v_settings)

In [21]:
# read merged data
df = pd.read_csv('data/merged_titles_labels.csv')
df.head()

Unnamed: 0,title,is_clickbait
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [22]:
# import models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier


title_vectors = [get_word_vectors(model_w2v, title, aggregation='mean') for title in df['title']]

X = np.vstack(title_vectors)
y = df['is_clickbait'].values

In [23]:
# split data into train and test stratified by y
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)

# scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# save scaled data as csv if data does not exist yet
if not os.path.exists('data/X_train.csv'):
    np.savetxt('data/X_train.csv', X_train_scaled, delimiter=',')
    np.savetxt('data/X_test.csv', X_test_scaled, delimiter=',')
    np.savetxt('data/y_train.csv', y_train, delimiter=',')
    np.savetxt('data/y_test.csv', y_test, delimiter=',')

In [24]:
# train sample model for now
classifier = CatBoostClassifier()
classifier.fit(X_train_scaled, y_train)

# predict on test data
y_pred = classifier.predict(X_test_scaled)

# calculate auc and f1
from sklearn.metrics import roc_auc_score, f1_score
print('AUC: ', roc_auc_score(y_test, y_pred))
print('F1: ', f1_score(y_test, y_pred))


Learning rate set to 0.059191
0:	learn: 0.6723388	total: 68ms	remaining: 1m 7s
1:	learn: 0.6521293	total: 131ms	remaining: 1m 5s
2:	learn: 0.6344080	total: 176ms	remaining: 58.6s
3:	learn: 0.6185543	total: 220ms	remaining: 54.8s
4:	learn: 0.6045653	total: 263ms	remaining: 52.4s
5:	learn: 0.5915873	total: 305ms	remaining: 50.5s
6:	learn: 0.5804856	total: 345ms	remaining: 48.9s
7:	learn: 0.5698982	total: 384ms	remaining: 47.6s
8:	learn: 0.5605342	total: 423ms	remaining: 46.6s
9:	learn: 0.5518924	total: 464ms	remaining: 46s
10:	learn: 0.5441516	total: 504ms	remaining: 45.3s
11:	learn: 0.5374961	total: 545ms	remaining: 44.9s
12:	learn: 0.5312812	total: 586ms	remaining: 44.5s
13:	learn: 0.5244757	total: 629ms	remaining: 44.3s
14:	learn: 0.5187312	total: 669ms	remaining: 44s
15:	learn: 0.5138959	total: 709ms	remaining: 43.6s
16:	learn: 0.5087609	total: 749ms	remaining: 43.3s
17:	learn: 0.5045673	total: 790ms	remaining: 43.1s
18:	learn: 0.4999405	total: 829ms	remaining: 42.8s
19:	learn: 0.496

In [25]:
%run functions.py
predict_on_text(classifier, model_w2v , '19 Things You Don’t Know About Your Favorite Sports Teams')




array([0.40066667, 0.59933333])