In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data packages this notebook requests: the stop-word lists
# first, then the 'punkt' tokenizer data.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)
from classes import Word2VecModel

In [None]:
# Import the project helpers by name instead of `from functions import *`, so
# it is clear which names this notebook takes from functions.py:
#   return_best_model - called right below
#   get_word_vectors  - turns a title into a vector in the feature cell
#   predict_on_text   - scores raw headlines in the demo cells
from functions import get_word_vectors, predict_on_text, return_best_model

# Settings object handed to Word2VecModel in the next cell.
# NOTE(review): presumably the configuration of the best-scoring Word2Vec
# run - confirm against return_best_model() in functions.py.
model_w2v_settings = return_best_model()

In [None]:
# Build the project's Word2VecModel wrapper (from classes.py) from the settings
# returned by return_best_model(); later cells pass it to get_word_vectors and
# predict_on_text.
# NOTE(review): whether this trains or loads embeddings is decided inside
# Word2VecModel - confirm in classes.py.
model_w2v = Word2VecModel(model_w2v_settings)

In [None]:
# Read the preprocessed data from its pickle file into a DataFrame. Later cells
# read its 'title' column (model input) and 'is_clickbait' column (target).
df = pd.read_pickle('data/preprocessed_titles_labels.pkl')
# Preview the first rows as the cell output.
df.head()

In [None]:
# import models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier


# Target vector: the 'is_clickbait' column as an array.
y = df['is_clickbait'].values

# One vector per title, obtained from get_word_vectors with 'mean' aggregation.
title_vectors = list(
    map(
        lambda title: get_word_vectors(model_w2v, title, aggregation='mean'),
        df['title'],
    )
)

# Stack the per-title vectors row-wise into the feature matrix.
X = np.vstack(title_vectors)

In [None]:
# split data into train and test stratified by y
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# Hold out 20% of the rows as a test set; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a sample model for now (CatBoost, 1000 boosting iterations) on the
# scaled training features.
classifier = CatBoostClassifier(iterations=1000)
classifier.fit(X_train_scaled, y_train)

# Hard class labels are what F1 needs ...
y_pred = classifier.predict(X_test_scaled)
# ... but ROC AUC must be computed from scores, not labels: with 0/1
# predictions the ROC curve collapses to a single operating point and the
# reported AUC understates the model's ranking quality. Column 1 of
# predict_proba is the probability of the positive (second) class.
y_proba = classifier.predict_proba(X_test_scaled)[:, 1]

# Report AUC (from probabilities) and F1 (from labels) on the held-out split.
from sklearn.metrics import roc_auc_score, f1_score
print('AUC: ', roc_auc_score(y_test, y_proba))
print('F1: ', f1_score(y_test, y_pred))

In [None]:
# Execute functions.py via IPython's %run magic, which makes the names it
# defines available in the notebook namespace for the cells below.
# NOTE(review): presumably here to pick up edits to functions.py without
# restarting the kernel - confirm it is still needed.
%run functions.py

In [None]:
# Clickbait-style headlines used for a quick qualitative check of the model.
chat_clickbait_examples = [
    "You won't believe what this celebrity did!",
    "Shocking secrets revealed about the latest trend!",
    "This simple trick will change your life forever!",
    "Is this the craziest video on the internet?",
    "10 unbelievable facts that will blow your mind!",
    "You'll never guess what happened next!",
    "Exclusive insider information on the hottest topic!",
]

# Print every headline followed by element [0][1] of predict_on_text's result,
# which the output labels as the chance of clickbait.
# NOTE(review): the classifier was fitted on StandardScaler-transformed
# vectors, but `scaler` is not passed to predict_on_text - confirm the helper
# applies the same scaling.
for example in chat_clickbait_examples:
    clickbait_chance = predict_on_text(classifier, model_w2v, example)[0][1]
    print(f"{example} \nCHANCE OF CLICKBAIT: {clickbait_chance} \n")


In [None]:
# Headlines used as the non-clickbait counterpart of the qualitative check.
non_clickbait_titles = [
    "Wikinews interviews Tatton Spiller, founder of political news service Simple Politics",
    "NASA's OSIRIS-REx arrives in Houston, US after returning asteroid samples to Earth",
    "Paedo teacher Jeremy Forrest who preyed on schoolgirl, 15, & fled to France is fired from new job after bosses find out",
]

# Print every headline followed by element [0][1] of predict_on_text's result,
# which the output labels as the chance of clickbait.
for example in non_clickbait_titles:
    clickbait_chance = predict_on_text(classifier, model_w2v, example)[0][1]
    print(f"{example} \nCHANCE OF CLICKBAIT: {clickbait_chance} \n")

In [None]:
# Save the trained classifier to disk.
# NOTE(review): only the classifier is written here; the fitted `scaler` from
# the train/test cell is not persisted - confirm whether inference needs it.
import pickle

# Use a context manager so the file handle is flushed and closed even if
# pickling fails (the bare open() call previously left the handle open).
with open('predictive_models/catboost_model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# Read the classifier back from the pickle file written by the save cell.
# pickle.load can execute arbitrary code, so only load model files that this
# project produced itself.
# The context manager closes the file handle, which the bare open() call
# previously left open.
with open('predictive_models/catboost_model.pkl', 'rb') as model_file:
    classifier = pickle.load(model_file)

In [None]:
# Score one clickbait-style headline with the current `classifier` (reloaded
# from disk in the cell above when the notebook is run in order); the raw
# return value of predict_on_text is displayed as the cell output.
predict_on_text(classifier, model_w2v, "You won't believe what this celebrity did!")