In [124]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import autogluon as ag
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adammajczyk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [125]:
df1 = pd.read_csv('eda/small1/labeled.csv')
df2 = pd.read_csv('eda/small2/labeled.csv')
df3 = pd.read_csv('eda/small3/labeled.csv')
df = pd.concat([df1, df2, df3], ignore_index=True).reset_index(drop=True)
df.head()

Unnamed: 0,title,is_clickbait
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,Why the Truth Might Get You Fired,1
3,15 Civilians Killed In Single US Airstrike Hav...,1
4,Iranian woman jailed for fictional unpublished...,1


In [126]:
# remove punctuation
import string
punct = string.punctuation
punct = punct
all_quoatation = ['“', '”', '‘', '’']

def remove_punct(text, punct=punct):
    for p in punct:
        text = text.replace(p, '')
    text = text.replace('...', ' ')
    text = text.replace('…', ' ')
    return text

def remove_possesive_s(text):
    for p in all_quoatation:
        text = text.replace(p+'s', '')
        text = text.replace('s'+ p, '')
    return text

def replace_short_version(text):
    for p in all_quoatation:
        text = text.replace(p+'re', ' are')
        text = text.replace(p+'ve', ' have')
        text = text.replace(p+'ll', ' will')
        text = text.replace(p+'m', ' am')
        text = text.replace(p+'d', ' would')
        text = text.replace('n'+p+'t', ' not')
    return text    



import inflect
import re 

def replace_numbers_with_words(text):
    regex = r'\b\d+\b'

    matched = re.finditer(regex, text)

    for m in matched:
        number = m.group()
        p = inflect.engine()
        text = text.replace(number, p.number_to_words(number))
    return text

In [190]:
stop_words = set(stopwords.words('english'))

def tokenize(text):
    return [word for word in word_tokenize(text.lower()) if word not in stop_words]





def preprocess_title(df):
    # remove punctuation and other stuff
    df['title'] = df['title'].apply(replace_numbers_with_words)
    df['title'] = df['title'].apply(remove_punct)
    df['title'] = df['title'].apply(remove_possesive_s)
    df['title'] = df['title'].apply(replace_short_version)
    

    # tokenize
    df['title'] = df['title'].apply(tokenize)
    return df


In [161]:
VECTOR_SIZE = 2500
WINDOW = 4
EPOCHS = 500
WORKERS = 10
MIN_COUNT = 1

model = Word2Vec(df['title'], vector_size=VECTOR_SIZE, window=WINDOW, min_count=MIN_COUNT, workers=WORKERS)
model.train(df['title'], total_examples=len(df['title']), epochs=EPOCHS)

(281952464, 297774000)

In [162]:
def get_word_vectors(title, aggregation=None):
    word_vectors = [model.wv[word] for word in title if word in model.wv]
    # print(len(word_vectors))
    # print(word_vectors)
    if len(word_vectors) == 0:
        return np.zeros(VECTOR_SIZE)
    elif aggregation == 'mean':
        return np.mean(word_vectors, axis=0)
    elif aggregation is None:
        return word_vectors

In [184]:
get_word_vectors(test['title'][0], aggregation='mean')

array([-0.28916737, -0.3153554 , -0.07691829, ..., -0.15442827,
       -0.25115234, -0.87083054], dtype=float32)

In [163]:
# train xgb model to predict the label
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier


title_vectors = [get_word_vectors(title, aggregation='mean') for title in df['title']]

X = np.vstack(title_vectors)
y = df['is_clickbait'].values

In [164]:
# split data into train and test stratified by y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Choose and train a classifier, for example, Catboost Regression
classifier = CatBoostClassifier()
classifier.fit(X_train_scaled, y_train)


Learning rate set to 0.059191
0:	learn: 0.6711653	total: 130ms	remaining: 2m 10s
1:	learn: 0.6507094	total: 227ms	remaining: 1m 53s
2:	learn: 0.6332256	total: 326ms	remaining: 1m 48s
3:	learn: 0.6166301	total: 422ms	remaining: 1m 44s
4:	learn: 0.6015405	total: 518ms	remaining: 1m 43s
5:	learn: 0.5888092	total: 614ms	remaining: 1m 41s
6:	learn: 0.5775100	total: 706ms	remaining: 1m 40s
7:	learn: 0.5666873	total: 804ms	remaining: 1m 39s
8:	learn: 0.5566770	total: 896ms	remaining: 1m 38s
9:	learn: 0.5470623	total: 985ms	remaining: 1m 37s
10:	learn: 0.5386013	total: 1.08s	remaining: 1m 37s
11:	learn: 0.5312467	total: 1.18s	remaining: 1m 37s
12:	learn: 0.5242367	total: 1.27s	remaining: 1m 36s
13:	learn: 0.5179647	total: 1.37s	remaining: 1m 36s
14:	learn: 0.5121660	total: 1.46s	remaining: 1m 35s
15:	learn: 0.5073413	total: 1.55s	remaining: 1m 35s
16:	learn: 0.5020703	total: 1.65s	remaining: 1m 35s
17:	learn: 0.4972516	total: 1.74s	remaining: 1m 35s
18:	learn: 0.4928609	total: 1.84s	remaining:

<catboost.core.CatBoostClassifier at 0x2d6ba9f90>

In [165]:
# Evaluate on test data
y_pred = classifier.predict(X_test_scaled)
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.8316825184513711

In [166]:
# count f1
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.7984073400848265

In [203]:
def predict_on_text(text):
    # print(text)
    text = preprocess_title(pd.DataFrame({'title': [text]}))
    text = get_word_vectors(text['title'][0], aggregation='mean')
    # print(len(text))
    return classifier.predict_proba(text)

predict_on_text('19 Things You Don’t Know About Your Favorite Sports Teams')

array([0.26269516, 0.73730484])