In [None]:
import sys
import os
# Run the notebook from the parent directory when it was started inside a
# 'notebooks' folder, so that relative paths resolve from there.
# The directory *name* is compared (not a string suffix) so that folders such
# as 'old_notebooks' are not mistaken for the notebooks directory.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir("..")
print(os.getcwd())

## Imports 

In [None]:
import pandas as pd
import numpy as np
import itertools
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
import keras
from keras.models import Model, Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data 

In [3]:
# Read the labelled headlines; the path is relative to the current working directory.
csv_path = os.path.join('data', 'clickbait_labeled_data.csv')
df = pd.read_csv(csv_path)
df

Unnamed: 0,text,label
0,"Waiting for Madoff, Angry Crowd Is Disappointed",0
1,"Rihanna Looks Flawless In Her New Video For ""W...",1
2,19 Michael Scott Moments Guaranteed To Make Yo...,1
3,Here's All Of The Famous Faces Who Attended Lo...,1
4,Transporting food costs the UK billions,0
...,...,...
31995,33 Cats Who Found Their Forever Home This Year,1
31996,Q-and-A With Retiring Big East Commissioner Mi...,0
31997,18 Throwback Fairy Winkles Toys Only '90s Kids...,1
31998,Alistair Darling unveils UK's 2010 Budget,0


### Create train and test splits

In [4]:
# Features are the raw headline strings, targets are the labels.
X, y = df['text'], df['label']

# Hold out 25% of the rows for evaluation. Stratifying on the label preserves
# the label proportions in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=2,
)

### Tokenize the Data

In [6]:
# Build the word -> integer-index vocabulary from the training texts only;
# the test texts are encoded later with this same fitted tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# Encode each training text as a list of word indices.
sequences = tokenizer.texts_to_sequences(X_train)

In [7]:
# The network needs fixed-width input, so every encoded headline is padded
# up to the length of the longest training headline.
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Model

### Define the Model 

In [8]:
def LSTM_model(vocab_size=None, sequence_length=None, embedding_dim=50, lstm_units=64):
    """Build the (uncompiled) binary text classifier.

    Architecture: Input -> Embedding -> LSTM -> Dense(1) -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table. Defaults to
            ``len(tokenizer.word_index) + 1`` using the notebook-level
            ``tokenizer``.
        sequence_length: Number of token indices per input sample. Defaults
            to the notebook-level ``max_length``.
        embedding_dim: Size of each word vector (default 50).
        lstm_units: Number of units in the LSTM layer (default 64).

    Returns:
        A Keras ``Model`` mapping a batch of padded index sequences to one
        sigmoid output per sample.

    Calling ``LSTM_model()`` with no arguments builds exactly the same
    network as before; the parameters only make the dependencies explicit
    and the sizes tunable.
    """
    if vocab_size is None:
        # +1 because the Keras Tokenizer never assigns index 0 to a word,
        # so valid indices run from 0 to len(word_index) inclusive.
        vocab_size = len(tokenizer.word_index) + 1
    if sequence_length is None:
        sequence_length = max_length

    inputs = Input(name='inputs', shape=(sequence_length,))
    layer = Embedding(vocab_size, embedding_dim, input_length=sequence_length)(inputs)
    layer = LSTM(lstm_units)(layer)
    # A single unit followed by a sigmoid yields one value in (0, 1) per sample.
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=inputs, outputs=layer)

### Compile the model

In [9]:
# Seed the Python, NumPy and backend random generators before any weights are
# created, so weight initialisation and batch shuffling (and therefore the
# reported accuracy) are as repeatable as the backend allows on a fresh run.
keras.utils.set_random_seed(2)  # same value as the train/test split seed

model = LSTM_model()
model.summary()
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

2023-02-09 14:21:53.733681: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 26)]              0         
                                                                 
 embedding (Embedding)       (None, 26, 50)            1062350   
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 out_layer (Dense)           (None, 1)                 65        
                                                                 
 activation (Activation)     (None, 1)                 0         
                                                                 
Total params: 1,091,855
Trainable params: 1,091,855
Non-trainable params: 0
_________________________________________________________________


### Fit the model on the training data

In [10]:
# Train for 10 passes over the padded training sequences; the returned
# History object is left as the cell's output.
model.fit(x=padded_sequences, y=y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe07b16f7f0>

### View model accuracy

In [12]:
# Encode the held-out texts with the tokenizer fitted on the training set and
# bring them to the same width the model was built for.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length)

# evaluate() returns the loss followed by the compiled metrics (accuracy only).
test_loss, test_accuracy = model.evaluate(test_padded_sequences, y_test)
print(f"Accuracy is: {test_accuracy}")

Accuracy is: 0.981374979019165


# Predictions

### Make a prediction for a random text from the test set

In [13]:
label_chosen = 1  # label to draw the example from (1 appears to mark clickbait in the data preview)
# --- change the value above; nothing below needs editing ---
# Pick one random test row that carries the chosen label. A vectorised boolean
# mask replaces the element-wise lambda, and the text is looked up explicitly
# by index label with .loc.
random_row = y_test[y_test == label_chosen].sample()
random_text = [X_test.loc[random_row.index[0]]]
test_response_seq = tokenizer.texts_to_sequences(random_text)
test_response_padded_sequences = pad_sequences(test_response_seq, maxlen=max_length)
# predict() returns one row per input text. Take that single row explicitly
# instead of calling max() on the 2-D array, which only works for one row.
prediction_ = model.predict(test_response_padded_sequences)[0]
print(f"\nText is: \n{random_text[0]}")
print(f"\nWith a probability of: {prediction_}")
print(f"\nThe prediction is: {round(prediction_[0])}")


Text is: 
How Many Jamie Lee Curtis Movies Have You Seen

With a probability of: [0.9997887]

The prediction is: 1


### Make a prediction for an invented text

In [16]:
test_text = '3 days ago I walked around town'
# --- change the text above; nothing below needs editing ---
# Encode and pad the text exactly like the training data.
test_sequ = tokenizer.texts_to_sequences([test_text])
test_padded_sequ = pad_sequences(test_sequ, maxlen=max_length)
# predict() returns one row per input text. Take that single row explicitly
# instead of calling max() on the 2-D array, which only works for one row.
pred = model.predict(test_padded_sequ)[0]
print(f"With a probability of: {pred}")
print(f"The prediction is: {round(pred[0])}")

With a probability of: [0.17083187]
The prediction is: 0
