In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn import set_config; set_config(display='diagram')
from tensorflow.keras.preprocessing.text import Tokenizer
import string
import os
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Sequential

from tensorflow.keras.callbacks import EarlyStopping

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucaspicot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load the dataset already preprocessed

In [2]:
data = pd.read_csv('tokenized_df.csv')

In [3]:
## Delete the first column

data = data.drop(columns= 'Unnamed: 0')

In [4]:
data

Unnamed: 0,text,label
0,"['washington', 'reuters', 'head', 'conservativ...",1
1,"['washington', 'reuters', 'transgender', 'peop...",1
2,"['washington', 'reuters', 'special', 'counsel'...",1
3,"['washington', 'reuters', 'trump', 'campaign',...",1
4,"['seattlewashington', 'reuters', 'president', ...",1
...,...,...
93221,"['email', 'released', 'wikileaks', 'sunday', '...",1
93222,"['washington', 'reuters', 'hackers', 'believed...",0
93223,"['know', 'fantasyland', 'republicans', 'never'...",1
93224,"['migrants', 'refuse', 'leave', 'train', 'refu...",0


In [5]:
indexes = np.random.randint(0, high=93225, size=40000) #try to catch 40% percent 


In [6]:
len(indexes)

40000

In [7]:
data = data.loc[indexes,:] #get only 40K rows

# Splitting Data

In [8]:
## Splitting the data

X = data['text']             
y = data['label'] 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
y_train.astype('int8')
y_test.astype('int8')

1766     1
43584    1
70624    1
90360    0
22232    0
        ..
8057     1
11856    1
77743    0
75704    0
67738    0
Name: label, Length: 12000, dtype: int8

# Vectorized

In [9]:
from gensim.models import Word2Vec ##check window parameter after

word2vec = Word2Vec(sentences=X_train, vector_size=10, min_count=5)

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

In [11]:
X_test_embed = embedding(word2vec, X_test)

In [12]:
# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)

# Padding to make the input of the same length

In [13]:
# Pad the training and test embedded sentences
X_train_padded = pad_sequences(X_train_embed, dtype='float32', padding='pre', maxlen=500, value = -99.0)

In [14]:
X_test_padded = pad_sequences(X_test_embed, dtype='float32', padding='pre', maxlen=500, value = -99.0)

In [15]:
#Check shape

X_train_padded.shape , X_test_padded.shape

((28000, 500, 10), (12000, 500, 10))

In [16]:
X_train_padded

array([[[-9.9000000e+01, -9.9000000e+01, -9.9000000e+01, ...,
         -9.9000000e+01, -9.9000000e+01, -9.9000000e+01],
        [-9.9000000e+01, -9.9000000e+01, -9.9000000e+01, ...,
         -9.9000000e+01, -9.9000000e+01, -9.9000000e+01],
        [-9.9000000e+01, -9.9000000e+01, -9.9000000e+01, ...,
         -9.9000000e+01, -9.9000000e+01, -9.9000000e+01],
        ...,
        [ 8.4064347e-01, -3.2482895e-01,  1.5019696e+00, ...,
         -2.4320903e-01, -2.1022305e-01,  4.9791235e-01],
        [ 1.1854537e+00,  4.4227305e-01, -1.8064916e-01, ...,
         -9.9707144e-01,  2.0635910e-01,  1.4725028e-01],
        [ 2.1605468e+00,  2.2353104e-01,  2.1596248e+00, ...,
         -9.5733780e-01,  9.3649161e-01, -1.2988801e-01]],

       [[ 2.0217366e+00,  1.6488731e+00,  1.0038257e+00, ...,
         -3.4896666e-01,  7.7490789e-01, -4.7623243e-02],
        [ 1.1854537e+00,  4.4227305e-01, -1.8064916e-01, ...,
         -9.9707144e-01,  2.0635910e-01,  1.4725028e-01],
        [ 4.6736097e-01, 

# Create the model

In [17]:
from tensorflow.keras import regularizers

In [18]:
#Zein's model 



reg_l1 = regularizers.L1(0.01)
reg_l2 = regularizers.L2(0.01)

model = Sequential([
    layers.Masking(mask_value= -99.0),
    layers.LSTM(40, activation='tanh'),
    layers.Dense(40, activation = 'relu', kernel_regularizer=reg_l1),
    layers.Dropout(rate=0.2),
    layers.Dense(20, activation = 'relu', kernel_regularizer=reg_l1 ),
    layers.Dropout(rate=0.2),
    layers.Dense(10, activation = 'relu', kernel_regularizer=reg_l1),
    layers.Dropout(rate=0.2),
    layers.Dense(1, activation="sigmoid")
    

])

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


2022-09-09 10:43:14.885080: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Early stopping and train the model

es = EarlyStopping(patience = 15, verbose=2, monitor='val_loss', restore_best_weights = True)

model.fit(X_train_padded, y_train, batch_size = 16, verbose=2 ,callbacks = [es], validation_split=0.15, epochs = 100)

Epoch 1/100
1488/1488 - 156s - loss: 0.7061 - accuracy: 0.4992 - val_loss: 0.7060 - val_accuracy: 0.5150
Epoch 2/100
1488/1488 - 159s - loss: 0.7061 - accuracy: 0.4994 - val_loss: 0.7058 - val_accuracy: 0.5150
Epoch 3/100
1488/1488 - 167s - loss: 0.7062 - accuracy: 0.5018 - val_loss: 0.7060 - val_accuracy: 0.5150
Epoch 4/100
1488/1488 - 168s - loss: 0.7062 - accuracy: 0.4986 - val_loss: 0.7059 - val_accuracy: 0.5150
Epoch 5/100
1488/1488 - 186s - loss: 0.7061 - accuracy: 0.5001 - val_loss: 0.7060 - val_accuracy: 0.5150
Epoch 6/100


In [None]:
model.evaluate(X_test_padded, y_test)