In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn import set_config; set_config(display='diagram')
from tensorflow.keras.preprocessing.text import Tokenizer
import string
import os
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Sequential

from tensorflow.keras.callbacks import EarlyStopping

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucaspicot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load the dataset already preprocessed

In [2]:
data = pd.read_csv("/Users/lucaspicot/code/Zen1400/fake_news/raw_data/tokenized_df.csv")

In [3]:
## Delete the first column

data = data.drop(columns= 'Unnamed: 0')

In [4]:
data

Unnamed: 0,text,label
0,"['washington', 'reuters', 'head', 'conservativ...",1
1,"['washington', 'reuters', 'transgender', 'peop...",1
2,"['washington', 'reuters', 'special', 'counsel'...",1
3,"['washington', 'reuters', 'trump', 'campaign',...",1
4,"['seattlewashington', 'reuters', 'president', ...",1
...,...,...
93221,"['email', 'released', 'wikileaks', 'sunday', '...",1
93222,"['washington', 'reuters', 'hackers', 'believed...",0
93223,"['know', 'fantasyland', 'republicans', 'never'...",1
93224,"['migrants', 'refuse', 'leave', 'train', 'refu...",0


In [5]:
indexes = np.random.randint(0, high=93225, size=40000) #try to catch 40% percent 


In [6]:
len(indexes)

40000

In [7]:
data = data.loc[indexes,:] #get only 40K rows

# Splitting Data

In [8]:
## Splitting the data

X = data['text']             
y = data['label'] 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)
y_train.astype('int8')
y_test.astype('int8')

31226    0
33806    0
6438     1
9116     1
64418    0
        ..
47278    1
89639    1
60753    0
47004    1
72169    0
Name: label, Length: 12000, dtype: int8

# Vectorized

In [10]:
from gensim.models import Word2Vec ##check window parameter after

word2vec = Word2Vec(sentences=X_train, vector_size=10, min_count=5)

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

In [12]:
X_test_embed = embedding(word2vec, X_test)

In [13]:
# Embed the training and test sentences
X_train_embed = embedding(word2vec, X_train)

# Padding to make the input of the same length

In [14]:
# Pad the training and test embedded sentences
X_train_padded = pad_sequences(X_train_embed, dtype='float32', padding='pre', maxlen=500, value = -99.0)

In [15]:
X_test_padded = pad_sequences(X_test_embed, dtype='float32', padding='pre', maxlen=500, value = -99.0)

In [16]:
#Check shape

X_train_padded.shape , X_test_padded.shape

((28000, 500, 10), (12000, 500, 10))

In [18]:
X_train_padded

array([[[-5.7712281e-01, -5.1936036e-01,  5.8623981e-01, ...,
          8.4643590e-01,  1.0535728e+00, -1.2819780e+00],
        [-1.1708661e+00,  9.0038085e-01,  5.2296704e-01, ...,
         -9.0019703e-01, -2.2798823e-01, -1.4007531e+00],
        [-3.4860459e-01,  9.4914734e-01, -4.7574520e-02, ...,
         -5.3607225e-01,  1.1720301e+00, -9.6821845e-01],
        ...,
        [ 3.1153548e-01,  1.0276473e+00,  4.0099910e-01, ...,
         -3.4593168e-01,  2.0931157e-01, -6.7018974e-01],
        [-3.4860459e-01,  9.4914734e-01, -4.7574520e-02, ...,
         -5.3607225e-01,  1.1720301e+00, -9.6821845e-01],
        [-1.4375784e+00,  1.3912442e-03,  5.0007927e-01, ...,
         -4.8208591e-01,  1.9115931e+00, -2.2102029e+00]],

       [[-1.0062619e-02,  8.4898734e-01,  5.7506561e-01, ...,
         -3.7789342e-01, -5.8738768e-02, -5.5502898e-01],
        [-8.0681944e-01, -4.2781797e-01, -2.4429040e-01, ...,
          5.6494439e-01,  1.0326473e+00, -2.8137615e-01],
        [-1.1708661e+00, 

# Create the model

In [19]:
from tensorflow.keras import regularizers

In [23]:
#Zein's model 



reg_l1 = regularizers.L1(0.01)
reg_l2 = regularizers.L2(0.01)

model = Sequential([
    layers.Masking(mask_value= -99.0),
    layers.LSTM(20, activation='tanh'),
    layers.Dense(20, activation = 'relu', kernel_regularizer=reg_l1),
    layers.Dropout(rate=0.2),
    layers.Dense(15, activation = 'relu', kernel_regularizer=reg_l1 ),
    layers.Dropout(rate=0.2),
    layers.Dense(10, activation = 'relu', kernel_regularizer=reg_l1),
    layers.Dropout(rate=0.2),
    layers.Dense(1, activation="sigmoid")
    

])

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


In [24]:
# Early stopping and train the model

es = EarlyStopping(patience = 15, verbose=2, monitor='val_loss', restore_best_weights = True)

model.fit(X_train_padded, y_train, batch_size = 16, verbose=2 ,callbacks = [es], validation_split=0.15, epochs = 100)

Epoch 1/100
1488/1488 - 156s - loss: 0.8577 - accuracy: 0.5041 - val_loss: 0.6975 - val_accuracy: 0.4990
Epoch 2/100
1488/1488 - 154s - loss: 0.6974 - accuracy: 0.5011 - val_loss: 0.6976 - val_accuracy: 0.4990
Epoch 3/100
1488/1488 - 153s - loss: 0.6974 - accuracy: 0.5031 - val_loss: 0.6975 - val_accuracy: 0.5010
Epoch 4/100
1488/1488 - 154s - loss: 0.6974 - accuracy: 0.5014 - val_loss: 0.6975 - val_accuracy: 0.4990
Epoch 5/100
1488/1488 - 154s - loss: 0.6974 - accuracy: 0.5000 - val_loss: 0.6975 - val_accuracy: 0.4990
Epoch 6/100
1488/1488 - 154s - loss: 0.6974 - accuracy: 0.5014 - val_loss: 0.6975 - val_accuracy: 0.4990
Epoch 7/100
1488/1488 - 153s - loss: 0.6974 - accuracy: 0.5016 - val_loss: 0.6975 - val_accuracy: 0.4990
Epoch 8/100
1488/1488 - 154s - loss: 0.6974 - accuracy: 0.4958 - val_loss: 0.6975 - val_accuracy: 0.4990
Epoch 9/100
1488/1488 - 152s - loss: 0.6974 - accuracy: 0.5050 - val_loss: 0.6976 - val_accuracy: 0.4990
Epoch 10/100
1488/1488 - 152s - loss: 0.6974 - accuracy

<tensorflow.python.keras.callbacks.History at 0x19364f940>

In [None]:
model.evaluate(X_test_padded, y_test)