In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn import set_config; set_config(display='diagram')
from tensorflow.keras.preprocessing.text import Tokenizer
import string
import os
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, Sequential

from tensorflow.keras.callbacks import EarlyStopping

KeyboardInterrupt: 

# Load the dataset already preprocessed

In [None]:
data = pd.read_csv('tokenized_df.csv')

In [3]:
## Check dataset if it's already cleaned
data

Unnamed: 0.1,Unnamed: 0,text,label
0,0,"['washington', 'reuters', 'head', 'conservativ...",1
1,1,"['washington', 'reuters', 'transgender', 'peop...",1
2,2,"['washington', 'reuters', 'special', 'counsel'...",1
3,3,"['washington', 'reuters', 'trump', 'campaign',...",1
4,4,"['seattlewashington', 'reuters', 'president', ...",1
...,...,...,...
93221,93221,"['email', 'released', 'wikileaks', 'sunday', '...",1
93222,93222,"['washington', 'reuters', 'hackers', 'believed...",0
93223,93223,"['know', 'fantasyland', 'republicans', 'never'...",1
93224,93224,"['migrants', 'refuse', 'leave', 'train', 'refu...",0


In [4]:
## Delete the first column

data = data.drop(columns= 'Unnamed: 0')

In [5]:
data

Unnamed: 0,text,label
0,"['washington', 'reuters', 'head', 'conservativ...",1
1,"['washington', 'reuters', 'transgender', 'peop...",1
2,"['washington', 'reuters', 'special', 'counsel'...",1
3,"['washington', 'reuters', 'trump', 'campaign',...",1
4,"['seattlewashington', 'reuters', 'president', ...",1
...,...,...
93221,"['email', 'released', 'wikileaks', 'sunday', '...",1
93222,"['washington', 'reuters', 'hackers', 'believed...",0
93223,"['know', 'fantasyland', 'republicans', 'never'...",1
93224,"['migrants', 'refuse', 'leave', 'train', 'refu...",0


# Splitting Data

In [6]:
## Splitting the data

X = data['text']             
y = data['label'] 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [7]:
X_train

80155    ['london', 'reuters', 'british', 'lawmakers', ...
44331    ['hillary', 'clinton', 'seeks', 'even', 'unfit...
22340    ['jake', 'tapper', 'laid', 'kellyanne', 'conwa...
82292    ['washington', 'reuters', 'us', 'democratic', ...
91496    ['washington', 'reuters', 'house', 'representa...
                               ...                        
5717     ['washington', 'reuters', 'top', 'republican',...
40622    ['belgian', 'historian', 'david', 'engels', 'c...
91165    ['mexico', 'city', 'reuters', 'negotiators', '...
86998    ['republicans', 'may', 'done', 'everything', '...
66687    ['president', 'trump', 'greeted', 'nhl', 'stan...
Name: text, Length: 65258, dtype: object

# Vectorized

In [8]:
# Initializing the tokenizer

tokenizer = Tokenizer()

# The tokenization learns a dictionary that maps a token (integer) to each word
# It can be done only on the train set - we are not supposed to know the test set!
# This tokenization also lowercases your words, apply some filters, and so on - you can check the doc if you want

tokenizer.fit_on_texts(X_train)

In [9]:
# We apply the tokenization to the train and test set

X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

In [10]:
X_train_token

[[710,
  12,
  648,
  299,
  3453,
  298,
  98,
  2500,
  40,
  16,
  97,
  471,
  172,
  843,
  6634,
  220,
  19,
  8849,
  420,
  13344,
  522,
  302,
  359,
  512,
  619,
  1049,
  1000,
  1410,
  512,
  769,
  1227,
  62,
  2279,
  707,
  126,
  274,
  648,
  299,
  1502,
  9696,
  16,
  843,
  52],
 [33,
  14,
  2299,
  34,
  4391,
  129,
  10,
  381,
  3271,
  17230,
  140,
  578,
  17231,
  732,
  12849,
  12424,
  33,
  14,
  5452,
  16665,
  4164,
  30929,
  2461,
  791,
  456,
  7,
  2546,
  791,
  2699,
  3137,
  1525,
  6745,
  1041,
  6745,
  75,
  268,
  1232,
  497,
  1078,
  1470,
  4,
  20,
  44,
  917,
  1485,
  129,
  10,
  1485,
  2080,
  949,
  824,
  7759,
  4687,
  26730,
  5546,
  2068,
  20,
  823,
  174,
  165,
  11178,
  430,
  280,
  4832,
  1841,
  3437,
  22628,
  1232,
  25,
  28,
  423,
  3149,
  129,
  10,
  5,
  6322,
  28,
  84,
  110,
  584,
  4,
  251,
  337,
  26103,
  4110,
  545,
  56,
  433,
  95,
  131,
  358,
  19,
  4603,
  44,
  34,
  2392,

In [11]:
# Calculating the number of different words in the training set

vocab_size = len(tokenizer.word_index)

vocab_size

231195

# Padding to make the input of the same length

In [12]:
X_train_padded = pad_sequences(X_train_token, dtype='float32', padding='post', maxlen = 500)

X_test_padded = pad_sequences(X_test_token, dtype='float32', padding='post', maxlen = 500)

In [13]:
#Check shape

X_train_padded.shape , X_test_padded.shape

((65258, 500), (27968, 500))

# Create the model

In [17]:
#Zein's model 

from tensorflow.keras import regularizers

reg_l1 = regularizers.L1(0.01)
reg_l2 = regularizers.L2(0.01)

model = Sequential([
    layers.Embedding(
    input_dim=vocab_size+1,
    output_dim= 30,
    mask_zero=True, ),
    layers.LSTM(10, activation = 'tanh'),
    layers.Dense(20, activation = 'relu'),
    #layers.Dropout(rate=0.2),
    layers.Dense(20, activation = 'relu'),
    #layers.Dropout(rate=0.2),
    layers.Dense(10, activation = 'relu'),
    layers.Dense(1, activation="sigmoid")
    

])

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])


In [18]:
# Early stopping and train the model

es = EarlyStopping(patience = 2)

model.fit(X_train_padded, y_train, callbacks = [es], epochs = 2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x193c63d00>

In [19]:
model.evaluate(X_test_padded, y_test)



[0.6308562755584717, 0.5632866024971008]