In [None]:
import os
import tarfile
import urllib.request

import numpy as np
from keras.layers import Dense, Embedding, SimpleRNN, GRU, LSTM, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
data_file = "aclImdb_v1.tar.gz"

# Skip the download when the archive is already on disk so that re-running the
# notebook does not fetch it again. Delete the file to force a fresh download.
if not os.path.exists(data_file):
    urllib.request.urlretrieve(url, data_file)

# Skip extraction when the dataset folder already exists. The context manager
# closes the archive even if extraction fails part-way through.
if not os.path.isdir("aclImdb"):
    with tarfile.open(data_file, "r:gz") as tar:
        tar.extractall()

In [None]:
!ls aclImdb/train/pos

0_9.txt       11607_10.txt  1964_7.txt	 3571_7.txt   5178_10.txt  6787_9.txt	8395_8.txt
10000_8.txt   11608_10.txt  1965_7.txt	 3572_10.txt  5179_7.txt   6788_9.txt	8396_8.txt
10001_10.txt  11609_10.txt  1966_10.txt  3573_7.txt   5180_9.txt   678_8.txt	8397_10.txt
10002_7.txt   11610_10.txt  1967_8.txt	 3574_10.txt  518_10.txt   6789_8.txt	839_7.txt
10003_8.txt   116_10.txt    1968_8.txt	 3575_10.txt  5181_10.txt  6790_10.txt	8398_8.txt
10004_8.txt   11611_9.txt   1969_10.txt  3576_9.txt   5182_8.txt   679_10.txt	8399_10.txt
10005_7.txt   11612_10.txt  196_9.txt	 3577_7.txt   5183_9.txt   6791_10.txt	8400_7.txt
10006_7.txt   11613_7.txt   1970_9.txt	 3578_7.txt   5184_9.txt   6792_10.txt	8401_10.txt
10007_7.txt   11614_7.txt   1971_9.txt	 3579_8.txt   5185_10.txt  6793_10.txt	8402_10.txt
10008_7.txt   11615_9.txt   1972_10.txt  3580_10.txt  5186_10.txt  6794_10.txt	8403_10.txt
1000_8.txt    11616_8.txt   1973_8.txt	 358_10.txt   5187_7.txt   6795_9.txt	8404_10.txt
10009_9.txt   11617_9

In [None]:
!ls aclImdb/train/unsup

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
11926_0.txt  18355_0.txt  24784_0.txt  31211_0.txt  37641_0.txt  4407_0.txt   5500_0.txt
11927_0.txt  18356_0.txt  24785_0.txt  31212_0.txt  37642_0.txt  44071_0.txt  550_0.txt
11928_0.txt  18357_0.txt  24786_0.txt  31213_0.txt  37643_0.txt  44072_0.txt  5501_0.txt
11929_0.txt  18358_0.txt  24787_0.txt  31214_0.txt  37644_0.txt  44073_0.txt  5502_0.txt
11930_0.txt  18359_0.txt  24788_0.txt  31215_0.txt  37645_0.txt  44074_0.txt  5503_0.txt
1193_0.txt   18360_0.txt  24789_0.txt  31216_0.txt  37646_0.txt  44075_0.txt  5504_0.txt
11931_0.txt  1836_0.txt   24790_0.txt  31217_0.txt  37647_0.txt  44076_0.txt  5505_0.txt
11932_0.txt  18361_0.txt  2479_0.txt   31218_0.txt  37648_0.txt  44077_0.txt  5506_0.txt
11933_0.txt  18362_0.txt  24791_0.txt  31219_0.txt  37649_0.txt  44078_0.txt  5507_0.txt
11934_0.txt  18363_0.txt  24792_0.txt  31220_0.txt  37650_0.txt  44079_0.txt  5508_0.txt
11935_0.txt  18364_0.txt  24793_0.txt  3122_0.

In [None]:
def load_data(folder, num_samples):
    """Read up to ``num_samples`` reviews from ``folder``, half per sentiment.

    Args:
        folder: Directory that contains a ``pos`` and a ``neg`` sub-directory
            of text files.
        num_samples: Total number of reviews wanted; ``num_samples // 2`` files
            are taken from each sub-directory (fewer if it holds fewer files).

    Returns:
        A ``(data, labels)`` pair of equally long lists: the review texts and
        their labels (1 for ``pos``, 0 for ``neg``), positives first.
    """
    data = []
    labels = []
    per_class = num_samples // 2
    for sentiment in ["pos", "neg"]:
        sentiment_folder = os.path.join(folder, sentiment)
        label = 1 if sentiment == "pos" else 0
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # selected subset identical from run to run.
        for filename in sorted(os.listdir(sentiment_folder))[:per_class]:
            with open(os.path.join(sentiment_folder, filename), "r", encoding="utf-8") as file:
                data.append(file.read())
            labels.append(label)
    return data, labels

In [None]:
# Load a balanced subset of the labelled reviews for training and testing.
num_samples = 5000

folder = "aclImdb/train"
train_data, train_labels = load_data(folder, num_samples)
print("Number of samples for training: ", len(train_data))

folder = "aclImdb/test"
test_data, test_labels = load_data(folder, num_samples)
print("Number of samples for testing: ", len(test_data))

train_data, val_data, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.25, random_state=42)

# The models below start with an Embedding layer, which looks up one vector per
# integer word id. Each review is therefore encoded as the ordered sequence of
# its word ids. (Bag-of-words count rows are not word-id sequences, so padding
# them and feeding them to an Embedding layer gives the network no usable signal.)
max_features = 5000  # vocabulary size; must match Embedding(input_dim=...) in the models
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_data)  # vocabulary is learned from the training split only
print("Vocabulary size: ", min(max_features, len(tokenizer.word_index) + 1))

# Pad or truncate every review to exactly `maxlen` word ids.
maxlen = 150
train_features = pad_sequences(tokenizer.texts_to_sequences(train_data), maxlen=maxlen)
val_features = pad_sequences(tokenizer.texts_to_sequences(val_data), maxlen=maxlen)
test_features = pad_sequences(tokenizer.texts_to_sequences(test_data), maxlen=maxlen)

print("Shape of train_features: ", train_features.shape)
print("Shape of val_features: ", val_features.shape)
print("Shape of test_features: ", test_features.shape)

# Labels become column vectors of shape (n, 1), matching the single sigmoid output.
train_labels = np.array(train_labels).reshape(-1, 1)
val_labels = np.array(val_labels).reshape(-1, 1)
test_labels = np.array(test_labels).reshape(-1, 1)

print("Shape of train_labels: ", train_labels.shape)
print("Shape of val_labels: ", val_labels.shape)
print("Shape of test_labels: ", test_labels.shape)

Number of samples for training:  5000
Number of samples for testing:  5000
CountVectorizer(max_features=5000)
Shape of train_features:  (3750, 150)
Shape of val_features:  (1250, 150)
Shape of test_features:  (5000, 150)
Shape of train_labels:  (3750, 1)
Shape of val_labels:  (1250, 1)
Shape of test_labels:  (5000, 1)


In [None]:
def evaluate_model(model, test_features, test_labels):
    """Score ``model`` on the given features against ``test_labels``.

    The model's predictions are rounded to hard labels and flattened to one
    dimension before scoring.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    hard_labels = np.round(model.predict(test_features)).flatten()

    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(test_labels, hard_labels) for scorer in scorers)


def build_and_train_model(model_type, maxlen, train_features, train_labels, val_features, val_labels,
                          vocab_size=5000, epochs=15, batch_size=128):
    """Build, train and validate one Embedding -> recurrent -> sigmoid classifier.

    Args:
        model_type: One of ``"RNN"``, ``"GRU"``, ``"LSTM"`` or ``"BiLSTM"``.
        maxlen: Length of every input sequence.
        train_features, train_labels: Data the model is fitted on.
        val_features, val_labels: Data monitored during training and used to
            compute the returned metrics.
        vocab_size: ``input_dim`` of the Embedding layer; word ids must be
            smaller than this value.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used for training.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` measured on the
        validation data.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Factories rather than instances, so that only the requested layer is created.
    recurrent_layers = {
        "RNN": lambda: SimpleRNN(128),
        "GRU": lambda: GRU(128),
        "LSTM": lambda: LSTM(128),
        "BiLSTM": lambda: Bidirectional(LSTM(128)),
    }
    # Fail early: an unknown name used to yield a model without any recurrent layer.
    if model_type not in recurrent_layers:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(recurrent_layers)}"
        )

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=maxlen))
    model.add(recurrent_layers[model_type]())
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Passing the validation data makes over-fitting visible in the per-epoch log.
    model.fit(train_features, train_labels, validation_data=(val_features, val_labels),
              epochs=epochs, batch_size=batch_size, verbose=1)

    # Round the sigmoid outputs once and reuse them for every metric.
    predicted = model.predict(val_features).round()
    accuracy = accuracy_score(val_labels, predicted)
    precision = precision_score(val_labels, predicted)
    recall = recall_score(val_labels, predicted)
    f1 = f1_score(val_labels, predicted)

    return accuracy, precision, recall, f1

# Train each architecture in turn and collect its validation metrics by name.
models = ['RNN', 'GRU', 'LSTM', 'BiLSTM']
results = {}
metric_names = ('Accuracy', 'Precision', 'Recall', 'F1-score')

for model_type in models:
    print("Training", model_type)
    scores = build_and_train_model(model_type, maxlen, train_features, train_labels, val_features, val_labels)
    results[model_type] = dict(zip(metric_names, scores))

# Plain-text report: one block per model, one line per metric.
for model_type, metrics in results.items():
    print(model_type)
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    print()

Training RNN
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training GRU
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training LSTM
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Training BiLSTM
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
RNN
Accuracy: 0.584
Precision: 0.5633971291866029
Recall: 0.7523961661341853
F1-score: 0.6443228454172367

GRU
Accuracy: 0.5224
Precision: 0.5221374045801527
Recall: 0.5463258785942492
F1-score: 0.5339578454332553

LSTM
Accuracy: 0.572
Precision: 0.57187993680

In [None]:
# The scores in `results` were computed on the validation split, so the columns
# are labelled "Val" rather than "Test". Fixed-width columns keep the table
# aligned regardless of how tabs are rendered.
columns = ['Accuracy', 'Precision', 'Recall', 'F1-score']
print(f"{'Model':<8}" + "".join(f"{'Val ' + name:>16}" for name in columns))
for model_type in models:
    scores = results[model_type]
    print(f"{model_type:<8}" + "".join(f"{scores[name]:>16.4f}" for name in columns))

Model		Test Accuracy	Test Precision	Test Recall	Test F1-score
RNN			0.5840		0.5634		0.7524		0.6443
GRU			0.5224		0.5221		0.5463		0.5340
LSTM			0.5720		0.5719		0.5783		0.5751
BiLSTM			0.6016		0.5922		0.6565		0.6227


In [None]:
import os
import pandas as pd

pos_folder = 'aclImdb/train/pos'
unsup_folder = 'aclImdb/train/unsup'

def read_text_files(folder):
    """Read every file in ``folder`` as UTF-8 text.

    Args:
        folder: Path of a directory containing text files.

    Returns:
        A list with one ``{'text': <file contents>}`` dict per file, ordered by
        file name.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and any seeded split made from it later) reproducible.
    for filename in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data.append({'text': file.read()})
    return data

# The files in train/unsup are unlabelled (their rating suffix is 0), so they
# cannot serve as the negative class. The labelled negative reviews live in
# train/neg; use those for label 0.
neg_folder = 'aclImdb/train/neg'

pos_df = pd.DataFrame(read_text_files(pos_folder)).assign(label=1)
neg_df = pd.DataFrame(read_text_files(neg_folder)).assign(label=0)

# One frame with a 'text' and a 'label' column, positives first.
combined_df = pd.concat([pos_df, neg_df], ignore_index=True)

combined_df.to_csv('Large Movie Review Dataset.csv', index=False)

print("CSV file created successfully.")

CSV file created successfully.


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Vocabulary size and fixed sequence length shared by all four models.
max_words = 10000
max_len = 200

# Encode every review as a padded sequence of integer word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(combined_df['text'])
X = tokenizer.texts_to_sequences(combined_df['text'])
y = combined_df['label']
X_padded = pad_sequences(X, maxlen=max_len)

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.25, random_state=42)

# The four networks differ only in their recurrent core, so describe the cores
# once and wrap each one in the same Embedding -> core -> sigmoid stack.
recurrent_cores = {
    "RNN": lambda: SimpleRNN(64),
    "GRU": lambda: GRU(64),
    "LSTM": lambda: LSTM(64),
    "BiLSTM": lambda: Bidirectional(LSTM(64)),
}
models = {
    name: Sequential([
        Embedding(input_dim=max_words, output_dim=100, input_length=max_len),
        make_core(),
        Dense(1, activation='sigmoid'),
    ])
    for name, make_core in recurrent_cores.items()
}

results = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=5, batch_size=128, verbose=1)

    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
    probabilities = model.predict(X_test)
    y_pred = (probabilities > 0.5).astype("int32")

    accuracy, precision, recall, f1 = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )

    results[model_name] = dict(
        zip(("Accuracy", "Precision", "Recall", "F1-Score"), (accuracy, precision, recall, f1))
    )

Training RNN...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training GRU...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training LSTM...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training BiLSTM...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Tab-separated summary: one row per model, four decimals per metric.
print("\nResults:")
print("Model\t\tAccuracy\tPrecision\tRecall\t\tF1-Score")
for model_name, metrics in results.items():
    cells = [model_name] + [f"{metrics[key]:.4f}" for key in ("Accuracy", "Precision", "Recall", "F1-Score")]
    print("\t\t".join(cells))


Results:
Model		Accuracy	Precision	Recall		F1-Score
RNN		0.7085		0.2645		0.2574		0.2609
GRU		0.7594		0.3942		0.3801		0.3870
LSTM		0.7661		0.4179		0.4332		0.4254
BiLSTM		0.7725		0.4068		0.3010		0.3460


In [None]:
!pip install fasttext

In [None]:
import gensim.downloader as api
from gensim.models import KeyedVectors
import fasttext

# Pre-trained word vectors fetched through the gensim downloader. For each model
# keep the raw vector matrix and the number of words it covers.
w2v_model = api.load("word2vec-google-news-300")
word2vec_embeddings = w2v_model.vectors
word2vec_vocab_size = len(w2v_model.key_to_index)

ft_model = api.load("fasttext-wiki-news-subwords-300")
fasttext_embeddings = ft_model.vectors
fasttext_vocab_size = len(ft_model.key_to_index)

print("Word2Vec vocabulary size:", word2vec_vocab_size)
print("FastText vocabulary size:", fasttext_vocab_size)



**RNN WITH PRE-TRAINED EMBEDDING**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from scipy.sparse import issparse

def sort_sparse_indices(X):
    """Sort the indices of ``X`` in place when it is a SciPy sparse matrix.

    Any other input is left untouched. Nothing is returned.
    """
    if not issparse(X):
        return
    X.sort_indices()

def build_rnn_model(input_shape):
    """Return an uncompiled two-layer dense binary classifier.

    Note that, despite the name, the network contains no recurrent layer: it is
    a 64-unit ReLU layer followed by a single sigmoid unit.

    Args:
        input_shape: Shape tuple of one input sample, e.g. ``(n_features,)``.
    """
    return Sequential([
        Dense(64, input_shape=input_shape, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

# Hold out 25% of the training reviews for scoring. X_* are lists of raw texts,
# y_* are 1-D label arrays; both stay aligned for the rest of the cell.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, np.asarray(train_labels).ravel(), test_size=0.25, random_state=42
)


def average_word_vectors(texts, keyed_vectors):
    """Represent each text as the mean of its in-vocabulary word vectors.

    Texts without any known word are left as all-zero rows.
    """
    tokenize = CountVectorizer().build_analyzer()
    features = np.zeros((len(texts), keyed_vectors.vector_size), dtype="float32")
    for row, text in enumerate(texts):
        known = [tok for tok in tokenize(text) if tok in keyed_vectors.key_to_index]
        if known:
            features[row] = keyed_vectors[known].mean(axis=0)
    return features


def fit_and_score(model, X_fit, y_fit, X_eval, y_eval, **fit_kwargs):
    """Compile and train ``model``, then return (accuracy, precision, recall, f1) on the eval data."""
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_fit, y_fit, epochs=8, batch_size=32, **fit_kwargs)
    y_hat = np.round(model.predict(X_eval)).ravel()
    return (accuracy_score(y_eval, y_hat), precision_score(y_eval, y_hat),
            recall_score(y_eval, y_hat), f1_score(y_eval, y_hat))


# --- Bag-of-words features ---------------------------------------------------
vectorizer = CountVectorizer(max_features=10000)
X_bow = vectorizer.fit_transform(X_train)
sort_sparse_indices(X_bow)

# Separate names for this inner split so that y_train keeps matching X_train.
X_train_bow, X_val_bow, y_fit_bow, y_val_bow = train_test_split(X_bow, y_train, test_size=0.1, random_state=42)

X_test_bow = vectorizer.transform(X_test)
sort_sparse_indices(X_test_bow)

input_shape = (X_train_bow.shape[1],)
rnn_model_bow = build_rnn_model(input_shape)
accuracy_rnn_bow, precision_rnn_bow, recall_rnn_bow, f1_score_rnn_bow = fit_and_score(
    rnn_model_bow, X_train_bow, y_fit_bow, X_test_bow, y_test,
    validation_data=(X_val_bow, y_val_bow),
)

# --- Averaged Word2Vec features ----------------------------------------------
X_train_w2v = average_word_vectors(X_train, w2v_model)
X_test_w2v = average_word_vectors(X_test, w2v_model)

rnn_model_w2v = build_rnn_model((X_train_w2v.shape[1],))
accuracy_rnn_w2v, precision_rnn_w2v, recall_rnn_w2v, f1_score_rnn_w2v = fit_and_score(
    rnn_model_w2v, X_train_w2v, y_train, X_test_w2v, y_test, validation_split=0.1
)

# --- Averaged FastText features ----------------------------------------------
X_train_ft = average_word_vectors(X_train, ft_model)
X_test_ft = average_word_vectors(X_test, ft_model)

rnn_model_ft = build_rnn_model((X_train_ft.shape[1],))
accuracy_rnn_ft, precision_rnn_ft, recall_rnn_ft, f1_score_rnn_ft = fit_and_score(
    rnn_model_ft, X_train_ft, y_train, X_test_ft, y_test, validation_split=0.1
)

# --- Comparison table --------------------------------------------------------
models = ['RNN with default Embedding','RNN with BoW', 'RNN with Word2Vec', 'RNN with FastText']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# The default-embedding row reuses the RNN scores stored in `results`; note that
# those were measured on the split used when `results` was filled.
baseline = results['RNN']

data = [
    [baseline['Accuracy'], baseline['Precision'], baseline['Recall'], baseline['F1-Score']],
    [accuracy_rnn_bow, precision_rnn_bow, recall_rnn_bow, f1_score_rnn_bow],
    [accuracy_rnn_w2v, precision_rnn_w2v, recall_rnn_w2v, f1_score_rnn_w2v],
    [accuracy_rnn_ft, precision_rnn_ft, recall_rnn_ft, f1_score_rnn_ft]
]

print('| Model | ' + ' | '.join(metrics) + ' |')
print('|-------|----------|-----------|--------|----------|')
# Loop names are chosen so that the sklearn metric functions are not shadowed.
for model_name, (acc, prec, rec, f1) in zip(models, data):
    print(f'| {model_name} | {acc:.4f} | {prec:.4f} | {rec:.4f} | {f1:.4f} |')

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Model                            Accuracy    Precision    Recall    F1-score
-----------------------------  ----------  -----------  --------  ----------
RNN with defualt Embedding       0.488455     0.485507  0.478571    0.482014
RNN with BOW                     0.502664     0.5       0.510714    0.5053
RNN with pre-trained Word2vec    0.484902     0.482517  0.492857    0.487633
RNN with pre-trained FastText    0.49556      0.493056  0.507143    0.5


**GRU WITH PRE-TRAINED EMBEDDINGS**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def build_gru_with_pretrained_embedding(embedding_matrix):
    """Return an uncompiled GRU classifier on top of a frozen embedding layer.

    Args:
        embedding_matrix: 2-D array with one row per word id; it initialises
            the Embedding layer, which is not trained further.
    """
    n_rows = embedding_matrix.shape[0]
    vector_dim = embedding_matrix.shape[1]
    frozen_embedding = Embedding(input_dim=n_rows, output_dim=vector_dim,
                                 weights=[embedding_matrix], input_length=maxlen, trainable=False)
    return Sequential([
        frozen_embedding,
        GRU(64),
        Dense(1, activation='sigmoid'),
    ])

def build_gru_with_bow_embedding():
    """Return an uncompiled dense classifier sized for the global ``X_train_bow``.

    The input width is read from ``X_train_bow`` at call time. Despite the
    name, the network has no GRU layer: a 64-unit ReLU layer feeds one sigmoid unit.
    """
    n_features = X_train_bow.shape[1]
    return Sequential([
        Dense(64, input_shape=(n_features,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

# Re-derive the text/label split here so the cell does not depend on names that
# other cells may have overwritten. X_* are raw texts, y_* are 1-D label arrays.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, np.asarray(train_labels).ravel(), test_size=0.25, random_state=42
)


def make_embedding_matrix(keyed_vectors, word_index, vocab_size):
    """Build a (vocab_size, dim) matrix whose row i is the pre-trained vector of word id i.

    Ids without a pre-trained vector (including padding id 0) stay all-zero.
    """
    matrix = np.zeros((vocab_size, keyed_vectors.vector_size), dtype="float32")
    for word, idx in word_index.items():
        if idx < vocab_size and word in keyed_vectors.key_to_index:
            matrix[idx] = keyed_vectors[word]
    return matrix


def fit_and_score(model, X_fit, y_fit, X_eval, y_eval, **fit_kwargs):
    """Compile and train ``model``, then return (accuracy, precision, recall, f1) on the eval data."""
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_fit, y_fit, epochs=8, batch_size=32, **fit_kwargs)
    y_hat = np.round(model.predict(X_eval)).ravel()
    return (accuracy_score(y_eval, y_hat), precision_score(y_eval, y_hat),
            recall_score(y_eval, y_hat), f1_score(y_eval, y_hat))


# --- Bag-of-words features ---------------------------------------------------
# Dense float arrays are used because `validation_split` does not accept SciPy
# sparse matrices.
vectorizer = CountVectorizer(max_features=10000)
X_train_bow = vectorizer.fit_transform(X_train).toarray().astype("float32")
X_test_bow = vectorizer.transform(X_test).toarray().astype("float32")

gru_model_bow = build_gru_with_bow_embedding()
accuracy_gru_bow, precision_gru_bow, recall_gru_bow, f1_score_gru_bow = fit_and_score(
    gru_model_bow, X_train_bow, y_train, X_test_bow, y_test, validation_split=0.1
)

# --- Word-id sequences for the embedding models --------------------------------
vocab_size = 10000
seq_tokenizer = Tokenizer(num_words=vocab_size)
seq_tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(seq_tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
X_test_seq = pad_sequences(seq_tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

# The embedding rows must line up with the tokenizer's word ids, so a matrix is
# built per vocabulary instead of passing the raw pre-trained matrix.
gru_model_w2v = build_gru_with_pretrained_embedding(
    make_embedding_matrix(w2v_model, seq_tokenizer.word_index, vocab_size)
)
accuracy_gru_w2v, precision_gru_w2v, recall_gru_w2v, f1_score_gru_w2v = fit_and_score(
    gru_model_w2v, X_train_seq, y_train, X_test_seq, y_test, validation_split=0.1
)

gru_model_ft = build_gru_with_pretrained_embedding(
    make_embedding_matrix(ft_model, seq_tokenizer.word_index, vocab_size)
)
accuracy_gru_ft, precision_gru_ft, recall_gru_ft, f1_score_gru_ft = fit_and_score(
    gru_model_ft, X_train_seq, y_train, X_test_seq, y_test, validation_split=0.1
)

# --- Comparison table --------------------------------------------------------
models = ['GRU with default Embedding','GRU with BoW', 'GRU with Word2Vec', 'GRU with FastText']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# The default-embedding row reuses the GRU scores stored in `results`; note that
# those were measured on the split used when `results` was filled.
baseline = results['GRU']

data = [
    [baseline['Accuracy'], baseline['Precision'], baseline['Recall'], baseline['F1-Score']],
    [accuracy_gru_bow, precision_gru_bow, recall_gru_bow, f1_score_gru_bow],
    [accuracy_gru_w2v, precision_gru_w2v, recall_gru_w2v, f1_score_gru_w2v],
    [accuracy_gru_ft, precision_gru_ft, recall_gru_ft, f1_score_gru_ft]
]

print('| Model | ' + ' | '.join(metrics) + ' |')
print('|-------|----------|-----------|--------|----------|')
# Loop names are chosen so that the sklearn metric functions are not shadowed.
for model_name, (acc, prec, rec, f1) in zip(models, data):
    print(f'| {model_name} | {acc:.4f} | {prec:.4f} | {rec:.4f} | {f1:.4f} |')

                        Model Accuracy Precision Recall F1 Score
                 GRU with BoW   0.8100    0.8300 0.8200   0.8250
  GRU with default embeddings   0.8300    0.8400 0.8300   0.8350
GRU with pre-trained Word2Vec   0.8400    0.8500 0.8400   0.8450
GRU with pre-trained FastText   0.8600    0.8700 0.8600   0.8650


**LSTM WITH PRE-TRAINED EMBEDDINGS**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def build_lstm_with_pretrained_embedding(embedding_matrix):
    """Return an uncompiled LSTM classifier on top of a frozen embedding layer.

    Args:
        embedding_matrix: 2-D array with one row per word id; it initialises
            the Embedding layer, which is not trained further.
    """
    n_rows = embedding_matrix.shape[0]
    vector_dim = embedding_matrix.shape[1]
    frozen_embedding = Embedding(input_dim=n_rows, output_dim=vector_dim,
                                 weights=[embedding_matrix], input_length=maxlen, trainable=False)
    return Sequential([
        frozen_embedding,
        LSTM(64),
        Dense(1, activation='sigmoid'),
    ])

def build_lstm_with_bow_embedding():
    """Return an uncompiled dense classifier sized for the global ``X_train_bow``.

    The input width is read from ``X_train_bow`` at call time. Despite the
    name, the network has no LSTM layer: a 64-unit ReLU layer feeds one sigmoid unit.
    """
    n_features = X_train_bow.shape[1]
    return Sequential([
        Dense(64, input_shape=(n_features,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

# Re-derive the text/label split here so the cell does not depend on names that
# other cells may have overwritten. X_* are raw texts, y_* are 1-D label arrays.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, np.asarray(train_labels).ravel(), test_size=0.25, random_state=42
)


def make_embedding_matrix(keyed_vectors, word_index, vocab_size):
    """Build a (vocab_size, dim) matrix whose row i is the pre-trained vector of word id i.

    Ids without a pre-trained vector (including padding id 0) stay all-zero.
    """
    matrix = np.zeros((vocab_size, keyed_vectors.vector_size), dtype="float32")
    for word, idx in word_index.items():
        if idx < vocab_size and word in keyed_vectors.key_to_index:
            matrix[idx] = keyed_vectors[word]
    return matrix


def fit_and_score(model, X_fit, y_fit, X_eval, y_eval, **fit_kwargs):
    """Compile and train ``model``, then return (accuracy, precision, recall, f1) on the eval data."""
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_fit, y_fit, epochs=8, batch_size=32, **fit_kwargs)
    y_hat = np.round(model.predict(X_eval)).ravel()
    return (accuracy_score(y_eval, y_hat), precision_score(y_eval, y_hat),
            recall_score(y_eval, y_hat), f1_score(y_eval, y_hat))


# --- Bag-of-words features ---------------------------------------------------
# Dense float arrays are used because `validation_split` does not accept SciPy
# sparse matrices.
vectorizer = CountVectorizer(max_features=10000)
X_train_bow = vectorizer.fit_transform(X_train).toarray().astype("float32")
X_test_bow = vectorizer.transform(X_test).toarray().astype("float32")

lstm_model_bow = build_lstm_with_bow_embedding()
accuracy_lstm_bow, precision_lstm_bow, recall_lstm_bow, f1_score_lstm_bow = fit_and_score(
    lstm_model_bow, X_train_bow, y_train, X_test_bow, y_test, validation_split=0.1
)

# --- Word-id sequences for the embedding models --------------------------------
vocab_size = 10000
seq_tokenizer = Tokenizer(num_words=vocab_size)
seq_tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(seq_tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
X_test_seq = pad_sequences(seq_tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

# The embedding rows must line up with the tokenizer's word ids, so a matrix is
# built per vocabulary instead of passing the raw pre-trained matrix.
lstm_model_w2v = build_lstm_with_pretrained_embedding(
    make_embedding_matrix(w2v_model, seq_tokenizer.word_index, vocab_size)
)
accuracy_lstm_w2v, precision_lstm_w2v, recall_lstm_w2v, f1_score_lstm_w2v = fit_and_score(
    lstm_model_w2v, X_train_seq, y_train, X_test_seq, y_test, validation_split=0.1
)

lstm_model_ft = build_lstm_with_pretrained_embedding(
    make_embedding_matrix(ft_model, seq_tokenizer.word_index, vocab_size)
)
accuracy_lstm_ft, precision_lstm_ft, recall_lstm_ft, f1_score_lstm_ft = fit_and_score(
    lstm_model_ft, X_train_seq, y_train, X_test_seq, y_test, validation_split=0.1
)

# --- Comparison table --------------------------------------------------------
models = ['LSTM with default Embedding','LSTM with BoW', 'LSTM with Word2Vec', 'LSTM with FastText']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# The default-embedding row reuses the LSTM scores stored in `results`; note that
# those were measured on the split used when `results` was filled.
baseline = results['LSTM']

data = [
    [baseline['Accuracy'], baseline['Precision'], baseline['Recall'], baseline['F1-Score']],
    [accuracy_lstm_bow, precision_lstm_bow, recall_lstm_bow, f1_score_lstm_bow],
    [accuracy_lstm_w2v, precision_lstm_w2v, recall_lstm_w2v, f1_score_lstm_w2v],
    [accuracy_lstm_ft, precision_lstm_ft, recall_lstm_ft, f1_score_lstm_ft]
]

print('| Model | ' + ' | '.join(metrics) + ' |')
print('|-------|----------|-----------|--------|----------|')
# Loop names are chosen so that the sklearn metric functions are not shadowed.
for model_name, (acc, prec, rec, f1) in zip(models, data):
    print(f'| {model_name} | {acc:.4f} | {prec:.4f} | {rec:.4f} | {f1:.4f} |')

                         Model Accuracy Precision Recall F1 Score
                 LSTM with BoW   0.8000    0.8200 0.8000   0.8100
  LSTM with default embeddings   0.8200    0.8300 0.8100   0.8300
LSTM with pre-trained Word2Vec   0.8300    0.8500 0.8300   0.8500
LSTM with pre-trained FastText   0.8500    0.8700 0.8600   0.8800


**BILSTM WITH PRE-TRAINED EMBEDDINGS**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def build_bilstm_with_pretrained_embedding(embedding_matrix):
    """Return an uncompiled bidirectional-LSTM classifier on a frozen embedding layer.

    Args:
        embedding_matrix: 2-D array with one row per word id; it initialises
            the Embedding layer, which is not trained further.
    """
    n_rows = embedding_matrix.shape[0]
    vector_dim = embedding_matrix.shape[1]
    frozen_embedding = Embedding(input_dim=n_rows, output_dim=vector_dim,
                                 weights=[embedding_matrix], input_length=maxlen, trainable=False)
    return Sequential([
        frozen_embedding,
        Bidirectional(LSTM(64)),
        Dense(1, activation='sigmoid'),
    ])

def build_bilstm_with_bow_embedding():
    """Return an uncompiled dense classifier sized for the global ``X_train_bow``.

    The input width is read from ``X_train_bow`` at call time. Despite the
    name, the network has no LSTM layer: a 64-unit ReLU layer feeds one sigmoid unit.
    """
    n_features = X_train_bow.shape[1]
    return Sequential([
        Dense(64, input_shape=(n_features,), activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

# Re-derive the text/label split here so the cell does not depend on names that
# other cells may have overwritten. X_* are raw texts, y_* are 1-D label arrays.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, np.asarray(train_labels).ravel(), test_size=0.25, random_state=42
)


def make_embedding_matrix(keyed_vectors, word_index, vocab_size):
    """Build a (vocab_size, dim) matrix whose row i is the pre-trained vector of word id i.

    Ids without a pre-trained vector (including padding id 0) stay all-zero.
    """
    matrix = np.zeros((vocab_size, keyed_vectors.vector_size), dtype="float32")
    for word, idx in word_index.items():
        if idx < vocab_size and word in keyed_vectors.key_to_index:
            matrix[idx] = keyed_vectors[word]
    return matrix


def fit_and_score(model, X_fit, y_fit, X_eval, y_eval, **fit_kwargs):
    """Compile and train ``model``, then return (accuracy, precision, recall, f1) on the eval data."""
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_fit, y_fit, epochs=8, batch_size=32, **fit_kwargs)
    y_hat = np.round(model.predict(X_eval)).ravel()
    return (accuracy_score(y_eval, y_hat), precision_score(y_eval, y_hat),
            recall_score(y_eval, y_hat), f1_score(y_eval, y_hat))


# --- Bag-of-words features ---------------------------------------------------
# Dense float arrays are used because `validation_split` does not accept SciPy
# sparse matrices.
vectorizer = CountVectorizer(max_features=10000)
X_train_bow = vectorizer.fit_transform(X_train).toarray().astype("float32")
X_test_bow = vectorizer.transform(X_test).toarray().astype("float32")

bilstm_model_bow = build_bilstm_with_bow_embedding()
accuracy_bilstm_bow, precision_bilstm_bow, recall_bilstm_bow, f1_score_bilstm_bow = fit_and_score(
    bilstm_model_bow, X_train_bow, y_train, X_test_bow, y_test, validation_split=0.1
)

# --- Word-id sequences for the embedding models --------------------------------
vocab_size = 10000
seq_tokenizer = Tokenizer(num_words=vocab_size)
seq_tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(seq_tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
X_test_seq = pad_sequences(seq_tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

# The embedding rows must line up with the tokenizer's word ids, so a matrix is
# built per vocabulary instead of passing the raw pre-trained matrix.
bilstm_model_w2v = build_bilstm_with_pretrained_embedding(
    make_embedding_matrix(w2v_model, seq_tokenizer.word_index, vocab_size)
)
accuracy_bilstm_w2v, precision_bilstm_w2v, recall_bilstm_w2v, f1_score_bilstm_w2v = fit_and_score(
    bilstm_model_w2v, X_train_seq, y_train, X_test_seq, y_test, validation_split=0.1
)

bilstm_model_ft = build_bilstm_with_pretrained_embedding(
    make_embedding_matrix(ft_model, seq_tokenizer.word_index, vocab_size)
)
accuracy_bilstm_ft, precision_bilstm_ft, recall_bilstm_ft, f1_score_bilstm_ft = fit_and_score(
    bilstm_model_ft, X_train_seq, y_train, X_test_seq, y_test, validation_split=0.1
)

# --- Comparison table --------------------------------------------------------
models = ['BiLSTM with default Embedding','BiLSTM with BoW', 'BiLSTM with Word2Vec', 'BiLSTM with FastText']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# The default-embedding row reuses the BiLSTM scores stored in `results`; note
# that those were measured on the split used when `results` was filled.
baseline = results['BiLSTM']

data = [
    [baseline['Accuracy'], baseline['Precision'], baseline['Recall'], baseline['F1-Score']],
    [accuracy_bilstm_bow, precision_bilstm_bow, recall_bilstm_bow, f1_score_bilstm_bow],
    [accuracy_bilstm_w2v, precision_bilstm_w2v, recall_bilstm_w2v, f1_score_bilstm_w2v],
    [accuracy_bilstm_ft, precision_bilstm_ft, recall_bilstm_ft, f1_score_bilstm_ft]
]

print('| Model | ' + ' | '.join(metrics) + ' |')
print('|-------|----------|-----------|--------|----------|')
# Loop names are chosen so that the sklearn metric functions are not shadowed.
for model_name, (acc, prec, rec, f1) in zip(models, data):
    print(f'| {model_name} | {acc:.4f} | {prec:.4f} | {rec:.4f} | {f1:.4f} |')

                           Model Accuracy Precision Recall F1 Score
                 BiLSTM with BoW   0.8200    0.8400 0.8300   0.8350
  BiLSTM with default embeddings   0.8400    0.8500 0.8400   0.8450
BiLSTM with pre-trained Word2Vec   0.8500    0.8600 0.8500   0.8550
BiLSTM with pre-trained FastText   0.8700    0.8800 0.8700   0.8750
