# Importing the libraries

In [None]:
import pandas as pd
import os
import numpy as np

import spacy
from spacy.tokens import Doc

# Load spaCy's English pipeline; it is used below to tokenize every review.
# NOTE(review): the 'en' shortcut name was removed in spaCy 3 - newer installs
# need a full package name such as 'en_core_web_sm'. Confirm the targeted version.
nlp = spacy.load('en')


import tensorflow.keras as keras
from keras.models import Model
from sklearn.metrics import accuracy_score

# Fixed number of IDs per review fed to the models; shorter inputs are
# left-padded with 0 further down in this notebook.
MAX_INPUT_SIZE = 200

!pip install bpemb
from bpemb import BPEmb

# Downloading the data

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
!tar xvzf aclImdb_v1.tar.gz

In [None]:
# Accumulators for the labelled reviews; every entry appended below has the
# form [spaCy Doc, one-hot label].
train_data = []
test_data = []

# Getting the data

The data should be extracted from the folders and concatenated into train and test sets. Additionally, in this part, each input text is tokenized and assigned a one-hot encoded label.

In [None]:
directory = 'aclImdb/train/neg/'

# Negative training reviews: tokenize each file and tag it with the label [0, 1].
for filename in os.listdir(directory):
    # os.path.join avoids the doubled slash of directory + '/' + filename; the
    # context manager closes the file even if reading fails, and the explicit
    # encoding keeps decoding independent of the platform locale.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        lines = f.read()
    doc = nlp(lines)
    train_data.append([doc, [0,1]])

In [None]:
directory = 'aclImdb/train/pos/'

# Positive training reviews: tokenize each file and tag it with the label [1, 0].
for filename in os.listdir(directory):
    # os.path.join avoids the doubled slash of directory + '/' + filename; the
    # context manager closes the file even if reading fails, and the explicit
    # encoding keeps decoding independent of the platform locale.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        lines = f.read()
    doc = nlp(lines)
    train_data.append([doc, [1,0]])

In [None]:
directory = 'aclImdb/test/neg/'

# Negative test reviews: tokenize each file and tag it with the label [0, 1].
for filename in os.listdir(directory):
    # os.path.join avoids the doubled slash of directory + '/' + filename; the
    # context manager closes the file even if reading fails, and the explicit
    # encoding keeps decoding independent of the platform locale.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        lines = f.read()
    doc = nlp(lines)
    test_data.append([doc, [0,1]])


In [None]:
directory = 'aclImdb/test/pos/'

# Positive test reviews: tokenize each file and tag it with the label [1, 0].
for filename in os.listdir(directory):
    # os.path.join avoids the doubled slash of directory + '/' + filename; the
    # context manager closes the file even if reading fails, and the explicit
    # encoding keeps decoding independent of the platform locale.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        lines = f.read()
    doc = nlp(lines)
    test_data.append([doc,[1,0]])

# Create a mapping from words to IDs

Each token should have a specific ID that represents it, so we can transform the sentences into ID vectors for the model.

In [None]:
# Keep handles on the original Python lists; train_data / test_data are rebound
# to NumPy arrays later on. These are aliases, not copies - an in-place change
# to either list would be visible through both names.
reserve_test = test_data
reserve_train = train_data

In [None]:
# Both splits together, so the vocabulary built from this list covers every
# token of the train and test reviews.
all_data = train_data + test_data

In [None]:
def assign_ids(doc_list):
    """Build the vocabulary: lower-cased token text -> integer ID.

    Args:
        doc_list: iterable of rows whose first item is an iterable of
            tokens exposing a ``.text`` string.

    Returns:
        dict mapping every distinct lower-cased token text to its ID. IDs
        start at 1 and follow the order in which the texts first appear.
    """
    vocabulary = {}
    for row in doc_list:
        for token in row[0]:
            word = token.text.lower()
            if word not in vocabulary:
                # The next free ID is always one past the current size.
                vocabulary[word] = len(vocabulary) + 1
    return vocabulary

In [None]:
# Vocabulary lookup over both splits: lower-cased token text -> ID (IDs start at 1).
mapping = assign_ids(all_data)

In [None]:
# Number of distinct tokens. IDs run from 1 to NUM_WORDS and 0 is the padding
# value, so an embedding table indexed by these IDs needs NUM_WORDS + 1 rows.
NUM_WORDS = len(mapping)

# Transform sentences to ID lists

In [None]:
def string_to_model_input(sentence):
  """Turn a tokenized review into a fixed-length list of vocabulary IDs.

  Each token is looked up in ``mapping`` by its lower-cased text. The result
  always has MAX_INPUT_SIZE entries: shorter reviews are left-padded with 0,
  longer ones keep only their last MAX_INPUT_SIZE tokens.
  """
  ids = [mapping[token.text.lower()] for token in sentence]

  # Prepend a full block of zeros, then drop as many leading items as there
  # are tokens; exactly MAX_INPUT_SIZE items remain in every case.
  padded = [0] * MAX_INPUT_SIZE + ids
  return padded[len(ids):]

In [None]:
def _as_object_table(rows):
  """Pack [doc, label] rows into an (n, 2) object array.

  np.array(rows) has to guess a shape from ragged nested sequences, which was
  deprecated and raises ValueError on NumPy >= 1.24. Filling a pre-allocated
  object array cell by cell stores each Doc and each label list untouched.
  """
  table = np.empty((len(rows), 2), dtype=object)
  for i, (doc, label) in enumerate(rows):
    table[i, 0] = doc
    table[i, 1] = label
  return table

train_data = _as_object_table(train_data)
test_data = _as_object_table(test_data)

  """Entry point for launching an IPython kernel.
  


In [None]:
# The last column holds the one-hot labels; each result is a 1-D array with
# one label entry per review.
Y_train = train_data[:, -1]
Y_test = test_data[:, -1]

In [None]:
# Encode every review (first column) as a fixed-length row of vocabulary IDs.
X_train = np.array([string_to_model_input(row[0]) for row in train_data])
X_test = np.array([string_to_model_input(row[0]) for row in test_data])

In [None]:
# Sanity check: inputs first, then labels, train before test.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

(25000, 200)
(25000, 200)
(25000,)
(25000,)


# Model 1

Constructing and training the model.

In [None]:
def build_simple_model():
  """Create and compile the baseline classifier.

  Architecture: a trainable 100-dimensional embedding with NUM_WORDS + 1 rows
  (one extra row for the padding ID 0), global max pooling over the sequence,
  and a 2-unit softmax output. Compiled with categorical cross-entropy, the
  Adam optimizer and the accuracy metric.

  Returns:
      The compiled, untrained Keras model.
  """
  embedding = keras.layers.Embedding(NUM_WORDS+1, 100, input_length=MAX_INPUT_SIZE)
  pooling = keras.layers.GlobalMaxPooling1D()
  classifier = keras.layers.Dense(2, activation="softmax")

  model = keras.Sequential([embedding, pooling, classifier])
  model.compile(
      loss=keras.losses.categorical_crossentropy,
      optimizer='Adam',
      metrics=['accuracy'],
  )

  return model

In [None]:
# Fresh, untrained baseline (Model 1).
model = build_simple_model()

In [None]:
# Plain nested-list versions of the training inputs and labels; these are the
# objects handed to model.fit in the cells below.
X = X_train.tolist()
Y = Y_train.tolist()
Y_train.shape

(25000,)

In [None]:
# Train the current model on the word-ID inputs for 10 epochs.
model.fit(X, Y, epochs=10)

In [None]:
# List versions of the test split. Y_t is the reference passed to
# accuracy_score; X_t is not used by the prediction cells shown here, which
# pass X_test directly.
X_t = X_test.tolist()
Y_t = Y_test.tolist()

In [None]:
# Predicted class probabilities, rounded to 0/1 so each row can be compared
# with the one-hot labels.
# NOTE(review): a row with both probabilities at 0.5 rounds to [0, 0]; taking
# the argmax would avoid that - confirm whether it matters here.
Y_pred = model.predict([X_test])
Y_pred = np.round(Y_pred)

In [None]:
# Score the rounded predictions against the one-hot test labels.
accuracy_score(Y_t, Y_pred)

0.8612

# Model 2 (fastText)

## get the pretrained word embeddings

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip

In [None]:
!unzip wiki-news-300d-1M.vec.zip

In [None]:
from gensim.models import KeyedVectors
# Text-format vector file extracted from the archive downloaded above.
FASTTEXTFILE = "wiki-news-300d-1M.vec"
# Parse the file into a gensim KeyedVectors lookup (word -> vector).
ft_model = KeyedVectors.load_word2vec_format(FASTTEXTFILE)


In [None]:
# Row r of the matrix must hold the vector of the token whose ID is r, because
# the Embedding layer looks vectors up by the IDs that string_to_model_input
# produces. IDs run from 1 to NUM_WORDS and 0 is the padding ID, hence
# NUM_WORDS + 1 rows (row 0 stays all-zero).
# Only the first 100 components of each fastText vector are kept, the same
# width as the embedding of Model 1.
embedding_matrix = np.zeros((NUM_WORDS + 1, 100))

hits = 0
misses = 0
for token, token_id in mapping.items():
  try:
    embedding_vector = ft_model.get_vector(str(token))
  except KeyError:
    # Token unknown to fastText: its row stays all-zero. Catching KeyError
    # (rather than matching the message text) keeps the count correct across
    # gensim versions and lets unrelated errors surface.
    misses += 1
    continue
  # Index by the token's own ID; indexing by loop position would shift every
  # vector one row away from the ID the model actually looks up.
  embedding_matrix[token_id] = embedding_vector[:100]
  hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

## construct the model with initialized weights

def build_ft_model():
  """Create and compile the classifier with frozen pretrained embeddings.

  The embedding layer is initialised from the module-level
  ``embedding_matrix`` and takes its size from that matrix, so layer and
  weights cannot disagree. It is followed by global max pooling and a 2-unit
  softmax output; compiled with categorical cross-entropy, Adam and accuracy.

  Returns:
      The compiled, untrained Keras model.
  """
  vocab_rows, embedding_dim = embedding_matrix.shape

  model = keras.Sequential()
  model.add(keras.layers.Embedding(vocab_rows, embedding_dim, input_length=MAX_INPUT_SIZE, weights=[embedding_matrix], trainable=False))
  model.add(keras.layers.GlobalMaxPooling1D())
  model.add(keras.layers.Dense(2, activation="softmax"))

  model.compile(loss=keras.losses.categorical_crossentropy, optimizer='Adam', metrics=['accuracy'])

  return model

In [None]:
# Replace the baseline with the fastText-initialised model (Model 2).
model = build_ft_model()

In [None]:
# Same word-ID inputs and labels as Model 1, again for 10 epochs.
model.fit(X, Y, epochs = 10)

In [None]:
# Predicted class probabilities, rounded to 0/1 so each row can be compared
# with the one-hot labels.
Y_pred = model.predict([X_test])
Y_pred = np.round(Y_pred)

In [None]:
# Score the rounded predictions against the one-hot test labels.
accuracy_score(Y_t, Y_pred)

0.5584

# Model 3 (BPEmb)

In [None]:
# Pretrained English byte-pair (subword) embeddings, 100 dimensions per vector.
bpemb_en = BPEmb(lang="en", dim=100)

In [None]:
train_data[0][0].text

In [None]:
# Quick look at what BPEmb produces for the first review of each split:
# subword IDs, and the subword pieces themselves.
encode_ids_train = bpemb_en.encode_ids(train_data[0][0].text)
encodes = bpemb_en.encode(train_data[0][0].text)
# This used to re-encode the first *training* review; use the test split, as
# the variable name says.
encode_ids_test = bpemb_en.encode_ids(test_data[0][0].text)


In [None]:
# Subword IDs of every training review, cut to the first MAX_INPUT_SIZE pieces.
X_bpm_train = [
  bpemb_en.encode_ids(row[0].text)[:MAX_INPUT_SIZE] for row in train_data
]

In [None]:
len(X_bpm_train)

In [None]:
# Subword IDs of every test review, cut to the first MAX_INPUT_SIZE pieces.
X_bpm_test = [
  bpemb_en.encode_ids(row[0].text)[:MAX_INPUT_SIZE] for row in test_data
]

In [None]:
# ID sequences of both splits in one list (train first, then test).
all_bpm =  X_bpm_train + X_bpm_test


In [None]:
# Flatten the per-review ID lists into one long list (duplicates included).
all_ids = [piece_id for row in all_bpm for piece_id in row]

In [None]:
len(all_ids)

In [None]:
# Drop duplicates while keeping first-seen order.
all_ids = list(dict.fromkeys(all_ids))

In [None]:
# The model is fed raw BPEmb subword IDs (X_bpm_train / X_bpm_test), so the
# embedding table must be indexed by those same IDs: row r has to be BPEmb's
# vector for ID r. Filling rows in the enumeration order of all_ids, in a
# matrix sized by the spaCy vocabulary, paired each ID with an unrelated vector.
# NOTE(review): 0 is also the padding value used when the sequences are
# resized, so padded positions reuse BPEmb's row 0 - confirm this is acceptable.
embedding_matrix = np.array(bpemb_en.vectors, dtype=float)

# Report how many of the subword IDs seen in the corpus have a row in the table.
hits = 0
misses = 0
for subword_id in all_ids:
  if 0 <= subword_id < embedding_matrix.shape[0]:
    hits += 1
  else:
    misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

def build_bpm_model():
  """Create and compile the classifier with frozen BPEmb embeddings.

  The embedding layer is initialised from the module-level
  ``embedding_matrix`` and takes its size from that matrix, so layer and
  weights cannot disagree. It is followed by global max pooling and a 2-unit
  softmax output; compiled with categorical cross-entropy, Adam and accuracy.

  Returns:
      The compiled, untrained Keras model.
  """
  vocab_rows, embedding_dim = embedding_matrix.shape

  model = keras.Sequential()
  model.add(keras.layers.Embedding(vocab_rows, embedding_dim, input_length=MAX_INPUT_SIZE, weights=[embedding_matrix], trainable=False))
  model.add(keras.layers.GlobalMaxPooling1D())
  model.add(keras.layers.Dense(2, activation="softmax"))

  model.compile(loss=keras.losses.categorical_crossentropy, optimizer='Adam', metrics=['accuracy'])

  return model

In [None]:
# Replace the previous model with the BPEmb-initialised one (Model 3).
model = build_bpm_model()

In [None]:
# Left-pad every training sequence with 0 up to MAX_INPUT_SIZE entries:
# prepend a full block of zeros, then drop as many leading items as the
# sequence has IDs.
X_bpm_train_resized = [
  ([0] * MAX_INPUT_SIZE + ids)[len(ids):] for ids in X_bpm_train
]
 

In [None]:
# Left-pad every test sequence with 0 up to MAX_INPUT_SIZE entries:
# prepend a full block of zeros, then drop as many leading items as the
# sequence has IDs.
X_bpm_test_resized = [
  ([0] * MAX_INPUT_SIZE + ids)[len(ids):] for ids in X_bpm_test
]

In [None]:
len(X_bpm_test_resized)

25000

In [None]:
# Train on the padded subword-ID inputs for 50 epochs; Y is the training-label
# list created for Model 1.
model.fit(X_bpm_train_resized, Y, epochs = 50)

In [None]:
# Class probabilities for the padded subword-ID test inputs.
Y_pred = model.predict(X_bpm_test_resized)

In [None]:
# Round the probabilities to 0/1 so each row can be compared with the one-hot labels.
Y_pred = np.round(Y_pred)

In [None]:
# Score the rounded predictions against the one-hot test labels.
accuracy_score(Y_t, Y_pred)

0.6124