In [15]:
# Data source: Large Movie Review Dataset (IMDB sentiment) - https://ai.stanford.edu/~amaas/data/sentiment/

In [16]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import pickle
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [18]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import text_to_word_sequence, one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# Enable on-demand GPU memory allocation, but only when a GPU is present.
# On a CPU-only machine the device list is empty, so indexing it with [0]
# would raise IndexError; iterating over the list is a no-op instead.
physical_devices = tf.config.list_physical_devices("GPU")
for gpu_device in physical_devices:
    # Applied to every visible GPU: TensorFlow's GPU guide notes that the
    # memory-growth setting needs to be the same across GPUs.
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [20]:
def remove_html(text):
    '''
    Remove HTML content from a string, such as tags (e.g. <title></title>).

    parameters : text - string that may contain HTML markup
    returns : text_without_html - string holding only the text content
    '''
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the output can
    # differ between machines. "html.parser" ships with Python.
    soup = BeautifulSoup(text, "html.parser")
    text_without_html = soup.get_text()
    return text_without_html

def remove_punctuation(text):
    '''
    Remove punctuation characters (e.g. '.', '!') from a string.

    A character is dropped when it appears in string.punctuation (ASCII
    punctuation); every other character, including whitespace, is kept in
    its original position.

    parameters : text - string
    returns : text_without_punctuation - string
    '''
    # Join the surviving characters with the empty string so the remaining
    # text is unchanged apart from the removed punctuation.
    text_without_punctuation = "".join(
        char for char in text if char not in string.punctuation
    )
    return text_without_punctuation

def remove_stop_words(token):
    '''
    Remove English stop words (e.g. 'a', 'the', 'are') from a list of words.

    parameters : token - list of words
    returns : token_without_stop_words - list of the words that are not in
              NLTK's English stop-word list, in their original order
    '''
    # A set gives constant-time membership checks; the original list was
    # scanned once per word.
    stop_words = set(stopwords.words('english'))
    token_without_stop_words = [word for word in token if word not in stop_words]
    return token_without_stop_words

def stemmed_words(tokens):
    '''
    Stem every word with NLTK's Porter stemmer (e.g. playing -> play).

    parameters : tokens - list of words
    returns : list of stemmed words, in the same order as the input
    '''
    stemmer = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

In [21]:
def clean_data(x):
    '''
    Clean an iterable of raw review strings.

    For each review: replace the '<br /><br />' markup with a space, split
    the text into words with Keras' text_to_word_sequence, drop English stop
    words, and join the remaining words back into one space-separated string.

    parameters : x - iterable of strings
    returns : texts - list of cleaned strings, one per input, in input order
    '''
    # Build the stop-word set once for the whole dataset instead of
    # reloading the list for every review.
    stop_words = set(stopwords.words('english'))

    texts = []
    for data in x:
        text = data.replace('<br /><br />', ' ')
        tokens = [word for word in text_to_word_sequence(text) if word not in stop_words]
        texts.append(" ".join(tokens))
    return texts

In [22]:
def make_tokenizer(x_train):
    '''
    Create a Keras Tokenizer and fit it on the given texts.

    parameters : x_train - texts passed to Tokenizer.fit_on_texts
    returns : the fitted Tokenizer
    '''
    fitted = Tokenizer()
    fitted.fit_on_texts(x_train)
    return fitted

In [23]:
def encode_texts(dataset, tokenizer):
    '''
    Encode texts with the tokenizer's texts_to_sequences method.

    parameters : dataset - texts to encode
                 tokenizer - object providing texts_to_sequences
    returns : whatever tokenizer.texts_to_sequences(dataset) returns
    '''
    return tokenizer.texts_to_sequences(dataset)

def encode_labels(labels):
    '''
    Encode labels with a scikit-learn LabelEncoder fitted on those same labels.

    parameters : labels - label values to encode
    returns : the result of the fitted encoder's transform(labels)
    '''
    encoder = LabelEncoder().fit(labels)
    encoded = encoder.transform(labels)
    return encoded

In [24]:
# Load only the first 20,000 reviews. nrows stops the parser at that row, so
# the rest of the file is never read just to be sliced away afterwards.
df = pd.read_csv('./data/IMDB Dataset.csv', nrows=20000)
# Replace the sentiment labels with their integer codes.
df['sentiment'] = encode_labels(df['sentiment'])

In [25]:
# Hold out 10% of the reviews for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.1, random_state=42, shuffle=True)

# Apply the same cleaning to both splits. Encoding raw test reviews with a
# tokenizer fitted on cleaned training text would give the two splits
# different preprocessing.
x_train = clean_data(x_train)
x_test = clean_data(x_test)

# The vocabulary is learned from the training split only.
tokenizer = make_tokenizer(x_train)
num_words = len(tokenizer.word_index) + 1

# Pad the training sequences to the longest one, then force the test
# sequences to that same length.
x_train = pad_sequences(encode_texts(x_train, tokenizer), padding='post')
max_length = x_train.shape[1]
x_test = pad_sequences(encode_texts(x_test, tokenizer), maxlen=max_length, padding='post')

y_train = np.array(y_train)
y_test = np.array(y_test)

In [26]:
# Convert all four splits to NumPy arrays in one pass.
x_train, x_test, y_train, y_test = (
    np.array(split) for split in (x_train, x_test, y_train, y_test)
)

In [27]:
# Save the four arrays to one .npz archive under the keys name1..name4:
# name1 = x_train, name2 = y_train, name3 = x_test, name4 = y_test.
np.savez(
    './data/encoded_dataset.npz',
    name1=x_train,
    name2=y_train,
    name3=x_test,
    name4=y_test,
)

In [28]:
# Save the tokenizer's word -> index mapping. The with-block closes the file
# even if pickle.dump raises.
with open("./data/vocab.pkl", "wb") as vocab_file:
    pickle.dump(tokenizer.word_index, vocab_file)