In [28]:
import csv
import os
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was later removed; use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [29]:
# Location of the IMDB CSV. Set the IMDB_CSV_PATH environment variable to point the
# notebook at a different copy; the original local path is kept as the default.
path = os.environ.get('IMDB_CSV_PATH', '/Users/babyhandzzz/Downloads/imdb_master.csv')

In [40]:
# Read only the two columns used below.
dataframe = pd.read_csv(path, encoding='latin1', usecols=['review', 'label'])

# Drop the unlabelled rows. `.copy()` gives an independent frame, so the column
# assignment below is not a write into a slice of the frame returned by read_csv.
dataframe = dataframe.loc[dataframe.label != 'unsup'].copy()

# Encode the sentiment label as 0/1 with an explicit column assignment instead of a
# chained in-place `replace`, which is a no-op under pandas copy-on-write.
# NOTE(review): assumes only 'neg' and 'pos' remain after the filter above; any other
# label value would become NaN here - confirm against the dataset.
dataframe['label'] = dataframe['label'].map({'neg': 0, 'pos': 1})

# Independent single-column frames so later cells can assign into them safely.
X = dataframe[['review']].copy()
y = dataframe[['label']].copy()

# Pre-Processing
___
## Removing Punctuation

In [41]:
# Built once: deletes every character listed in string.punctuation.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Characters are deleted, not replaced by a space, so tokens separated only by
    punctuation are joined together (e.g. ``"a,b"`` becomes ``"ab"``).

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without any character from ``string.punctuation``.
    """
    return text.translate(_PUNCT_TABLE)
# Strip punctuation from every review. `assign` returns a new frame, which avoids
# writing into a frame that was selected from `dataframe`; the function is passed
# directly because the wrapping lambda added nothing.
X = X.assign(review=X['review'].map(remove_punct))

## Removing Stopwords

In [43]:
# Punctuation has already been stripped from the reviews at this point, so
# apostrophised entries of the NLTK list (e.g. "don't") appear in the text as "dont"
# and would never match. Keep the original entries and add punctuation-free variants.
_stop_punct_table = str.maketrans("", "", string.punctuation)
stop = set(stopwords.words('english'))
stop |= {word.translate(_stop_punct_table) for word in stop}

def remove_stopwords(text, stop_words=None):
    """Lower-case ``text`` and drop every whitespace-separated token that is a stop word.

    Parameters
    ----------
    text : str
        The string to filter.
    stop_words : collection of str, optional
        Lower-case tokens to remove. When omitted, the module-level ``stop`` set is used.

    Returns
    -------
    str
        The remaining lower-cased tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    # Lower-case each token once, then filter on the lowered form.
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in stop_words)

# Lower-case the reviews and drop stop words. `assign` returns a new frame, which
# avoids writing into a frame that was selected from `dataframe`.
X = X.assign(review=X['review'].map(remove_stopwords))

# Word Frequency

In [45]:
def word_counter(text):
    """Count how often each whitespace-separated token occurs across all entries.

    Parameters
    ----------
    text : object exposing ``.values``
        Iterated via ``text.values``; each element must be a string.

    Returns
    -------
    collections.Counter
        Mapping of token to number of occurrences.
    """
    tally = Counter()
    for document in text.values:
        tally.update(document.split())
    return tally

## Max Length is declared here

In [47]:
# Fixed sequence length applied when the encoded reviews are padded/truncated.
max_length = 850

# Token frequencies over all cleaned reviews; the number of distinct tokens is
# later passed to the Tokenizer as its vocabulary cap.
counter = word_counter(X['review'])
num_words = len(counter)

## Train/Test Split

In [94]:
# 70/30 split with a fixed seed; each part gets a fresh 0..n-1 index so that
# positions line up with the encoded sequences built later.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.3, random_state=16)
)

## Tokenization

In [95]:
# Fit the word -> integer index on the training reviews only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train['review'])
word_index = tokenizer.word_index

## Converting text to sequence of indices
## Padding the sequences

In [96]:
# Shared padding settings: pad and truncate at the end of each sequence.
_pad_kwargs = dict(maxlen=max_length, padding='post', truncating='post')

# Encode each review as a list of word indices using the fitted tokenizer.
train_sequence = tokenizer.texts_to_sequences(X_train['review'])
test_sequence = tokenizer.texts_to_sequences(X_test['review'])

# Bring every sequence to exactly `max_length` entries.
train_padded = pad_sequences(train_sequence, **_pad_kwargs)
test_padded = pad_sequences(test_sequence, **_pad_kwargs)

In [106]:
# Inverse lookup (index -> word) used to check that decoding reverses the encoding.
reverse_word_index = {index: word for word, index in word_index.items()}


def decode(text):
    """Translate an iterable of word indices back into a space-separated string.

    Indices that are not present in ``reverse_word_index`` are rendered as ``"?"``.
    """
    words = (reverse_word_index.get(index, "?") for index in text)
    return " ".join(words)