# Data Preprocessing
The training dataset used in this project is the sentence polarity dataset v1.0 at [Cornell Dataset](https://www.cs.cornell.edu/people/pabo/movie-review-data/), which contains 5331 positive and negative reviews. 
This section is to prepare training set (0.8) and test set (0.2) for LSTM model training

We use the nltk library for dataset cleaning, and vectorize the words.

In [182]:
import collections
import sys
import numpy as np
import nltk
from nltk.corpus import stopwords

Download the necessary library: 'stopwords'

In [183]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## 1.Read the file and Data cleaning
It is to read the file and convert 'gb18030' encoding to Unicode encoding.

In [184]:
def decode_file(infile, outfile):
    with open(infile, 'r',encoding='gb18030', errors='ignore') as f:
        txt = f.read()
    with open(outfile, 'w') as f:
        f.write(txt)

Remove punctuation through python regular expression

In [185]:
import re
special_char = re.compile("[^A-Za-z0-9 ]+")
def clean_sentence(line):
    line = line.lower().replace("<br />"," ")
    return re.sub(special_char, '', line)

Remove stopwords

In [186]:
#Using the NLTK to clean the stopwords
def clean_stopwords(words):
    stopword = {}.fromkeys([line.rstrip() for line in stopwords.words()])
    word_nostop = [w for w in words if w not in stopword]
    return word_nostop

Test the 'clean_stopwords' function

In [187]:
w = ['i','love','us',',','?']
w = clean_stopwords(w)
w = [clean_sentence(i) for i in w ]
w

['love', 'us', '', '']

In [188]:
decode_file('data/rt-polarity.pos', 'data/pos.txt')
decode_file('data/rt-polarity.neg', 'data/neg.txt')

## 2. Generate the wordlist

Read the dataset, extract all the words, and count the number of times each word appears. In order to avoid the interference of low-frequency words and reduce the model parameters, we only keep the first 9999 words and add the low-frequency words as 'None' to the wordlist.

In [189]:
#generate the wordlist and save to a file
word_list = []

with open('data/neg.txt', 'r') as f:
    f_lines = f.readlines()
    for line in f_lines:
        words = line.strip().split()
        words = [clean_sentence(i) for i in words ]
        word_list.extend(words)
num_pos = len(word_list)
print("Positive file finished")
print('the positive file has %d words'%num_pos)
with open('data/pos.txt', 'r') as f:
    f_lines = f.readlines()
    for line in f_lines:
        words = line.strip().split()
        words = [clean_sentence(i) for i in words ]
        word_list.extend(words)
num_neg = len(word_list)-num_pos
print("Negative file finished")
print('the negative file has %d words'%num_neg)

word_list = clean_stopwords(word_list)
counter = collections.Counter(word_list)

sorted_words = sorted(counter.items(), key=lambda x: x[1], reverse=True)
# the most frequency 10000 words
word_list = [word[0] for word in sorted_words]

word_list = ['<None>'] + word_list[:10000 - 1]
# save to the file
with open('data/vocab.txt', 'w') as f:
    for word in word_list:
        f.write(word + '\n')

Positive file finished
the positive file has 111596 words
Negative file finished
the negative file has 112407 words


## 3. Use wordlist to convert movie reviews into word vectors

Get the word index in the wordlist

In [190]:
def get_id_by_word(word, word2id):
    if word in word2id:
        return word2id[word]
    else:
        return word2id['<None>']

In [191]:
with open('data/vocab.txt', 'r') as f:
    vocab_list = f.read().strip().split('\n')
word2id = dict(zip(vocab_list, range(len(vocab_list))))

Convert the sentence to a vec. Because the length of different movie reviews is different, we choose the longest movie review and fill other film reviews with '0' value. Then convert it into a numpy array.

In [192]:
# convert the sentence to a vec
vec = []
with open('data/pos.txt', 'r') as f:
    f_lines = f.readlines()
    for line in f_lines:
        tmp_vec = [str(get_id_by_word(word, word2id)) for word in line.strip().split()]
        vec.append(tmp_vec)

with open('data/pos.vec', 'w') as f:
    for tmp_vec in vec:
        f.write(' '.join(tmp_vec) + '\n')
        
vec = []
with open('data/neg.txt', 'r') as f:
    f_lines = f.readlines()
    for line in f_lines:
        tmp_vec = [str(get_id_by_word(word, word2id)) for word in line.strip().split()]
        vec.append(tmp_vec)

with open('data/neg.vec', 'w') as f:
    for tmp_vec in vec:
        f.write(' '.join(tmp_vec) + '\n')

## 4. Generate training set and test set

In order to achieve better training results, we randomly shuffle the dataset

In [193]:
def shuffle_data(x, y, path):
    
    maxlen = max(map(len, x))
    data = np.zeros([len(x), maxlen], dtype=np.int32)
    for row in range(len(x)):
        data[row, :len(x[row])] = x[row]
    label = np.array(y)
    # shuffle the data
    state = np.random.get_state()
    np.random.shuffle(data)
    np.random.set_state(state)
    np.random.shuffle(label)
    # save the X and y 
    np.save(path + '_data', data)
    np.save(path + '_labels', label)

Divide the dataset into train set(80%) and test set(20%).

In [194]:
data = [[], []]
labels = [[], []]
rate = np.array([0.8, 0.2])
cumsum_rate = np.cumsum(rate)
with open('data/pos.vec', 'r') as f:
    f_lines = f.readlines()
    for line in f_lines:
        tmp_data = [int(word) for word in line.strip().split()]
        tmp_label = [1, ]
        index = int(np.searchsorted(cumsum_rate, np.random.rand(1) * 1.0))
        data[index].append(tmp_data)
        labels[index].append(tmp_label)
with open('data/neg.vec', 'r') as f:
    f_lines = f.readlines()
    for line in f_lines:
        tmp_data = [int(word) for word in line.strip().split()]
        tmp_label = [0, ]
        index = int(np.searchsorted(cumsum_rate, np.random.rand(1) * 1.0))
        data[index].append(tmp_data)
        labels[index].append(tmp_label)

shuffle_data(data[0], labels[0], 'data/train')
shuffle_data(data[1], labels[1], 'data/test')