In this file, we are going to load the data and get the tokenized data.

First we download the .tar.gz archive from http://ai.stanford.edu/~amaas/data/sentiment/

Then we decompress the archive and get several folders of txt files, with each file containing one comment from IMDB.

In [1]:
# load the data from several folders of multiple text files
import os
def load_data(data_path):
    """Read every file in a folder and return the contents as a list of strings.

    Args:
        data_path: Path of a folder that directly contains the text files.

    Returns:
        A list with one string per file, ordered by file name.
    """
    domain = os.path.abspath(data_path)
    text = []
    # os.listdir returns names in arbitrary order; sort them so that any
    # position-based split of the result is the same on every run and machine
    for file_path in sorted(os.listdir(domain)):
        file_name = os.path.join(domain, file_path)
        # the context manager closes the file even if reading fails, and the
        # explicit encoding avoids depending on the platform default
        with open(file_name, 'r', encoding='utf-8') as file_content:
            text.append(file_content.read())
    return text

# indicate the locations of the folders
# (only data_root has to be changed when the dataset lives somewhere else)
data_root = r'/Users/Sherryzzh/Documents/NLP/HW/aclImdb'
train_neg = os.path.join(data_root, 'train', 'neg')
train_pos = os.path.join(data_root, 'train', 'pos')
test_neg = os.path.join(data_root, 'test', 'neg')
test_pos = os.path.join(data_root, 'test', 'pos')

# read the files in each folder and load them into the list
train_neg_data = load_data(train_neg)
train_pos_data = load_data(train_pos)
test_neg_data = load_data(test_neg)
test_pos_data = load_data(test_pos)

# choose the first 10000 in negative and positive set respectively, and merge them as the training set
train_split = 10000
train_neg_part = train_neg_data[:train_split]
train_pos_part = train_pos_data[:train_split]
train_data = train_neg_part + train_pos_part
# give labels for the training set (0 = negative, 1 = positive); the counts come
# from the actual slices so labels stay aligned with the data even if a folder
# holds fewer than train_split files
train_label = [0] * len(train_neg_part) + [1] * len(train_pos_part)

# merge the remaining negative and positive training data as the validation set
val_neg_part = train_neg_data[train_split:]
val_pos_part = train_pos_data[train_split:]
val_data = val_neg_part + val_pos_part
val_label = [0] * len(val_neg_part) + [1] * len(val_pos_part)

# merge the negative and positive test data as the test set
test_data = test_neg_data + test_pos_data
test_label = [0] * len(test_neg_data) + [1] * len(test_pos_data)

# check the length of each dataset
print("Train dataset size is {}".format(len(train_data)))
print("Val dataset size is {}".format(len(val_data)))
print("Test dataset size is {}".format(len(test_data)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


Now we start the tokenization. We first do it for single words (unigrams) and save the results with pickle.

In [3]:
# the tokenization function 
import spacy
import string
import re
import pickle as pkl

# Load the spaCy English pipeline 'en_core_web_sm'; the tokenize functions in this
# file call it on raw text and read only the text of each resulting token
tokenizer = spacy.load('en_core_web_sm')
# string.punctuation is a single string of ASCII punctuation characters.
# NOTE(review): the tokenize functions test `token.text not in punctuations`, which
# on a string is a substring test, so multi-character tokens that happen to be a
# contiguous run of that string are filtered out as well
punctuations = string.punctuation

# drop the html tags, lowercase and remove punctuation
def tokenize(sent):
    """Turn one raw text into a list of lowercased tokens without HTML tags.

    Args:
        sent: The raw text of one sample.

    Returns:
        The lowercased text of every token whose text is not found in
        `punctuations`.
    """
    # remove anything that looks like <...> before tokenizing
    without_tags = re.sub(r'<[^>]+>', '', sent, flags=re.S)
    uwords = []
    for token in tokenizer(without_tags):
        word = token.text
        if word in punctuations:
            continue
        uwords.append(word.lower())
    return uwords

# given a dataset, get the tokens for each data, and get all the tokens for the dataset
def tokenize_dataset(dataset):
    """Tokenize every sample of a dataset with `tokenize`.

    Args:
        dataset: An iterable of raw text samples.

    Returns:
        A pair `(token_dataset, all_tokens)`: the first holds one token list
        per sample, the second is all of those tokens concatenated in order.
    """
    token_dataset = [tokenize(sample) for sample in dataset]
    all_tokens = [token for sample_tokens in token_dataset for token in sample_tokens]
    return token_dataset, all_tokens

# Each pickle file is written inside a `with` block so that it is flushed and
# closed as soon as the dump finishes.

# val set tokens
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(val_data)
with open("val_data_tokens_nohtml.p", "wb") as out_file:
    pkl.dump(val_data_tokens, out_file)

# test set tokens
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(test_data)
with open("test_data_tokens_nohtml.p", "wb") as out_file:
    pkl.dump(test_data_tokens, out_file)

# train set tokens (the flat list of all training tokens is saved as well)
print("Tokenizing train data")
train_data_tokens, all_train_tokens = tokenize_dataset(train_data)
with open("train_data_tokens_nohtml.p", "wb") as out_file:
    pkl.dump(train_data_tokens, out_file)
with open("all_train_tokens_nohtml.p", "wb") as out_file:
    pkl.dump(all_train_tokens, out_file)

Then we build the n-gram tokens from the saved unit-word tokens.

In [None]:
# load preprocessed train, val and test datasets
# (pickle can execute code while loading, so only load files written by this notebook;
# the `with` blocks close each file as soon as it has been read)
with open("train_data_tokens_nohtml.p", "rb") as in_file:
    train_data_tokens = pkl.load(in_file)
with open("all_train_tokens_nohtml.p", "rb") as in_file:
    all_train_tokens = pkl.load(in_file)
with open("val_data_tokens_nohtml.p", "rb") as in_file:
    val_data_tokens = pkl.load(in_file)
with open("test_data_tokens_nohtml.p", "rb") as in_file:
    test_data_tokens = pkl.load(in_file)

# generate N grams from a list of unit words
def tokenize_ng(sent, order):
    """Return the unit words of a sample followed by its n-grams.

    Args:
        sent: List of unit-word tokens of one sample.
        order: Largest n-gram size to generate; sizes 2..order are produced.

    Returns:
        A new list: the original tokens, then all 2-grams, then all 3-grams,
        and so on up to `order`. Each n-gram is its words joined by single
        spaces, with leading and trailing whitespace stripped.
    """
    ngrams = []
    for size in range(2, order + 1):
        # slide a window of `size` consecutive words over the sample
        for start in range(len(sent) - size + 1):
            window = sent[start:start + size]
            ngrams.append(' '.join(window).strip())
    return sent + ngrams

def tokenize_dataset_ng(data_tokens, order):
    """Expand every tokenized sample with its n-grams via `tokenize_ng`.

    Args:
        data_tokens: An iterable of token lists, one per sample.
        order: Largest n-gram size, passed through to `tokenize_ng`.

    Returns:
        A pair `(token_dataset, all_tokens)`: the first holds one expanded
        token list per sample, the second is all of them concatenated in order.
    """
    token_dataset = [tokenize_ng(sample, order) for sample in data_tokens]
    all_tokens = [token for sample_tokens in token_dataset for token in sample_tokens]
    return token_dataset, all_tokens

# For each n-gram order (2, 3 and 4) the val, test and train token lists are
# expanded and pickled. Each file is written inside a `with` block so that it is
# flushed and closed as soon as the dump finishes.

# val set tokens
print("Tokenizing val data for 2-gram")
val_data_tokens_2g, _ = tokenize_dataset_ng(val_data_tokens, 2)
with open("val_data_tokens_2g.p", "wb") as out_file:
    pkl.dump(val_data_tokens_2g, out_file)

# test set tokens
print("Tokenizing test data for 2-gram")
test_data_tokens_2g, _ = tokenize_dataset_ng(test_data_tokens, 2)
with open("test_data_tokens_2g.p", "wb") as out_file:
    pkl.dump(test_data_tokens_2g, out_file)

# train set tokens
print("Tokenizing train data for 2-gram")
train_data_tokens_2g, all_train_tokens_2g = tokenize_dataset_ng(train_data_tokens, 2)
with open("train_data_tokens_2g.p", "wb") as out_file:
    pkl.dump(train_data_tokens_2g, out_file)
with open("all_train_tokens_2g.p", "wb") as out_file:
    pkl.dump(all_train_tokens_2g, out_file)

# val set tokens
print("Tokenizing val data for 3-gram")
val_data_tokens_3g, _ = tokenize_dataset_ng(val_data_tokens, 3)
with open("val_data_tokens_3g.p", "wb") as out_file:
    pkl.dump(val_data_tokens_3g, out_file)

# test set tokens
print("Tokenizing test data for 3-gram")
test_data_tokens_3g, _ = tokenize_dataset_ng(test_data_tokens, 3)
with open("test_data_tokens_3g.p", "wb") as out_file:
    pkl.dump(test_data_tokens_3g, out_file)

# train set tokens
print("Tokenizing train data for 3-gram")
train_data_tokens_3g, all_train_tokens_3g = tokenize_dataset_ng(train_data_tokens, 3)
with open("train_data_tokens_3g.p", "wb") as out_file:
    pkl.dump(train_data_tokens_3g, out_file)
with open("all_train_tokens_3g.p", "wb") as out_file:
    pkl.dump(all_train_tokens_3g, out_file)

# val set tokens
print("Tokenizing val data for 4-gram")
val_data_tokens_4g, _ = tokenize_dataset_ng(val_data_tokens, 4)
with open("val_data_tokens_4g.p", "wb") as out_file:
    pkl.dump(val_data_tokens_4g, out_file)

# test set tokens
print("Tokenizing test data for 4-gram")
test_data_tokens_4g, _ = tokenize_dataset_ng(test_data_tokens, 4)
with open("test_data_tokens_4g.p", "wb") as out_file:
    pkl.dump(test_data_tokens_4g, out_file)

# train set tokens
print("Tokenizing train data for 4-gram")
train_data_tokens_4g, all_train_tokens_4g = tokenize_dataset_ng(train_data_tokens, 4)
with open("train_data_tokens_4g.p", "wb") as out_file:
    pkl.dump(train_data_tokens_4g, out_file)
with open("all_train_tokens_4g.p", "wb") as out_file:
    pkl.dump(all_train_tokens_4g, out_file)

To test the effect of the tokenization scheme on the model, we also try a tokenization that does not drop the HTML tags.

In [None]:
# keep the html tags; only lowercase and remove punctuation
# (this redefinition replaces the earlier tokenize, which stripped the tags first)
def tokenize(sent):
    """Turn one raw text into a list of lowercased tokens, leaving HTML tags in.

    Args:
        sent: The raw text of one sample.

    Returns:
        The lowercased text of every token whose text is not found in
        `punctuations`.
    """
    uwords = []
    for token in tokenizer(sent):
        word = token.text
        if word in punctuations:
            continue
        uwords.append(word.lower())
    return uwords

# Each pickle file is written inside a `with` block so that it is flushed and
# closed as soon as the dump finishes.

# val set tokens
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(val_data)
with open("val_data_tokens.p", "wb") as out_file:
    pkl.dump(val_data_tokens, out_file)

# test set tokens
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(test_data)
with open("test_data_tokens.p", "wb") as out_file:
    pkl.dump(test_data_tokens, out_file)

# train set tokens (the flat list of all training tokens is saved as well)
print("Tokenizing train data")
train_data_tokens, all_train_tokens = tokenize_dataset(train_data)
with open("train_data_tokens.p", "wb") as out_file:
    pkl.dump(train_data_tokens, out_file)
with open("all_train_tokens.p", "wb") as out_file:
    pkl.dump(all_train_tokens, out_file)