In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
from nltk import ngrams
from collections import Counter

In [2]:
def process_files(path,directory):
    """Collect review file paths, ratings and sentiment labels from one folder.

    Lists ``os.path.join(path, directory)`` and keeps every entry whose name
    ends in ``.txt``. The rating is the integer in the second
    underscore-separated field of the file stem (e.g. ``12_8.txt`` -> ``8``).

    Args:
        path: Parent directory of the class folder.
        directory: Sub-folder name; ``'pos'`` yields label 1, anything else 0.

    Returns:
        ``(file_names, ratings, labels)``: three equally long lists, in the
        order produced by ``os.listdir``.

    Raises:
        IndexError: if a ``.txt`` file name contains no underscore.
        ValueError: if the rating field is not an integer.
    """
    folder = os.path.join(path, directory)
    file_names, ratings = [], []
    for entry in os.listdir(folder):
        if not entry.endswith('.txt'):
            continue
        file_names.append(folder + '/' + entry)
        stem = os.path.splitext(entry)[0]
        ratings.append(int(stem.split('_')[1]))

    # Every file in the folder shares one label, decided by the folder name
    label = 1 if directory == 'pos' else 0
    return file_names, ratings, [label] * len(file_names)

In [3]:
# train_pos_files, train_pos_ratings, train_pos_labels = process_files('./../aclImdb/train/','pos')
# train_neg_files, train_neg_ratings, train_neg_labels = process_files('./../aclImdb/train/','neg')
# test_pos_files, test_pos_ratings, test_pos_labels = process_files('./../aclImdb/test/','pos')
# test_neg_files, test_neg_ratings, test_neg_labels = process_files('./../aclImdb/test/','neg')

In [4]:
# train_files,train_ratings, train_labels = np.array(train_pos_files + train_neg_files),np.array(train_pos_ratings + train_neg_ratings), np.array(train_pos_labels + train_neg_labels)
# test_files,test_ratings, test_labels = np.array(test_pos_files + test_neg_files),np.array(test_pos_ratings + test_neg_ratings), np.array(test_pos_labels + test_neg_labels)

In [5]:
# train_idx, val_idx = train_test_split(np.arange(len(train_files)),test_size = 0.2)

In [6]:
# val_files,val_ratings,val_labels = train_files[val_idx],train_ratings[val_idx],train_labels[val_idx]
# train_files_1,train_ratings_1,train_labels_1 = train_files[train_idx],train_ratings[train_idx],train_labels[train_idx]

In [7]:
def pickling(file,path):
    """Serialize an object to disk with pickle.

    Args:
        file: Any picklable Python object (despite the name, not a file handle).
        path: Destination path; an existing file is overwritten.

    Returns:
        None.
    """
    # The context manager flushes and closes the handle even if dump() raises,
    # instead of leaving that to garbage collection.
    with open(path, 'wb') as handle:
        pickle.dump(file, handle)
def unpickling(path):
    """Load and return the object stored in a pickle file.

    Args:
        path: Path of a file previously written with pickle.

    Returns:
        The deserialized object.

    Note:
        pickle.load can execute arbitrary code; only use this on files you
        created yourself or otherwise trust.
    """
    # Close the handle deterministically rather than relying on garbage collection
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [8]:
# Load the cached training split instead of rebuilding it. The cache was
# written once with:
#   pickling([train_files_1,train_ratings_1, train_labels_1],'req_train_files.p')
train_files_1, train_ratings_1, train_labels_1 = unpickling(path='req_train_files.p')

In [9]:
# Load the cached test split instead of rebuilding it. The cache was written
# once with:
#   pickling([test_files,test_ratings, test_labels],'req_test_files.p')
test_files, test_ratings, test_labels = unpickling(path='req_test_files.p')

In [10]:
# Load the cached validation split instead of rebuilding it. The cache was
# written once with:
#   pickling([val_files,val_ratings, val_labels],'req_val_files.p')
val_files, val_ratings, val_labels = unpickling(path='req_val_files.p')

In [11]:
# Let's write the tokenization function 

import spacy
import string
import nltk
# lem = nltk.stem.LancasterStemmer()
# Load spaCy's small English pipeline. In this notebook it is only called to
# split raw text into tokens (see tokenize below).
tokenizer = spacy.load('en_core_web_sm')
# string.punctuation is one str of ASCII punctuation characters, so
# `x in punctuations` is a substring test, not a per-character set lookup.
punctuations = string.punctuation
# nltk.download("wordnet")
# lowercase and remove punctuation
def tokenize(sent):
    """Split text into lowercase tokens, dropping punctuation tokens.

    Args:
        sent: Text to tokenize with the module-level spaCy ``tokenizer``.

    Returns:
        List of lowercased token strings, in document order.
    """
    kept = []
    for token in tokenizer(sent):
        text = token.text
        # `punctuations` is a str, so this drops tokens that are a substring of
        # it (e.g. "," or "!"); other multi-character tokens such as "..." stay.
        if text in punctuations:
            continue
        kept.append(text.lower())
    return kept

# Sanity check: run tokenize() on one sentence and show the resulting tokens
tokens = tokenize(sent=u"Apples isn't looking at buying U.K. startup for $1 billion")
print(tokens)

['apples', 'is', "n't", 'looking', 'at', 'buying', 'u.k.', 'startup', 'for', '1', 'billion']


In [12]:
def tokenize_n(array,n):
    """Build space-joined n-grams of exactly order ``n`` from a token sequence.

    Args:
        array: Sequence of token strings.
        n: N-gram order passed to ``nltk.ngrams``.

    Returns:
        List of strings, each the tokens of one n-gram joined by a single space.
    """
    joined = []
    for gram in ngrams(array, n):
        joined.append(' '.join(gram))
    return joined

In [13]:
def tokenize_till_n(sent,n = 1):
    """Tokenize ``sent`` and return all n-grams of order 1 through ``n``.

    Args:
        sent: Raw text, passed to ``tokenize``.
        n: Highest n-gram order to include (default 1, i.e. plain tokens).

    Returns:
        For ``n == 1`` the token list from ``tokenize`` itself; otherwise a new
        list holding the order-1 grams first, then order 2, and so on up to ``n``.
    """
    tokens = tokenize(sent)
    # Unigram case: skip the n-gram machinery and hand back the tokens directly
    if n == 1:
        return tokens

    collected = []
    for order in range(1, n + 1):
        collected.extend(tokenize_n(tokens, order))
    return collected

In [14]:
def tokenize_dataset(dataset,n):
    """Tokenize every text file listed in ``dataset``.

    Args:
        dataset: Iterable of paths to UTF-8 encoded text files.
        n: Highest n-gram order, forwarded to ``tokenize_till_n``.

    Returns:
        ``(token_dataset, all_tokens)``: ``token_dataset`` holds one token list
        per file, in input order; ``all_tokens`` is the concatenation of all of
        them, kept so a vocabulary can be built later.
    """
    token_dataset = []
    all_tokens = []

    for path in dataset:
        # Close each file as soon as it is read; the dataset has thousands of
        # files and unclosed handles would otherwise pile up until collected.
        with open(path, 'r', encoding="utf8") as handle:
            sample = handle.read()
        tokens = tokenize_till_n(sample, n = n)
        token_dataset.append(tokens)
        all_tokens += tokens

    return token_dataset, all_tokens

# N = 1

In [62]:
# Validation split as unigrams (n = 1); the flat all-tokens list is discarded.
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=1)

Tokenizing val data


In [63]:
# pickling(val_data_tokens,'val_data_tokens_n=1.p')

In [220]:
# pickling(val_data_tokens,'val_data_tokens_no_preprocessing_n=1.p')

In [64]:
# Test split as unigrams (n = 1); only the per-file token lists are kept.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=1)
# pickling(test_data_tokens, "test_data_tokens_n=1.p")
# pickling(test_data_tokens, "test_data_tokens_no_preprocessing_n=1.p")

Tokenizing test data


In [65]:
# Training split as unigrams (n = 1); all_tokens is the flat list of every
# training token, kept for building the vocabulary.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=1)
# pickling(train_data_tokens,"train_data_tokens_n=1.p")
# pickling(train_data_tokens,"train_data_tokens_no_preprocessing_n=1.p")

Tokenizing train data


In [66]:
# pickling(all_tokens, "all_train_tokens_n=1.p")
# pickling(all_tokens, "all_train_tokens_no_preprocessing_n=1.p")

# N = 2

In [67]:
# Validation split with n-grams of order 1 through 2; the flat list is discarded.
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=2)

Tokenizing val data


In [68]:
# pickling(val_data_tokens,'val_data_tokens_n=2.p')

In [69]:
# Test split with n-grams of order 1 through 2; the flat list is discarded.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=2)
# pickling(test_data_tokens, "test_data_tokens_n=2.p")

Tokenizing test data


In [15]:
# Training split with n-grams of order 1 through 2; all_tokens is the flat
# list of every training n-gram.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=2)
# pickling(train_data_tokens,"train_data_tokens_n=2.p")

Tokenizing train data


In [16]:
# pickling(all_tokens, "all_train_tokens_n=2.p")

# N = 3

In [17]:
# Validation split with n-grams of order 1 through 3; the flat list is discarded.
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=3)

Tokenizing val data


In [18]:
# pickling(val_data_tokens,'val_data_tokens_n=3.p')

In [19]:
# Test split with n-grams of order 1 through 3; the flat list is discarded.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=3)
# pickling(test_data_tokens, "test_data_tokens_n=3.p")

Tokenizing test data


In [20]:
# Training split with n-grams of order 1 through 3; all_tokens is the flat
# list of every training n-gram.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=3)
# pickling(train_data_tokens,"train_data_tokens_n=3.p")

Tokenizing train data


In [21]:
# pickling(all_tokens, "all_train_tokens_n=3.p")

# N = 4

In [22]:
# Validation split with n-grams of order 1 through 4; the flat list is discarded.
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=4)

Tokenizing val data


In [23]:
# pickling(val_data_tokens,'val_data_tokens_n=4.p')

In [24]:
# Test split with n-grams of order 1 through 4; the flat list is discarded.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=4)
# pickling(test_data_tokens, "test_data_tokens_n=4.p")

Tokenizing test data


In [25]:
# Training split with n-grams of order 1 through 4; all_tokens is the flat
# list of every training n-gram.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=4)
# pickling(train_data_tokens,"train_data_tokens_n=4.p")

Tokenizing train data


In [26]:
# pickling(all_tokens, "all_train_tokens_n=4.p")

# N = 1 (no preprocessing)

In [27]:
def tokenize(sent):
    """Split text into lowercase tokens, dropping punctuation tokens.

    Args:
        sent: Text to tokenize with the module-level spaCy ``tokenizer``.

    Returns:
        List of lowercased token strings, in document order.
    """
    # NOTE(review): this redefinition still lowercases and strips punctuation,
    # exactly like the earlier tokenize(), although the section is titled
    # "no preprocessing" - confirm whether raw `token.text` was intended here.
    lowered = []
    for token in tokenizer(sent):
        # Substring test against the punctuation string (it is a str, not a set)
        if token.text in punctuations:
            continue
        lowered.append(token.text.lower())
    return lowered

In [28]:
# Validation split as unigrams, re-run after tokenize() was redefined in the
# preceding cell; the flat all-tokens list is discarded.
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=1)

Tokenizing val data


In [29]:
# pickling(val_data_tokens,'val_data_tokens_n=1.p')

In [30]:
# pickling(val_data_tokens,'val_data_tokens_no_preprocessing_n=1.p')

In [31]:
# Test split as unigrams, using whichever tokenize() is currently defined.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=1)
# pickling(test_data_tokens, "test_data_tokens_n=1.p")
# pickling(test_data_tokens, "test_data_tokens_no_preprocessing_n=1.p")

Tokenizing test data


In [32]:
# Training split as unigrams, using whichever tokenize() is currently defined;
# all_tokens is the flat list of every training token.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=1)
# pickling(train_data_tokens,"train_data_tokens_n=1.p")
# pickling(train_data_tokens,"train_data_tokens_no_preprocessing_n=1.p")

Tokenizing train data


In [33]:
# pickling(all_tokens, "all_train_tokens_n=1.p")
# pickling(all_tokens, "all_train_tokens_no_preprocessing_n=1.p")

# Tokens without stop words (n = 1)

In [46]:
# English stop words to drop during tokenization. Words such as "not", "no"
# and "nor" are not in this list, so they survive filtering.
stop_words = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what",
    "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do",
    "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or",
    "because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after", "to", "from", "up",
    "down", "in", "on", "off", "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "other", "some", "such", "only", "own", "same", "so", "than", "too", "very",
    "s", "t", "can", "will", "just", "don", "should", "now",
]

In [47]:
def tokenize(sent):
    """Split text into lowercase tokens, dropping stop words and punctuation.

    Args:
        sent: Text to tokenize with the module-level spaCy ``tokenizer``.

    Returns:
        List of lowercased token strings, in document order, excluding tokens
        found in ``stop_words`` or matched by the punctuation check.
    """
    # Hash-based lookup built once per call, instead of scanning the
    # stop-word list for every token of the document.
    stop_set = frozenset(stop_words)
    kept = []
    for token in tokenizer(sent):
        lowered = token.text.lower()  # lowercase once, reuse for both checks
        if lowered in stop_set:
            continue
        # `punctuations` is a str, so this is a substring test: single
        # punctuation marks are dropped, tokens such as "..." are kept.
        if lowered not in punctuations:
            kept.append(lowered)
    return kept

In [48]:
# Validation split as unigrams; assumes the stop-word-filtering tokenize()
# defined in the previous cell is the active definition.
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=1)

Tokenizing val data


In [49]:
# Cache the validation token lists on disk.
# NOTE(review): this file name uses 'tokens_wo_stop_words' while the test/train
# caches use 'wo_stop_tokens'; kept unchanged because loaders may rely on it.
pickling(file=val_data_tokens, path='val_data_tokens_wo_stop_words_n=1.p')

In [220]:
# pickling(val_data_tokens,'val_data_tokens_no_preprocessing_n=1.p')

In [50]:
# Test split as unigrams (stop-word tokenize() assumed active), cached to disk.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=1)
pickling(file=test_data_tokens, path="test_data_wo_stop_tokens_n=1.p")
# pickling(test_data_tokens, "test_data_tokens_no_preprocessing_n=1.p")

Tokenizing test data


In [51]:
# Training split as unigrams (stop-word tokenize() assumed active); the
# per-file token lists are cached to disk, all_tokens is saved separately.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=1)
pickling(file=train_data_tokens, path="train_data_wo_stop_tokens_n=1.p")
# pickling(train_data_tokens,"train_data_tokens_no_preprocessing_n=1.p")

Tokenizing train data


In [52]:
# Cache the flat list of all training tokens (unigrams) for vocabulary building.
pickling(file=all_tokens, path="all_train_wo_stop_tokens_n=1.p")
# pickling(all_tokens, "all_train_tokens_no_preprocessing_n=1.p")

# N = 2 (wo stop words)

In [81]:
# Validation split with n-grams of order 1 through 2, built from whatever
# tokens the active tokenize() returns (stop-word version assumed).
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=2)

Tokenizing val data


In [82]:
# Cache the validation token lists (orders 1-2) on disk.
# NOTE(review): 'tokens_wo_stop_words' differs from the 'wo_stop_tokens'
# pattern of the test/train caches; kept unchanged for existing loaders.
pickling(file=val_data_tokens, path='val_data_tokens_wo_stop_words_n=2.p')

In [83]:
# pickling(val_data_tokens,'val_data_tokens_no_preprocessing_n=1.p')

In [84]:
# Test split with n-grams of order 1 through 2 (stop-word tokenize() assumed
# active), cached to disk.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=2)
pickling(file=test_data_tokens, path="test_data_wo_stop_tokens_n=2.p")
# pickling(test_data_tokens, "test_data_tokens_no_preprocessing_n=1.p")

Tokenizing test data


In [85]:
# Training split with n-grams of order 1 through 2 (stop-word tokenize()
# assumed active); per-file lists are cached, all_tokens is saved separately.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=2)
pickling(file=train_data_tokens, path="train_data_wo_stop_tokens_n=2.p")
# pickling(train_data_tokens,"train_data_tokens_no_preprocessing_n=1.p")

Tokenizing train data


In [86]:
# Cache the flat list of all training n-grams (orders 1-2) for vocabulary building.
pickling(file=all_tokens, path="all_train_wo_stop_tokens_n=2.p")
# pickling(all_tokens, "all_train_tokens_no_preprocessing_n=1.p")

# N = 3 (wo stop words)

In [87]:
# Validation split with n-grams of order 1 through 3, built from whatever
# tokens the active tokenize() returns (stop-word version assumed).
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=3)

Tokenizing val data


In [88]:
# Cache the validation token lists (orders 1-3) on disk.
# NOTE(review): 'tokens_wo_stop_words' differs from the 'wo_stop_tokens'
# pattern of the test/train caches; kept unchanged for existing loaders.
pickling(file=val_data_tokens, path='val_data_tokens_wo_stop_words_n=3.p')

In [89]:
# pickling(val_data_tokens,'val_data_tokens_no_preprocessing_n=1.p')

In [90]:
# Test split with n-grams of order 1 through 3 (stop-word tokenize() assumed
# active), cached to disk.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=3)
pickling(file=test_data_tokens, path="test_data_wo_stop_tokens_n=3.p")
# pickling(test_data_tokens, "test_data_tokens_no_preprocessing_n=1.p")

Tokenizing test data


In [91]:
# Training split with n-grams of order 1 through 3 (stop-word tokenize()
# assumed active); per-file lists are cached, all_tokens is saved separately.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=3)
pickling(file=train_data_tokens, path="train_data_wo_stop_tokens_n=3.p")
# pickling(train_data_tokens,"train_data_tokens_no_preprocessing_n=1.p")

Tokenizing train data


In [92]:
# Cache the flat list of all training n-grams (orders 1-3) for vocabulary building.
pickling(file=all_tokens, path="all_train_wo_stop_tokens_n=3.p")
# pickling(all_tokens, "all_train_tokens_no_preprocessing_n=1.p")

# N = 4 (wo stop words)

In [93]:
# Validation split with n-grams of order 1 through 4, built from whatever
# tokens the active tokenize() returns (stop-word version assumed).
print("Tokenizing val data")
val_data_tokens, _ = tokenize_dataset(dataset=val_files, n=4)

Tokenizing val data


In [94]:
# Cache the validation token lists (orders 1-4) on disk.
# NOTE(review): 'tokens_wo_stop_words' differs from the 'wo_stop_tokens'
# pattern of the test/train caches; kept unchanged for existing loaders.
pickling(file=val_data_tokens, path='val_data_tokens_wo_stop_words_n=4.p')

In [95]:
# pickling(val_data_tokens,'val_data_tokens_no_preprocessing_n=1.p')

In [96]:
# Test split with n-grams of order 1 through 4 (stop-word tokenize() assumed
# active), cached to disk.
print("Tokenizing test data")
test_data_tokens, _ = tokenize_dataset(dataset=test_files, n=4)
pickling(file=test_data_tokens, path="test_data_wo_stop_tokens_n=4.p")
# pickling(test_data_tokens, "test_data_tokens_no_preprocessing_n=1.p")

Tokenizing test data


In [97]:
# Training split with n-grams of order 1 through 4 (stop-word tokenize()
# assumed active); per-file lists are cached, all_tokens is saved separately.
print("Tokenizing train data")
train_data_tokens, all_tokens = tokenize_dataset(dataset=train_files_1, n=4)
pickling(file=train_data_tokens, path="train_data_wo_stop_tokens_n=4.p")
# pickling(train_data_tokens,"train_data_tokens_no_preprocessing_n=1.p")

Tokenizing train data


In [98]:
# Cache the flat list of all training n-grams (orders 1-4) for vocabulary building.
pickling(file=all_tokens, path="all_train_wo_stop_tokens_n=4.p")
# pickling(all_tokens, "all_train_tokens_no_preprocessing_n=1.p")