In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import random

In [14]:
import collections
import numpy as np
import torch

#### Hyperparameters

In [18]:
Char_Embedding_Dim = 30
Word_Embedding_Dim = 50
Window_Size = 5
Number_of_Filter = 100
Hidden_Units = 50
Dropout_Rate = 0.5
Epochs = 50
Batch_Size = 10
L_rate = 0.001
Momentum = 0.9
max_num_sentence = 50
max_seq_length = 50
batch_size =32

#### Preprocessing

In [4]:
def load_data(path):
    """
    Input: training_final.csv file path
    output: list of dictionary [{'essay': , 'essay_set':, 'final_score':, 'scaled_score':}, 
                                {'essay2': , 'essay2_set':, 'final_score2':, 'scaled_score2':}]
            @Essay_info_list
                
    """
    opt = []
    data = pd.read_csv(path, header=0, index_col=0)
    data = data.reset_index(drop=True)
    data['essay_set'] = [float(n) for n in data['essay_set']]
    for i in range(data.shape[0]):
        example = {}
        #essay + essay_set + final_score + scaled_score
        ##Strip out the parse information and the phrase labels---we don't need those here
        text = re.sub(r'\s*(\(\d)|(\))\s*', '', data['essay'][i].lower())
        example['essay'] = text[1:] #essay
        example['essay_set'] = data['essay_set'][i] #essay_set
        example['final_score'] = data['final_score'][i] #final_score
        example['scaled_score'] = data['scaled_score'][i] #scaled_score
        
        opt.append(example)
    random.seed(123)
    random.shuffle(opt)
    return opt

In [5]:
data = load_data("./training_final.csv")

In [6]:
data = data[0:10]

In [7]:
data[0]

{'essay': 'he mood in exerpt is in between grateful and caring and hard because thats what @person1\'s mood ended they way it created was by the type of house he living with them and and also the people @person1 about. also another mood is comfortable for example when he said "i was born in newark, @location2, in a simple house explains his mood towards everything. ',
 'essay_set': 5.0,
 'final_score': 1.0,
 'scaled_score': 0.25}

#### Take pretrained glove as embedding

In [8]:
PADDING = "<PAD>"
UNKNOWN = "<UNK>"
max_seq_length = 7000

In [9]:
#Read in Glove
embeddings_indices = {}
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_indices[word] = coefs

In [10]:
def tokenize(string):
    tokens = nltk.word_tokenize(string)
    for index, token in enumerate(tokens):
        if token == '@' and (index+1) < len(tokens):
            tokens[index+1] = '@' + re.sub('[0-9]+.*', '', tokens[index+1])
            tokens.pop(index)
    return tokens

In [11]:
def tokenize_to_sentences(text, create_vocab_flag=False):

    # tokenize a long text to a list of sentences
    sents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s', text)
    
    processed_sents = []
    for sent in sents:
        if re.search(r'(?<=\.{1}|\!|\?|\,)(@?[A-Z]+[a-zA-Z]*[0-9]*)', sent):
            s = re.split(r'(?=.{2,})(?<=\.{1}|\!|\?|\,)(@?[A-Z]+[a-zA-Z]*[0-9]*)', sent)
            ss = " ".join(s)
            ssL = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s', ss)

            processed_sents.extend(ssL)
        else:
            processed_sents.append(sent)
    

    if create_vocab_flag:
        sent_tokens = [tokenize(sent) for sent in processed_sents]
        tokens = [w for sent in sent_tokens for w in sent]
        return tokens

    
    sent_tokens = []
    for sent in processed_sents:
        s_tokens = [sent.split()]
        sent_tokens.extend(s_tokens)
    
    return sent_tokens

In [12]:
sent_token = tokenize_to_sentences(data[1]['essay'])

In [15]:
def build_dictionary(dataset):
    """
    Input: @Essay_info_dict
    output: word_indices, len(vocabulary)
    """
    word_counter = collections.Counter()
    for example in dataset:
        word_counter.update(tokenize(example['essay']))
        
    vocabulary = [word for word in word_counter]
    vocabulary = [PADDING, UNKNOWN] + vocabulary
    
    word_indices = dict(zip(vocabulary, range(len(vocabulary))))
    
    return word_indices, len(vocabulary)

def sentences_to_padded_index_sequences_character(word_indices, dataset):
    """
    Annotate datasets with feature vectors. Adding right-sided padding
    Input: vocabulary with integer, Essay_info_list
    Output: Essay_info_list text as word_indices
    """
    for example in dataset:
        example['character_index'] = np.zeros(max_seq_length)
        
        token_sequence = tokenize(example['essay'])
        padding = max_seq_length - len(token_sequence)
        
        for i in range(max_seq_length):
            if i >= len(token_sequence):
                index = word_indices[PADDING]
                pass
            else:
                if token_sequence[i] in word_indices:
                    index = word_indices[token_sequence[i]]
                else:
                    index = word_indices[UNKNOWN]
            example['character_index'][i] = index

def sentences_to_padded_index_sequences_sentence(word_indices, dataset):
    """
    Annotate datasets with feature vectors. Adding right-sided padding
    Input: vocabulary with integer, Essay_info_list
    Output: Essay_info_list text as sentence_indices
    """
    for example in dataset:
        example['sentence_index'] = np.zeros((max_num_sentence, max_seq_length))
        #print(example['essay'])
        token_sequence = tokenize_to_sentences(example['essay'])
       
        #[['he', 'mood', 'in', 'exerpt', 'is', 'in', 'between']]
        for j,seq in enumerate(token_sequence):
            for i in range(max_seq_length):
                if i >= len(seq):
                    index = word_indices[PADDING]
                    pass
                else:
                    if seq[i] in word_indices:
                        index = word_indices[seq[i]]
                    else:
                        index = word_indices[UNKNOWN]
                example['sentence_index'][j][i] = index
        
        #example['text_index_sequence'] = np.array(example['text_index_sequence'])
        #example['index'] = torch.FloatTensor(example['text_index_sequence'])
        #example['essay_set'] = torch.LongTensor([example['essay_set']])
        
def indSeq_to_oneHot(indSeq, word_indices): 
    """
    Convert index_sequence into 1-hot sequence
    """
    print(indSeq.shape)
    pos = list(map(int, indSeq))
    n_values = len(word_indices)
    return np.eye(n_values)[pos]

In [16]:
word_indices, vocab_size = build_dictionary(data)
sentences_to_padded_index_sequences_character(word_indices, data)
sentences_to_padded_index_sequences_sentence(word_indices, data)

#### Batchify

In [17]:
# This is the iterator we'll use during training. 
# It's a generator that gives you one batch at a time.
def data_iter(source, batch_size):
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)   
        batch_indices = order[start:start + batch_size]
        yield [source[index] for index in batch_indices]

# This is the iterator we use when we're evaluating our model. 
# It gives a list of batches that you can then iterate through.
def eval_iter(source, batch_size):
    batches = []
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while start < dataset_size - batch_size:
        start += batch_size
        batch_indices = order[start:start + batch_size]
        batch = [source[index] for index in batch_indices]
        batches.append(batch)
        
    return batches

# The following function gives batches of vectors and labels, 
# these are the inputs to your model and loss function
def get_batch(batch):
    vectors = []
    labels = []
    for dict in batch:
        vectors.append(dict["sentence_index"])
        labels.append(dict['scaled_score'])
    return vectors, labels

In [19]:
training_iter = data_iter(data, batch_size)
train_eval_iter = eval_iter(data[0:500], batch_size)
dev_iter = eval_iter(data[0:500], batch_size)

In [20]:
vectors, labels = get_batch(next(training_iter))

In [21]:
vectors, labels

([array([[   1.,  546.,    1., ...,    0.,    0.,    0.],
         [ 219.,  491.,  477., ...,    0.,    0.,    0.],
         [ 442.,  357.,  620., ...,    0.,    0.,    0.],
         ..., 
         [   0.,    0.,    0., ...,    0.,    0.,    0.],
         [   0.,    0.,    0., ...,    0.,    0.,    0.],
         [   0.,    0.,    0., ...,    0.,    0.,    0.]]),
  array([[ 377.,  605.,  700., ...,    0.,    0.,    0.],
         [   1.,  433.,   25., ...,    0.,    0.,    0.],
         [ 546.,  113.,    1., ...,    0.,    0.,    0.],
         ..., 
         [   0.,    0.,    0., ...,    0.,    0.,    0.],
         [   0.,    0.,    0., ...,    0.,    0.,    0.],
         [   0.,    0.,    0., ...,    0.,    0.,    0.]]),
  array([[ 523.,  373.,  209., ...,    0.,    0.,    0.],
         [ 682.,  605.,  123., ...,    0.,    0.,    0.],
         [  81.,  141.,   49., ...,  461.,  682.,  605.],
         ..., 
         [   0.,    0.,    0., ...,    0.,    0.,    0.],
         [   0.,    0.,

In [22]:
data[0]['sentence_index'].shape

(50, 50)

In [25]:
#indSeq_to_oneHot(data[0]['sentence_index'], word_indices).shape