In [50]:
# Configuration: which GloVe vector file to read and what to call the
# pickled output. Give bare file names only; later cells prepend the
# directories.
glove_file = "glove_840B_300d.txt"
save_name = "ready_data_840B_300d.pkl"

In [51]:
import re
import numpy as np
import cPickle
from glover import glove

In [43]:
def _stripped_lines(path):
    """Return every line of the text file at `path`, whitespace-stripped."""
    with open(path, 'r') as handle:
        return [raw.strip() for raw in handle]


# --- Sentences -------------------------------------------------------------
# Each row is "<digits><TAB><sentence>"; the first row is skipped
# (presumably a column header -- TODO confirm against the data file).
sentence_rows = _stripped_lines('./data/sentiment_data/datasetSentences.txt')

sentences = []
for row in sentence_rows[1:]:
    sentences.append(re.match('\d+\\\t(.*)', row).group(1))


# --- Phrase dictionary -----------------------------------------------------
# Each row is "<phrase>|<digits>"; every row is used (no row is skipped) and
# the ID is stored as an int next to its phrase.
D = []
with open('./data/sentiment_data/dictionary.txt', 'r') as handle:
    for raw in handle:
        phrase, phrase_id = re.match('([^\|]*)\|(\d+)', raw).group(1, 2)
        D.append((phrase, int(phrase_id)))


# --- Sentiment scores ------------------------------------------------------
# Each row is "<digits>|<score>"; the first row is skipped.
label_rows = _stripped_lines('./data/sentiment_data/sentiment_labels.txt')

score_text = [re.match('\d+\|(.*)', row).group(1) for row in label_rows[1:]]
sentiments = np.array(list(map(float, score_text)))


# --- Train/dev/test assignment ---------------------------------------------
# Each row is "<digits>,<single digit>"; the first row is skipped.
split_rows = _stripped_lines('./data/sentiment_data/datasetSplit.txt')

split_codes = [re.match('\d+\,(\d)', row).group(1) for row in split_rows[1:]]
split = np.array(list(map(int, split_codes)))


# Extract the sentiment for each sentence.
#
# A sentence is scored by finding the dictionary phrase that is exactly equal
# to it and using that phrase's ID as an index into `sentiments`.
# NOTE(review): this assumes the rows of sentiment_labels.txt are ordered by
# phrase ID starting at 0 -- confirm against the data files.

# Phrase -> ID lookup. `setdefault` keeps the FIRST ID seen for a phrase, so a
# duplicated phrase resolves the same way a front-to-back scan of D would.
phrase_to_id = {}
for phrase, phrase_id in D:
    phrase_to_id.setdefault(phrase, phrase_id)

# Sentence sentiments; -1 marks "no matching phrase found" and is what the
# split step checks to route a sentence into the unscored bucket.
sent_sent = -np.ones(len(sentences))

# One dictionary lookup per sentence instead of scanning all phrases
for sIdx, s in enumerate(sentences):
    phrase_id = phrase_to_id.get(s)
    if phrase_id is not None:
        sent_sent[sIdx] = sentiments[phrase_id]

            
# Pair every sentence with its sentiment score
review_target = zip(sentences, sent_sent)

# One bucket per split, plus a bucket for pairs that never received a score
train_RT, test_RT, dev_RT, unscored_RT = [], [], [], []

for pair, label in zip(review_target, split):
    # A score of -1 means no phrase matched this sentence: keep it out of
    # every split
    if pair[1] == -1.0:
        unscored_RT.append(pair)
        continue

    # Split label 1 -> train, 2 -> test, anything else -> dev
    if label == 1:
        train_RT.append(pair)
    elif label == 2:
        test_RT.append(pair)
    else:
        dev_RT.append(pair)

# Print the size of each bucket.
# Uses the *_RT lists built above; the numeric `train`/`test`/`dev` datasets
# are only created in a later cell, so referring to them here fails on a
# fresh kernel. The single-argument print(...) form behaves the same as the
# print statement.
print('Number of sentence/sentiment pairs:\nTrain:\t{}\nTest:\t{}\nDev:\t{}\nBad:\t{}'.format(
    len(train_RT), len(test_RT), len(dev_RT), len(unscored_RT)))

Number of sentence/sentiment pairs:
Train:	8117
Test:	2125
Dev:	1044
Bad:	569


In [3]:
# Load the tools for creating sequences of word vectors
glove_dir = './data/glove_files/'

# Ensure the file ends with .txt
if not glove_file.endswith('.txt'):
    glove_file += '.txt'

# Prepend the directory only if it is not already there. This cell rewrites
# `glove_file` in place, so without the check a second run of the cell would
# produce './data/glove_files/./data/glove_files/...'.
if not glove_file.startswith(glove_dir):
    glove_file = glove_dir + glove_file

# Build the word-vector lookup from the file (slow: the recorded run of this
# cell reported roughly 330 seconds)
G = glove(glove_file)

100% complete; 332.67 seconds.
Done!


In [44]:
# Write a function to produce the vector representations of the reviews
def make_numeric(G, data):
    """Convert (review, target) pairs into (vector sequence, target) pairs.

    Parameters
    ----------
    G : object with a ``vec(token)`` method
        Word-vector lookup. ``vec`` must return an array that supports
        ``[None, :]`` indexing (presumably a 1-D word vector -- TODO confirm
        against the ``glover`` module).
    data : iterable of (review, target)
        ``review`` is a string; ``target`` is passed through untouched.

    Returns
    -------
    list of (ndarray, target)
        One entry per input pair, in input order. Each array holds the
        vectors of the review's whitespace-separated tokens stacked along
        axis 0. An empty ``data`` yields an empty list.
    """

    def review2seq(review):
        """Turn a single review string into an array of token vectors."""
        # Tokenize on runs of whitespace
        tokens = re.split(r'\s+', review)

        # Give each vector a leading axis, then join them along that axis
        return np.concatenate([G.vec(t)[None, :] for t in tokens],
                              axis=0)

    # Iterate the pairs directly rather than unpacking zip(*data): the
    # unpacking form raises ValueError when a split contains no pairs.
    return [(review2seq(review), target) for review, target in data]

In [48]:
# Turn each usable split of (review, target) pairs into its numeric form,
# in the order train, test, dev
train, test, dev = (make_numeric(G, pairs)
                    for pairs in (train_RT, test_RT, dev_RT))

In [49]:
# Save it!
# The raw (review, target) pairs and their numeric counterparts are stored
# together under the keys below.
save_dict = {
    'train_RT': train_RT,
    'test_RT': test_RT,
    'dev_RT': dev_RT,
    'unscored_RT': unscored_RT,
    'train': train,
    'test': test,
    'dev': dev}

save_dir = './data/'
# Ensure the file ends with .pkl
if not save_name.endswith('.pkl'):
    save_name += '.pkl'

# Prepend the directory only if it is not already there. This cell rewrites
# `save_name` in place, so without the check a second run of the cell would
# target './data/./data/...'.
if not save_name.startswith(save_dir):
    save_name = save_dir + save_name

# Pickles are byte streams: write in binary mode and use the most compact
# protocol available instead of the default ASCII one.
# NOTE: readers must open this file with 'rb'; cPickle.load detects the
# protocol on its own.
with open(save_name, 'wb') as f:
    cPickle.dump(save_dict, f, cPickle.HIGHEST_PROTOCOL)