In [None]:
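## Tutorial (WildML): Recurrent Neural Networks, Part 2 -
## implementing a language model RNN with Python, NumPy and Theano.
# Source: http://www.wildml.com/2015/09/recurrent-neural-networks-tutorial-part-2-implementing-a-language-model-rnn-with-python-numpy-and-theano/
################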

In [1]:
import numpy as np
import theano 
import theano.tensor as T

import csv
import itertools
import operator
import nltk
import sys
import os
import time
from datetime import datetime

In [2]:
# Preprocessing configuration.
# Markers wrapped around every sentence (see the cell that builds `sentences`).
sentence_start_token, sentence_end_token = "SENTENCE_START", "SENTENCE_END"
# Placeholder substituted for words that fall outside the vocabulary.
unknown_token = "UNKNOWN_TOKEN"
# Total vocabulary size, counting the unknown-token placeholder.
vocabulary_size = 8000

In [4]:
# preprocessing
##### Add sentence start and end tokens ###
# NOTE: this is Python 2 code -- the file is opened in binary mode and the
# first column of every row is decoded from UTF-8 by hand.
with open('../../data/reddit-comments_raw.csv', 'rb') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the first row (presumably the CSV header). The default value keeps
    # an empty file from raising StopIteration.
    next(reader, None)
    # Split the first column of every row into lower-cased sentences.
    # csv.reader yields an empty list for a blank line, so such rows are
    # skipped instead of failing on row[0].
    raw_sentences = itertools.chain.from_iterable(
        nltk.sent_tokenize(row[0].decode('utf-8').lower())
        for row in reader if row)
    # Wrap each sentence as "SENTENCE_START <sentence> SENTENCE_END".
    # The list must be built inside the `with` block: `raw_sentences` is lazy
    # and still reads from the open file.
    sentences = ['%s %s %s' % (sentence_start_token, sent_text, sentence_end_token)
                 for sent_text in raw_sentences]
print('parsed sentences : %d' % (len(sentences)))
    

parsed sentences : 79170


In [5]:
# Pickle the preprocessed sentences so that later runs can reload them.
# The cache is only written when it does not exist yet: a fresh checkout then
# produces the file that the following load cell expects, while an existing
# cache is never overwritten.
import cPickle
preprocessed_path = '../../data/reddit_comments_preprocessed.pkl'
if not os.path.exists(preprocessed_path):
    with open(preprocessed_path, 'wb') as cache_file:
        cPickle.dump(sentences, cache_file)

In [7]:
# Reload the preprocessed sentences from the pickle cache.
# The `with` block closes the file handle as soon as loading finishes.
# NOTE(review): unpickling can execute arbitrary code -- only load cache files
# that were produced by this notebook.
with open('../../data/reddit_comments_preprocessed.pkl', 'rb') as cache_file:
    loaded = cPickle.load(cache_file)
sentences = loaded

In [20]:
# Break every sentence into its word tokens.
tokenized_sentences = []
for sentence in sentences:
    tokenized_sentences.append(nltk.word_tokenize(sentence))

# Count how often each token occurs across all sentences.
all_tokens = itertools.chain.from_iterable(tokenized_sentences)
word_freq = nltk.FreqDist(all_tokens)

# Report the number of distinct tokens.
print('#unique words : %d' % (len(word_freq)))

# Keep the (vocabulary_size - 1) most frequent tokens; the last vocabulary
# slot is taken by the unknown-token placeholder.
vocab = word_freq.most_common(vocabulary_size - 1)
index2word = [entry[0] for entry in vocab] + [unknown_token]

#unique words : 65751


In [21]:
# Reverse lookup: token -> its position in index2word.
word2index = {word: position for position, word in enumerate(index2word)}

In [9]:
# `vocab` is ordered by most_common(), so its last entry is the rarest kept word.
least_word, least_count = vocab[-1]
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (least_word, least_count))

The least frequent word in our vocabulary is 'devoted' and appeared 10 times.


In [22]:
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word2index else unknown_token for w in sent]

In [18]:
# Show the first sentence before and after tokenisation / unknown-word replacement.
example_raw = sentences[0]
example_tokens = tokenized_sentences[0]
print("\nExample sentence: '%s'" % (example_raw,))
print("\nExample sentence after Pre-processing: '%s'" % (example_tokens,))


Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '[u'SENTENCE_START', u'i', u'joined', u'a', u'new', u'league', u'this', u'year', u'and', u'they', u'have', u'different', u'scoring', u'rules', u'than', u'i', u"'m", u'used', u'to', u'.', u'SENTENCE_END']'


In [23]:
# Create the training data
X_train = np.asarray([[word2index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word2index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [24]:
# Persist the training set (inputs and targets) as a single pickled tuple.
# The `with` block guarantees the file is flushed and closed once the dump is done.
with open('../../data/reddit_training_set.pkl', 'wb') as training_file:
    cPickle.dump((X_train, y_train), training_file)

In [25]:
# Persist the tokenised sentences together with the vocabulary and both lookup
# tables as a single pickled tuple. The `with` block guarantees the file is
# flushed and closed once the dump is done.
with open('../../data/reddit_metadata.pkl', 'wb') as metadata_file:
    cPickle.dump((tokenized_sentences, vocab, word2index, index2word), metadata_file)