In [33]:
# Imports
import pandas as pd
import numpy as np
import nltk

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
# Load the training and testing sets (columns: category, text),
# replacing every NaN with '' straight after reading
train = pd.read_csv('data/train.csv').fillna('')
test = pd.read_csv('data/test.csv').fillna('')

In [70]:
# Imports
from nltk.tokenize import RegexpTokenizer

# Tokenizer shared by every call: matches runs of word characters
_W2V_TOKENIZER = RegexpTokenizer(r'\w+')

# Function to clean text
def clean_text_w2v(text):
    '''
    Function to clean text and split it into word tokens
    Process: decode (byte strings only) > lowercase > tokenize
        Input: text as a UTF-8 encoded byte string or an already-decoded
               unicode string
        Output: list of lowercase tokens
    '''
    # Decode: utf-8. Only byte strings are decoded; calling .decode() on text
    # that is already unicode would fail for non-ASCII characters.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    # Convert text to lower case, then tokenize
    return _W2V_TOKENIZER.tokenize(text.lower())

In [66]:
# Clean the training and testing texts
# Iterate over the 'text' column values directly instead of looking each row
# up by number through the deprecated `.ix` accessor
train_clean_X = [clean_text_w2v(text) for text in train['text']]
test_clean_X = [clean_text_w2v(text) for text in test['text']]

# Preview the first five tokenized test texts
print(test_clean_X[:5])

[[u'i', u'love', u'listing', u'rap', u'music'], [u'back', u'on', u'water', u'meditation'], [u'me', u'the', u'first', u'time', u'i', u'ever', u'pinned', u'someone', u'in', u'a', u'cradle', u'proud', u'of', u'myself', u'and', u'how', u'far', u'i', u've', u'gone', u'since', u'then', u'even', u'girls', u'can', u'be', u'just', u'as', u'strong', u'as', u'boys'], [u'any', u'single', u'ladies', u'from', u'circleville', u'on', u'here', u'22m'], [u'i', u'want', u'to', u'go', u'down', u'on', u'a', u'girl', u'so', u'bad', u'been', u'so', u'long', u'lethbridge']]


### Word2Vec - Average
- Continuous Bag of Words (CBOW) is faster
- Skip-gram is slower, so it is not used at the moment

In [37]:
# Multiprocessing
from multiprocessing import cpu_count

# Gensim
from gensim.models.word2vec import Word2Vec

In [38]:
# Model:
#       size = 300 as per http://arxiv.org/pdf/1408.5882v2.pdf
#       window = 5: max distance between the current and predicted word within a sentence
#       min_count = 10: ignore all words with total frequency lower than this
#       workers = cpu_count(): the CPU count reported by multiprocessing

# Initiate model
num_features = 300
model = Word2Vec(size=num_features, window=5, min_count=10, workers=cpu_count())

# Build vocabulary using training data
model.build_vocab(train_clean_X)

# Train using training data and save model
# NOTE(review): presumably the 'w2v/' directory must already exist for the save to succeed - confirm
model.train(train_clean_X)
model.save('w2v/train')

In [39]:
# Report the dimensions of the model's word-vector matrix (model.syn0):
# first axis = vocabulary size, second axis = vector length
embedding_shape = model.syn0.shape
print("Vocabulary: {} words".format(embedding_shape[0]))
print("{} {}".format("Word Vector length (# of features): ", embedding_shape[1]))

Vocabulary: 1537 words
Word Vector length (# of features):  300


In [49]:
def buildWordVector(text, model, size):
    '''
    Average the word vectors of every token in `text` that `model` knows.
    https://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis
        Input: text  - iterable of tokens
               model - mapping from token to a vector of length `size`;
                       lookups that raise KeyError are skipped
               size  - length of each word vector
        Output: array of shape (1, size) holding the mean vector, or all
                zeros when no token was found in the model
    '''
    total = np.zeros((1, size))
    hits = 0
    for token in text:
        try:
            word_vec = model[token]
        except KeyError:
            # Token has no vector in the model: leave it out of the average
            continue
        total += word_vec.reshape((1, size))
        hits += 1
    # Divide only when something was accumulated, to avoid dividing by zero
    if hits:
        total /= hits
    return total

def average_feature_vecs(docs, model, num_features):
    '''
    Compute the average word vector of each document.
        Input: docs         - sequence of documents, each a list of words
               model        - word-vector model passed on to buildWordVector
               num_features - length of each word vector
        Output: float32 numpy array of shape (len(docs), num_features),
                one averaged vector per row
    '''
    # Allocate the whole result up front and fill it one row per document
    averaged = np.zeros((len(docs), num_features), dtype="float32")
    for row, doc in enumerate(docs):
        averaged[row] = buildWordVector(doc, model, num_features)
    return averaged

In [52]:
from sklearn.preprocessing import scale

# Average the word vectors of each training document, then scale the result:
# center to the mean and component wise scale to unit variance
train_doc_vecs = scale(average_feature_vecs(train_clean_X, model, num_features))
print(train_doc_vecs.shape)

(14048, 300)




In [55]:
# Train using testing data and save model
# This continues training the same `model` object, now on the test tokens,
# and saves it under a separate path.
# NOTE(review): train_doc_vecs was computed in an earlier cell, before this
# extra training - confirm that train and test document vectors are meant to
# come from different states of the model.
model.train(test_clean_X)
model.save('w2v/test')



In [56]:
from sklearn.preprocessing import scale

# Average the word vectors of each testing document, then scale the result:
# center to the mean and component wise scale to unit variance
# NOTE(review): the test vectors are scaled on their own here, separately
# from the training vectors - confirm this is intended.
test_doc_vecs = scale(average_feature_vecs(test_clean_X, model, num_features))
print(test_doc_vecs.shape)

(3599, 300)


### Tests

In [103]:
# Read the sample texts, one per line (trailing newlines are kept)
with open('happy.txt', 'r') as infile:
    happy = infile.readlines()

print(happy)

# Tokenize every line with the same cleaning function used for the datasets
happy_clean = [clean_text_w2v(line) for line in happy]

print(happy_clean)

["I'm happy for him...really, I am. She's an amazing girl, and they deserve each other. He's happy &amp; thats all that matters...right?.....\n", 'Feel so happy with no reason... Just happy... Hey my brain, am I missing something? :))\n', 'We finished our first season of @TheBEATDance &amp; I am so happy &amp; proud &amp; thankful &amp; overwhelmed &amp; lots of other good stuff! So Amazing #2013\n', 'am i allowed to be happy about something, or do yo wanna distroy the little i have left?\n', "I am so happy right now I can't even focus on anything else\n", "Why am I being sneaked around her fam when I'm open about us.... But we both happy shit don't add up.\n", 'Heavens suppose to be the happiest place in the world I am happy everyday with the people I love but I feel like I live in heaven everyday:)\n', 'I am  so happy since I have get an $100,00 STARBUCKS GIFT-CARD for Free. I grab it here http://t.co/cg8M1Ubq\n', 'I am one #happy girl :)\n', 'I Am So HAPPY .\n']
[[u'i', u'm', u'happ

In [104]:
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    Input: sentences    - sequence of sentences, each a sequence of tokens
           padding_word - token appended to the shorter sentences
    Output: new list of padded sentences (lists); the input is not modified.
            An empty input yields an empty list.
    """
    # max() raises ValueError on an empty sequence, so handle that case first
    if len(sentences) == 0:
        return []
    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        list(sentence) + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]

In [110]:
# Pad every tokenized text to the length of the longest one, then preview the first three
happy_padded = pad_sentences(happy_clean, padding_word="<PAD/>")
print(happy_padded[:3])

[[u'i', u'm', u'happy', u'for', u'him', u'really', u'i', u'am', u'she', u's', u'an', u'amazing', u'girl', u'and', u'they', u'deserve', u'each', u'other', u'he', u's', u'happy', u'amp', u'thats', u'all', u'that', u'matters', u'right', '<PAD/>'], [u'feel', u'so', u'happy', u'with', u'no', u'reason', u'just', u'happy', u'hey', u'my', u'brain', u'am', u'i', u'missing', u'something', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>', '<PAD/>'], [u'we', u'finished', u'our', u'first', u'season', u'of', u'thebeatdance', u'amp', u'i', u'am', u'so', u'happy', u'amp', u'proud', u'amp', u'thankful', u'amp', u'overwhelmed', u'amp', u'lots', u'of', u'other', u'good', u'stuff', u'so', u'amazing', u'2013', '<PAD/>']]


In [149]:
# Multiprocessing
from multiprocessing import cpu_count

# Gensim
from gensim.models.word2vec import Word2Vec

In [150]:
# Model:
#       size = 5 (num_features below)
#              NOTE(review): this comment previously read "size = 100 as per
#              http://arxiv.org/pdf/1408.5882v2.pdf" while the code uses 5 - confirm which is intended
#       window = 5: max distance between the current and predicted word within a sentence
#       min_count = 1: ignore all words with total frequency lower than this
#                      (the comment previously said 10; the code uses 1)

# Initiate model
# This rebinds `num_features` and `model`, replacing the 300-feature model created earlier
num_features = 5
model = Word2Vec(size=num_features, window=5, min_count=1, workers=cpu_count())

# Build vocabulary from the padded sample texts
model.build_vocab(happy_padded)

# Train on the same padded sample texts
model.train(happy_padded)

# Report the dimensions of the word-vector matrix: vocabulary size and vector length
print "Vocabulary: {} words".format(model.syn0.shape[0])
print "Word Vector length (# of features): ", model.syn0.shape[1]



Vocabulary: 113 words
Word Vector length (# of features):  5


In [153]:
def create(text, model, size):
    '''
    Concatenate the word vectors of every token in `text` into one flat array.
        Input: text  - iterable of tokens; each must be present in `model`
                       (a missing token raises KeyError)
               model - mapping from token to a vector of length `size`
               size  - length of each word vector
        Output: 1-D array of length (len(text) + 1) * size
    '''
    # NOTE(review): the result starts with a row of `size` zeros ahead of the
    # first word vector. It is kept so the output shape does not change -
    # confirm whether this leading row is intended.
    rows = [np.zeros((1, size))]
    rows.extend(model[word].reshape((1, size)) for word in text)
    # Stack once at the end instead of re-copying the growing array per word
    return np.vstack(rows).flatten()

# Build the flattened vector for the first padded text only.
# The vector length is read from the model's word-vector matrix instead of
# being hard-coded, so it always matches the model being used.
vector_size = model.syn0.shape[1]
arr = []
for padded_text in happy_padded[:1]:
    flat_vec = create(padded_text, model, vector_size)
    print(flat_vec.shape)
    arr.append(flat_vec)

5
(145,)


In [146]:
import itertools
from collections import Counter

def build_vocab(sentences):
    """
    Builds a word <-> index vocabulary from tokenized sentences.

    Input: sentences - iterable of sentences, each an iterable of words
    Output: [vocabulary, vocabulary_inv] where vocabulary_inv is the list of
            distinct words ordered from most to least frequent, and
            vocabulary maps each word to its position in that list
    """
    # Count every word across all sentences
    frequencies = Counter(itertools.chain.from_iterable(sentences))
    # Index -> word, most frequent first
    index_to_word = [word for word, _ in frequencies.most_common()]
    # Word -> index
    word_to_index = dict(zip(index_to_word, range(len(index_to_word))))
    return [word_to_index, index_to_word]

20