Encoding of Text
================

The goal of this notebook is to train a Word2Vec skip-gram model on the provided data.

In [38]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
from matplotlib import pylab
from six.moves import range
from sklearn.manifold import TSNE

In [39]:
# Set the paths required for the data here.
# Every dataset location is derived from `root_path`, so pointing the notebook
# at a different data directory only requires editing that one line.

root_path = "../Data"
train_path = os.path.join(root_path, "Train")

### Task 1: Read the data from the data file and convert it into a well-formatted dataset

In [40]:
# function to read the data from the file and store it in well-formatted lists

def read_data(filename):
    """ read a tab-separated word/label file into parallel lists

        Every non-blank line is expected to hold a word and its label
        separated by a tab character. Blank (or whitespace-only) lines are
        treated as sentence boundaries.

        input: path of the data file
        output: data list, labels, sentence_completion_mark
            data          -- the words, in file order
            labels        -- the label of each word (same length as data)
            sentence_mark -- 0-based line numbers of the blank lines in the
                             file. Blank lines are counted, so these are
                             positions in the file, NOT indices into data.
        raises: ValueError if a non-blank line has no tab-separated label

    """
    data = []           # actual words
    labels = []         # labels of the words, aligned with `data`
    sentence_mark = []  # line numbers of the blank (end-of-sentence) lines

    # NOTE(review): the file is decoded with the platform default encoding;
    # confirm the data file is readable with it (e.g. UTF-8).
    with open(filename) as f:
        # enumerate counts every line, blank ones included
        for line_number, line in enumerate(f):

            line = line.strip()  # strip off the newline and surrounding whitespace

            if line == "":
                sentence_mark.append(line_number)
                continue

            # Lines read from a text-mode file are already native `str`, so no
            # `tf.compat.as_str` conversion is needed before splitting.
            data_point = line.split("\t")  # split at the tab character
            if len(data_point) < 2:
                # Fail with the location of the bad row instead of a bare IndexError.
                raise ValueError(
                    "%s: line %d has no tab-separated label: %r"
                    % (filename, line_number + 1, line)
                )

            data.append(data_point[0])    # the word
            labels.append(data_point[1])  # its label

    return data, labels, sentence_mark

In [45]:
# use the above function to get the data
# (the file is located relative to `train_path`, which is set in the paths cell)
words, tags, sents = read_data(os.path.join(train_path, "wnut_ner_evaluation/data/train"))

# test the data if it is coming properly
print('Data size %d' % len(words))
print("Some of the sample words in the vocabulary: ", words[10: 100])

print('\n\nLabels size %d' % len(tags))
print("Corresponding labels of the words in the vocabulary: ", tags[10: 100])

Data size 46469
Some of the sample words in the vocabulary:  ['me', '*wink*', 'Made', 'it', 'back', 'home', 'to', 'GA', '.', 'It', 'sucks', 'not', 'to', 'be', 'at', 'Disney', 'world', ',', 'but', 'its', 'good', 'to', 'be', 'home', '.', 'Time', 'to', 'start', 'planning', 'the', 'next', 'Disney', 'World', 'trip', '.', "'", 'Breaking', 'Dawn', "'", 'Returns', 'to', 'Vancouver', 'on', 'January', '11th', 'http://bit.ly/dbDMs8', '@ls_n', 'perhaps', ',', 'but', 'folks', 'may', 'find', 'something', 'in', 'the', 'gallery', 'that', 'is', 'helpful', 'in', 'their', 'day-to-day', 'work', 'as', 'well', '.', 'Even', 'just', 'to', 'use', 'it', '.', '@Carr0t', 'aye', 'been', 'tonight', '-', 'excellent', 'RT', '@LilTwist', ':', 'RT', 'this', 'if', 'you', 'want', 'me', 'to', 'go']


Labels size 46469
Corresponding labels of the words in the vocabulary:  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo-loc', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-facility', 'I-facility', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [46]:
# function to build an integer-encoded dataset from the list of words,
# together with the lookup tables between the words and their ids

def build_dataset(words):

    """ build an integer-encoded dataset from the list of words

        Every distinct word receives an id; ids are assigned in order of
        decreasing frequency, so id 0 is the most frequent word.

        input: list of words
        output: the formatted data, count, dictionary (map), reverse_dictionary
            data               -- the id of each entry of `words`, in order
            count              -- list of (word, frequency) pairs, most
                                  frequent first
            dictionary         -- map from word to id
            reverse_dictionary -- map from id back to word

    """

    # Count every distinct word. most_common() without a limit returns all of
    # them; capping it at len(words) - 1 would drop a word whenever all the
    # words are distinct and make the lookup below fail with a KeyError.
    count = collections.Counter(words).most_common()

    # the position in the frequency ranking is the id of the word
    dictionary = {word: index for index, (word, _) in enumerate(count)}

    # encode the words as their ids
    data = [dictionary[word] for word in words]

    # a reverse dictionary for mapping the unique ids back to their words
    reverse_dictionary = {index: word for word, index in dictionary.items()}

    return data, count, dictionary, reverse_dictionary

In [47]:
# Encode the word list as integer ids and keep the lookup tables
# returned alongside them by build_dataset.

data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words', count[20:30])
print('Sample data', data[:20])
del words  # Hint to reduce memory.

# Naive reconstruction of the first tweets from their ids:
print("what the data corresponds to: ")
decoded_tokens = []
for datum in data[:20]:
    # each decoded word is followed by a single space, the last one included
    decoded_tokens.append(reverse_dictionary[datum] + " ")
string = "".join(decoded_tokens)
print(string)

Most common words [('when', 283), ('?', 275), ('i', 254), ('day', 253), ('time', 243), ('at', 233), ('be', 220), ('tonight', 209), ('that', 208), ('me', 208)]
Sample data [9114, 5971, 84, 49, 26, 46, 231, 95, 132, 1815, 29, 6665, 2924, 12, 102, 135, 3, 1071, 0, 78]
what the data corresponds to: 
@SammieLynnsMom @tg10781 they will be all done by Sunday trust me *wink* Made it back home to GA . It 
