In [2]:
'''
Steps to be followed: 

1. Load dependencies
2. Prepare corpus
3. Define Model
4. Analyze Model
5. Plot word cluster using t-SNE
6. Plot model on tensorboard
'''

import multiprocessing
import os, json, requests
import re
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector




In [3]:
nltk.download("punkt")
nltk.download("stopwords")

def _to_wordlist(raw):
    
    clean = re.sub("[^a-zA-Z]"," ",raw)
    words = clean.split()
    return (list(map(lambda x:x.lower(), words)))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akash.sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akash.sharma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
filepath = "http://www.gutenberg.org/files/33224/33224-0.txt"
raw_corpus = requests.get(filepath)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#from nltk import sent_tokenize
raw_sentences = tokenizer.tokenize(raw_corpus.text)
#raw_sentences = sent_tokenize(raw_corpus.text)
sentences = []

for sentence in raw_sentences:
    if(len(raw_sentences) > 0):
        sentences.append(_to_wordlist(sentence))

In [5]:
#Defining gensim model

#Arguments being seed- for generating same output later, workers = no. of cpu threads, min_count = min. frequency of words, window = context size, sample = downsampling
model = w2v.Word2Vec(sg = 1, seed = 1, workers = multiprocessing.cpu_count(), min_count = 3, window = 7, sample = 1e-3)\

#So building vocab first
model.build_vocab(sentences)
#Training the model
model.train(sentences, total_examples = model.corpus_count, epochs =120)

(35990527, 51075960)

In [6]:
if not os.path.exists(os.path.join("trained", 'sample')):
    os.makedirs(os.path.join("trained", 'sample'))
    
model.save(os.path.join("trained", 'sample',".w2v"))

In [7]:
#Finding relations between the words using most_similar()
model.wv.most_similar('sky')

  if np.issubdtype(vec.dtype, np.int):


[('serene', 0.8594950437545776),
 ('predominates', 0.6454615592956543),
 ('luxuriantly', 0.6405662894248962),
 ('tempestuous', 0.5351181030273438),
 ('feeble', 0.50838702917099),
 ('weather', 0.5022791624069214),
 ('whatsoever', 0.4882858097553253),
 ('monsoons', 0.47658997774124146),
 ('intercepted', 0.4733385145664215),
 ('delightful', 0.4719574451446533)]

In [8]:
#When we call most_similar on a word not in the vocabulary
#Limitation 
model.wv.most_similar('laughter')

KeyError: "word 'laughter' not in vocabulary"

In [None]:
#Checking Analogy
model.most_similar_cosmul(positive =['earth', 'moon'], negative =['orbit'])

In [None]:
#Using t-SNE algorithm (used for dimensionality reduction)
tane = sklearn.manifold.TSNE(n_components=2, random_state =0)
all_word_vectors_matrix = model.wv.vectors
all_word_vectors_matrix_2d = tane.fit_transform(all_word_vectors_matrix)

points = pd.DataFrame([ (word, coords[0], coords[1]) for word, coords in [ (word, all_word_vectors_matrix_2d[model.wv.vocab[word].index]) for word in model.wv.vocab]], columns= ["word", "x", "y"])

sns.set_context("poster")
ax = points.plot.scatter("x", "y", s=10, figsize=(20, 12))
fig = ax.get_figure()  

In [None]:
vocab_list = points.word.values.tolist()
embeddings = all_word_vectors_matrix

embedding_var = tf.Variable(all_word_vectors_matrix, dtype= 'float32', name = 'embedding')
projector_config = projector.ProjectorConfig()

embedding = projector_config.embeddings.add()
embedding.tensor_name = embedding_var.name

LOG_DIR = './'
metadata_file = os.path.join("sample.tsv")

with open(os.path.join(LOG_DIR, metadata_file), 'wt') as metadata: 
    metadata.writelines("%s\n" % w.encode('utf=8') for w in vocab_list)
    
embedding.metadata_path = os.path.join(os.getcwd(), metadata_file)

#Saving summary in LOG_DIR
summary_writer = tf.summary.FileWriter(LOG_DIR)

#The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will
#read this file during startup 

projector.visualize_embeddings(summary_writer, projector_config)

saver = tf.train.Saver([embedding_var])

with tf.Session() as sess:
    #Initialize the model
    sess.run(tf.global_variables_initializer())
    
    saver.save(sess, os.path.join(LOG_DIR, metadata_file + '.ckpt'))    

In [1]:
# Building a sentiment classifier using CNN

class TextCNN(object):
    '''Embedding layer -> Convolutional -> Max-Pooling -> Softmax'''
    
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters, 12_reg_lambda = 0, pre_trained =False):
        
        #Placeholders
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name = "input_x")
        self.input_y = tf.placeholder(tf.float32, [None, numm_classes], name = "input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name = "dropout_keep_probs")
        #Keeping track of regularization losses
        12_loss = tf.constant(0.0)
        
        #Embedding Layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            if pre_trained:
                W_ = tf.variable(tf.constant(0, shape = [vocab_size, embedding_size]), trainable = False, name ='W')
                self.embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_size], name = 'pre-trained')
                W= tf.assign(W_, self.embedding_placeholder)
                else:
                    W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, name ="W"))
                    self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
        
        #Convolution  + pooling
        
        pooled_output = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                #Convolution layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name = "W")
                b = tf.variable(tf.constant(0.1, shape = [num_filters]), name="b")
                conv = tf.nn.conv2d(self.embedded_chars_expanded, W, strides =[1,1,1,1], padding ='VALID', name ="pool") 
                pooled_outputs.append(pooled)
                
        #Combining pooled features
        num_filters_total = num_filters*len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        
        #Dropout for regularization from overfitting
        with tf.name_scope("dropout"):
            self.h_deop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
        
        #Predictions by tf.argmax()
        
        #Final scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable("W", shape =[num_filters_total, num_classes], initializer = tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape = [num_classes]), name ="b")
            l2_loss += tf.nn.12_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name = "scores")
            self.predictions = tf.argmax(self.scores, 1, name = "predictions")
            
        
        #Evaluation 
        
        #Mean-Entropy loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(labels = self.input_y, logits = self.scores)
            self.loss = tf.reduce_mean(losses) + 12_reg_lambda*12_loss
            
        #Accuracy
        with tf.name_scope("loss"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name ="accuracy")


SyntaxError: invalid token (<ipython-input-1-7e419fd58fcb>, line 6)