# Text 4: Word2Vec
**Internet Analytics - Lab 4**

---

**Group:** *H*

**Names:**

* *Antoine Basseto*
* *Andrea Pinto*
* *Jérémy Baffou*


---

#### Instructions

*This is a template for part 4 of the lab. Clearly write your answers, comments and interpretations in Markodown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of codes in the notebook, you can also embed long python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [46]:
import pickle
import string
from collections import defaultdict, Counter
import re
import numpy as np
from scipy.sparse import csr_matrix
from collections import defaultdict
import json
from utils import *
import gensim
from sklearn.cluster import KMeans

courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

## Redo pre-processing

In [14]:
courses_rdd = sc.parallelize(courses)

In [15]:
courses_processed = courses_rdd.map(lambda c : {"courseId" : c["courseId"], "name" : c["name"], "description" : [word.casefold() for word in c["description"].translate(str.maketrans('', '', string.punctuation)).split() if (word.lower() not in stopwords and not word.isdigit())]})

In [95]:
# remove punctuations, stop words and digits
courses_processed = courses_rdd.map(lambda c :
                                    {"courseId" : c["courseId"], 
                                     "name" : c["name"], 
                                     "description" : [word 
                                                      for word in c["description"]
                                                      .translate(str.maketrans('', '', string.punctuation))
                                                      .split() 
                                                      if (lower(word) not in stopwords and not word.isdigit())]})

In [17]:
courses_word_aggregation = courses_processed.flatMap(lambda c : c["description"]) #flatten all words lists

In [18]:
words_number = courses_word_aggregation.count()
words_count = courses_word_aggregation.map(lambda w : (w,1)).reduceByKey(lambda w1,w2 : w1+w2).map(lambda w : (w[1],w[0])).sortByKey(False)
words_freq = words_count.map(lambda w : (w[1],w[0]/words_number))
words_freq_for_plot = np.asarray(words_freq.map(lambda w : w[1]).collect())

In [19]:
really_frequent_indices = np.where(words_freq_for_plot > np.quantile(words_freq_for_plot,0.60)) # maybe change quantile
really_frequent_words = set(words_freq.take(really_frequent_indices[0][-1]))

In [20]:
bag_of_words_per_course = courses_processed.map(lambda c :{"courseId" : c["courseId"], "name" : c["name"], "description" : [w for w in c["description"] if w not in really_frequent_words]}) 

### N-Grams ??

## Get pre-trained model

In [2]:
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/ix/model.txt')  

## Exercise 4.12 : Clustering word vectors

In [87]:
# Normalize word vectors so KMeans using euclidean distance gives good results even if not using cosine similarity
word_vectors.init_sims(replace=True) # DOES NOT WORK !!!!!!!!!

# Get a default word_vector for words out of the vocabulary of the model
default_wv = word_vectors.get_vector("dog")

# Process the data according to the word2vec model
data_w2v = defaultdict(lambda: default_wv) 
for word in bag_of_words_per_course.flatMap(lambda c : c["description"]).collect():
    if word in word_vectors.vocab:
        data_w2v.update({word: word_vectors.get_vector(word)})
        
X = list(data_w2v.values())

In [32]:
kmeans = KMeans(n_clusters=15, random_state=0).fit(X)

In [33]:
nearest_clusters = kmeans.predict(X)

In [34]:
kmeans.cluster_centers_

array([[ 0.03221008,  0.00469214,  0.03713237, ...,  0.01891559,
         0.03382844,  0.03261775],
       [ 0.01280225,  0.02789977,  0.02947793, ...,  0.02125974,
        -0.00800326,  0.00366061],
       [ 0.07371299, -0.00060918,  0.04464344, ...,  0.03224631,
         0.04182734, -0.0090212 ],
       ...,
       [ 0.05495139,  0.00670257,  0.05589775, ...,  0.00577426,
         0.02130381,  0.00441768],
       [ 0.01165707, -0.00202622,  0.03584902, ...,  0.01634453,
         0.0179659 ,  0.02161639],
       [-0.00742456,  0.01266945,  0.03792582, ...,  0.02705007,
        -0.00732489,  0.00050998]])

In [43]:
for i in range(15):
    print(word_vectors.most_similar_cosmul(positive=[kmeans.cluster_centers_[i]], topn=10))
    print()

[('knowing', 0.6640102384436029), ('indeed', 0.6589506274898533), ('even', 0.6582542000540269), ('importantly', 0.6557958480387266), ('merely', 0.6556871851420182), ('ignore', 0.6556722527990587), ('something', 0.6556632498167626), ('whatever', 0.6554329244219337), ('realize', 0.6548817161126291), ('believing', 0.6547720638907041)]

[('annot-bar', 0.607324636592633), ('NORWICH', 0.6048064686943688), ('9,409,073', 0.6011652768683883), ('shrew-forms', 0.6008239955275159), ('NOEDITSECTION', 0.6007137278586562), ('hlist', 0.5996329573430155), ('Biology1', 0.5994184596425842), ('notoc', 0.5989300335803941), ('122,165', 0.5985543535220248), ('andcoordinates', 0.5984760215423677)]

[('down-regulation', 0.7589179626007169), ('gangliosides', 0.7588205590833987), ('PTPkappa', 0.7587521133850987), ('microglial', 0.758565173021969), ('STAT1', 0.7582569799655369), ('MMP-2', 0.7575887951713374), ('up-regulation', 0.7575480623744709), ('calcineurin', 0.7570417966444474), ('Th17', 0.7570044628402458),

## Exercise 4.13 : Document similarity search

In [115]:
IDF = {}
for word in bag_of_words_per_course.flatMap(lambda c : c["description"]).collect():
    if word not in IDF:
        IDF[word] = 1
    else:
        IDF[word] += 1

In [103]:
courses_with_vectors = bag_of_words_per_course.map(lambda c :
                                    {"courseId" : c["courseId"], 
                                     "name" : c["name"],
                                     "vector" : np.sum([data_w2v[word] * TF/IDF[word] 
                                                      for word, TF in Counter(c["description"]).items()], axis=0)})

In [92]:
courses_with_vectors.collect()

[{'courseId': 'MSE-440',
  'name': 'Composites technology',
  'vector': array([ 1.61075556e+00,  5.00087261e-01,  1.15105557e+00,  5.76933980e-01,
         -1.26610816e+00,  1.08220041e+00,  3.51139337e-01,  5.27508616e-01,
         -1.41354531e-01, -5.30667126e-01,  6.33897185e-01, -4.08990145e-01,
         -6.05026066e-01, -4.92788047e-01,  1.01056612e+00, -8.74475300e-01,
         -7.84984946e-01,  1.04300022e+00,  6.76486552e-01, -9.84635055e-01,
         -8.24350119e-01,  7.98302770e-01,  6.31768882e-01,  3.68542880e-01,
         -5.25573075e-01, -3.22221279e-01,  8.08444977e-01,  9.90690589e-02,
         -4.21341121e-01, -5.51106930e-01,  3.06551665e-01, -1.46067786e+00,
          3.30671102e-01, -2.30237588e-01, -2.02339247e-01,  1.52869391e+00,
          2.14724749e-01,  4.60650437e-02, -9.83509004e-01,  1.13641703e+00,
         -9.43968475e-01,  5.24098516e-01,  8.87838423e-01,  5.36891043e-01,
          1.02847554e-01, -4.78180945e-01, -1.66337860e+00,  5.35686053e-02,
      

In [121]:
word_vectors.cosine_similarities(data_w2v["Facebook"], [courses_with_vectors.take(1)[0]["vector"]])[0]

0.17855678

In [100]:
def search_course(courses_with_vectors, word, topn=5):
    new_rdd = courses_with_vectors.sortBy(lambda c:
                                          word_vectors.similarity(word, c["vector"]))
    return new_rdd.take(topn)

In [122]:
tmp = data_w2v["Facebook"]
courses_with_vectors.sortBy(lambda c: word_vectors
                            .cosine_similarities(tmp, [c["vector"]])[0]).take(5)

Py4JJavaError: An error occurred while calling o51.broadcast.
: java.lang.OutOfMemoryError: Java heap space


In [101]:
search_course(courses_with_vectors, "Facebook", topn=5)

Py4JJavaError: An error occurred while calling o51.broadcast.
: java.lang.OutOfMemoryError: Java heap space
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
	at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$3.apply(TorrentBroadcast.scala:286)
	at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$3.apply(TorrentBroadcast.scala:286)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.allocateNewChunkIfNeeded(ChunkedByteBufferOutputStream.scala:87)
	at org.apache.spark.util.io.ChunkedByteBufferOutputStream.write(ChunkedByteBufferOutputStream.scala:75)
	at net.jpountz.lz4.LZ4BlockOutputStream.flushBufferedData(LZ4BlockOutputStream.java:220)
	at net.jpountz.lz4.LZ4BlockOutputStream.write(LZ4BlockOutputStream.java:173)
	at java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1853)
	at java.io.ObjectOutputStream.write(ObjectOutputStream.java:709)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply$mcJ$sp(Utils.scala:351)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:336)
	at org.apache.spark.util.Utils$$anonfun$copyStream$1.apply(Utils.scala:336)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1381)
	at org.apache.spark.util.Utils$.copyStream(Utils.scala:357)
	at org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply$mcJ$sp(PythonRDD.scala:649)
	at org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply(PythonRDD.scala:646)
	at org.apache.spark.api.python.PythonBroadcast$$anonfun$writeObject$1.apply(PythonRDD.scala:646)
	at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1347)
	at org.apache.spark.api.python.PythonBroadcast.writeObject(PythonRDD.scala:646)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at java.io.ObjectStreamClass.invokeWriteObject(ObjectStreamClass.java:1028)
	at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1496)
	at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1432)
	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1178)
	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:348)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:43)
	at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$blockifyObject$2.apply(TorrentBroadcast.scala:291)
	at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$blockifyObject$2.apply(TorrentBroadcast.scala:291)


## Exercise 4.14: Document similarity search with outside terms