In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import pyspark as ps
from pyspark import SparkContext
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import col, lower, regexp_replace, split

In [2]:
SparkContext.setSystemProperty('spark.driver.memory', '3g')
SparkContext.setSystemProperty('spark.executor.memory', '3g')

spark = ps.sql.SparkSession.builder \
            .master("local[4]") \
            .appName("word2vec") \
            .getOrCreate()

In [3]:
def clean_text(c):
    c = lower(c)
    c = regexp_replace(c, "^rt ", "")
    c = regexp_replace(c, "(https?\://)\S+", "")
    c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
#     c = regexp_replace(c, "[0-9]", "")
    c = split(c, "\\s+") # tokenization...
    return c

In [4]:
path = '/Users/andradea/Documents/cr-engine/datasets/ASAP/'
files = os.listdir(path)
files = [f for f in files if 'CR' in f and '.csv' in f]

In [5]:
file = files[0]
df = spark.read.csv(path + file, header=True, inferSchema=True)

In [6]:
df = df.select(clean_text(col("response")).alias("response"))

In [7]:
df.show(5)

+--------------------+
|            response|
+--------------------+
|[white, a, white,...|
|[dark, gray, i, w...|
|[black, black, be...|
|[white, painting,...|
|[dark, gray, i, t...|
+--------------------+
only showing top 5 rows



In [8]:
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=150, minCount=1, numPartitions=400, stepSize=0.025, maxIter=10, seed=42, 
                    windowSize=5, maxSentenceLength=1000, inputCol="response", outputCol="result")
model = word2Vec.fit(df)
model.getVectors().show()

+--------+--------------------+
|    word|              vector|
+--------+--------------------+
|    rate|[0.68907362222671...|
|      45|[-895507.4375,-92...|
|    rage|[-18434.92578125,...|
| absorbs|[-1.353594175488E...|
|  egrees|[-0.2343023121356...|
|     est|[118.70458984375,...|
|    ound|[566.875793457031...|
|perature|[-12.729274749755...|
|    used|[-1.136143488E9,9...|
|     eye|[0.33923476934432...|
|averages|[3102.23510742187...|
|  rature|[0.54148471355438...|
|       e|[1.32436202409164...|
| obsorbs|[-0.3662510812282...|
|     snt|[-0.8332204818725...|
|      se|[-9.819492E7,-2.7...|
|    down|[25409.896484375,...|
|  doghou|[-134.966796875,2...|
|    side|[-6.2198796E7,-2....|
|    4143|[0.58533424139022...|
+--------+--------------------+
only showing top 20 rows



In [9]:
word_vector = model.getVectors().toPandas()

In [10]:
# word_index
word_index = {}
for i in range(word_vector.word.count()):
    word_index[word_vector.iloc[i, 0]] = i

In [11]:
# embedding_index
embeddings_index = {}
for i in range(word_vector.vector.count()):
    embeddings_index[i] = word_vector.iloc[i, 1]

In [12]:
# embedding_matrix
embedding_matrix = np.zeros((len(word_index) + 1, len(embeddings_index[0])))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(i)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [13]:
path_save = '/Users/andradea/Documents/GitHub/CRAIS/Embeddings/'

with open(path_save + 'word_index.pkl', 'wb') as f:
    pickle.dump(word_index, f, pickle.HIGHEST_PROTOCOL)

with open(path_save + 'embeddings_index.pkl', 'wb') as f:
    pickle.dump(embeddings_index, f, pickle.HIGHEST_PROTOCOL)