In [1]:
import os
import pickle
import pyspark as ps
import numpy as np
from pyspark import SparkContext
from pyspark.sql.functions import col, lower, regexp_replace, split
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec, Word2VecModel
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from nltk.stem.porter import *

## Create Spark context

In [2]:
# Memory settings are registered as system properties ahead of the session
# builder below, i.e. before any SparkContext exists in this kernel.
SparkContext.setSystemProperty('spark.driver.memory', '3g')
SparkContext.setSystemProperty('spark.executor.memory', '3g')

# Local session with four worker threads; getOrCreate() reuses an already
# running session if there is one.
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("word2vec")
    .getOrCreate()
)

## Define helper functions

In [3]:
# Tokenizer: reads the "text" column and writes the token array to "vector".
tokenizer = Tokenizer(inputCol="text", outputCol="vector")

# Stop-word filter: reads "vector" and writes the result to "vector_no_stopw".
remover = StopWordsRemover(inputCol="vector", outputCol="vector_no_stopw")
stopwords = remover.getStopWords()

# Shared Porter stemmer used by stem(); the UDF returns an array of strings.
# The lambda defers the lookup of stem(), which is defined further down.
stemmer = PorterStemmer()
stemmer_udf = udf(lambda tokens: stem(tokens), ArrayType(StringType()))

def tokenize_df(df):
    """Turn a DataFrame with a "text" column into stemmed token arrays.

    The text is cleaned with clean_text(), tokenized, stripped of stop
    words and finally stemmed through stemmer_udf.

    Returns a DataFrame with the single column "vector_stemmed".
    """
    cleaned = df.select(clean_text(col("text")).alias("text"))
    tokens = tokenizer.transform(cleaned).select("vector")
    content_tokens = remover.transform(tokens).select("vector_no_stopw")
    stemmed = content_tokens.withColumn(
        "vector_stemmed", stemmer_udf("vector_no_stopw")
    )
    return stemmed.select("vector_stemmed")

    
def clean_text(c):
  """Normalize a text column before tokenization.

  Steps, in order: lower-case the text, drop a leading "rt " marker,
  drop http/https URLs, then drop every character that is not an ASCII
  letter, digit or whitespace. Tokenization itself is left to the caller.

  c: column expression holding the raw text.
  Returns the transformed column expression.
  """
  c = lower(c)
  c = regexp_replace(c, r"^rt ", "")
  # Raw strings hand the backslashes to the regex engine untouched; the
  # earlier non-raw "\:" and "\S" were invalid Python escape sequences.
  c = regexp_replace(c, r"(https?://)\S+", "")
  c = regexp_replace(c, r"[^a-zA-Z0-9\s]", "")
  return c


def stem(in_vec):
    """Stem each token and keep only the stems longer than two characters.

    in_vec: iterable of token strings.
    Returns a new list of stems in the same order as the input tokens.
    """
    stems = (stemmer.stem(token) for token in in_vec)
    return [s for s in stems if len(s) > 2]

In [4]:
# Small in-memory sample used to exercise the tokenization pipeline.
sample_sentences = [
    "Hi I heard about Spark",
    "I wish Java could use case classes",
    "Logistic regression models are neat",
]
# One single-field row per sentence, exposed under the "text" column.
df = spark.createDataFrame([(sentence,) for sentence in sample_sentences], ["text"])

In [5]:
# Run the sample sentences through the clean -> tokenize -> stop-word ->
# stem pipeline and preview the first two resulting rows.
df_filt = tokenize_df(df)
df_filt.head(2)

[Row(vector_stemmed=['heard', 'spark']),
 Row(vector_stemmed=['wish', 'java', 'use', 'case', 'class'])]

In [6]:
# Directory that holds the language resources, including the saved
# Word2Vec model loaded below.
path = '/Users/andradea/Documents/languages/en_US/'

# os.path.join avoids the doubled separator that `path + "/word2vec-model"`
# produced, since `path` already ends with a slash.
model = Word2VecModel.load(os.path.join(path, "word2vec-model"))

# Preview the learned vocabulary together with its vectors.
model.getVectors().show()

+-------------+--------------------+
|         word|              vector|
+-------------+--------------------+
|     nijinski|[-0.0625451356172...|
|       ciresi|[-0.0682697072625...|
|         koel|[0.00719995377585...|
|        doili|[23.2668304443359...|
|         onam|[0.17379367351531...|
|      rahmani|[-0.0066115041263...|
|    budgetwis|[-0.0064925411716...|
|        dredd|[-1.5024898052215...|
|       gaiden|[0.02443256787955...|
|     autofocu|[-0.2609729170799...|
|     quotient|[-1.1691495180130...|
|   hirschhorn|[-0.0670437887310...|
|     clarissa|[-0.1497923582792...|
|     incident|[7.36339569091796...|
|      holsman|[-0.0202226024121...|
|meteorologist|[-6.8392672538757...|
|       gaslit|[-0.0221786983311...|
|      aikenit|[0.09228715300559...|
|  seventhgrad|[-4.1771936416625...|
|        hetch|[-1.6678887605667...|
+-------------+--------------------+
only showing top 20 rows



In [7]:
# Convert the model's word/vector table into a pandas DataFrame so it can be
# indexed with plain Python below.
word_vector = model.getVectors().toPandas()

In [8]:
# Preview the first rows of the converted vocabulary table.
word_vector.head()

Unnamed: 0,word,vector
0,nijinski,"[-0.06254513561725616, -0.04033434018492699, 0..."
1,ciresi,"[-0.06826970726251602, -0.11408036947250366, 0..."
2,koel,"[0.007199953775852919, -0.04827534779906273, -..."
3,doili,"[23.266830444335938, -64.8271713256836, 7.6823..."
4,onam,"[0.17379367351531982, -0.17130979895591736, -0..."


In [9]:
# word_index: maps each vocabulary word to its row position in word_vector.
# Iterating the column once replaces the per-row .iloc lookups, and the
# positions no longer depend on Series.count(), which leaves out missing
# values and therefore is not the row count.
word_index = {word: idx for idx, word in enumerate(word_vector["word"])}

In [10]:
# embeddings_index: maps each row position in word_vector to that row's
# vector. A single pass over the column replaces the per-row .iloc lookups
# and does not rely on Series.count(), which leaves out missing values.
embeddings_index = dict(enumerate(word_vector["vector"]))

In [11]:
# embedding_matrix: one row per word index plus one extra trailing row.
# Any row that never receives a vector stays all-zeros.
vocab_rows = len(word_index) + 1
vector_dim = len(embeddings_index[0])
embedding_matrix = np.zeros((vocab_rows, vector_dim))
for row in word_index.values():
    vec = embeddings_index.get(row)
    if vec is None:
        # No vector stored for this index: leave the zero row in place.
        continue
    embedding_matrix[row] = vec

In [12]:
# Persist the embedding matrix as comma-separated text.
np.savetxt(
    fname='/Users/andradea/Documents/languages/en_US/embeddings.txt',
    X=embedding_matrix,
    delimiter=",",
)

In [13]:
# Persist the word -> row-position mapping with the newest pickle protocol.
with open('/Users/andradea/Documents/languages/en_US/word_index.pkl', 'wb') as out_file:
    pickle.dump(word_index, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Persist the row-position -> vector mapping with the newest pickle protocol.
with open('/Users/andradea/Documents/languages/en_US/embeddings_index.pkl', 'wb') as out_file:
    pickle.dump(embeddings_index, out_file, protocol=pickle.HIGHEST_PROTOCOL)