# Embedding Creation

## Import libraries

In [1]:
import os
import pyspark as ps
from pyspark import SparkContext
from pyspark.sql.functions import col, lower, regexp_replace, split
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from nltk.stem.porter import *

## Create Spark context

In [2]:
SparkContext.setSystemProperty('spark.driver.memory', '3g')
SparkContext.setSystemProperty('spark.executor.memory', '3g')

spark = ps.sql.SparkSession.builder \
            .master("local[4]") \
            .appName("word2vec") \
            .getOrCreate()

## Load and subset data

In [3]:
path = '/Users/andradea/Documents/languages/en_US/'
files = os.listdir(path)
files = [f for f in files if f.startswith('en') == True and 'twitter' not in f]
print(files)

df = spark.createDataFrame([['']])

for file in files:
    df = df.union(spark.read.text(path + file))
    
# df = df.sample(withReplacement=False, fraction=0.75, seed=42)

old_col = df.schema.names[0]
df = df.selectExpr(old_col + ' as text')

print('total number of lines in df:', df.count())

['en_US.blogs.txt', 'en_US.news.txt']
total number of lines in df: 1909531


In [4]:
tokenizer = Tokenizer(inputCol="text", outputCol="vector")
remover = StopWordsRemover()
remover.setInputCol("vector")
remover.setOutputCol("vector_no_stopw")
stopwords = remover.getStopWords()
stemmer = PorterStemmer()
stemmer_udf = udf(lambda x: stem(x), ArrayType(StringType()))

def tokenize_df(df):    
    df = df.select(clean_text(col("text")).alias("text"))
    df = tokenizer.transform(df).select("vector")
    df = remover.transform(df).select("vector_no_stopw")
    df = (df
        .withColumn("vector_stemmed", stemmer_udf("vector_no_stopw"))
        .select("vector_stemmed")
        )
    
    return df

    
def clean_text(c):
    c = lower(c)
    c = regexp_replace(c, "^rt ", "")
    c = regexp_replace(c, "(https?\://)\S+", "")
    c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
    c = regexp_replace(c, "[0-9]", "")
  #c = split(c, "\\s+") tokenization...
    return c


def stem(in_vec):
    out_vec = []
    for t in in_vec:
        t_stem = stemmer.stem(t)
        if len(t_stem) > 2:
            out_vec.append(t_stem)       
    return out_vec

In [5]:
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=150, minCount=5, numPartitions=400, stepSize=0.025, maxIter=1, seed=42, 
                    windowSize=5, maxSentenceLength=1000, inputCol="vector_stemmed", outputCol="result")

df = tokenize_df(df)
model = word2Vec.fit(df)
model.getVectors().show()

+-------------+--------------------+
|         word|              vector|
+-------------+--------------------+
|     nijinski|[-0.0625451356172...|
|       ciresi|[-0.0682697072625...|
|         koel|[0.00719995377585...|
|        doili|[23.2668304443359...|
|         onam|[0.17379367351531...|
|      rahmani|[-0.0066115041263...|
|    budgetwis|[-0.0064925411716...|
|        dredd|[-1.5024898052215...|
|       gaiden|[0.02443256787955...|
|     autofocu|[-0.2609729170799...|
|     quotient|[-1.1691495180130...|
|   hirschhorn|[-0.0670437887310...|
|     clarissa|[-0.1497923582792...|
|     incident|[7.36339569091796...|
|      holsman|[-0.0202226024121...|
|meteorologist|[-6.8392672538757...|
|       gaslit|[-0.0221786983311...|
|      aikenit|[0.09228715300559...|
|  seventhgrad|[-4.1771936416625...|
|        hetch|[-1.6678887605667...|
+-------------+--------------------+
only showing top 20 rows



In [6]:
model.save(path + "/word2vec-model")