In [1]:
from pyspark.ml.feature import Word2VecModel
from pyspark.sql import functions as F

In [None]:
# Base directory for this project's data and models; every path used in the
# notebook is built from it.
ROOT = "/common/users/shared/cs543_fall22_group3"

In [2]:
# Read the JSON dataset stored under ROOT into a DataFrame and preview it.
# The f-prefix is required: without it `{ROOT}` is not interpolated and Spark
# is asked to read the literal path '{ROOT}/combined/combined_processed'.
processed_df = spark.read.json(f'{ROOT}/combined/combined_processed')
processed_df.show()

+--------------------+--------------------+
|        cleaned_text|       selected_text|
+--------------------+--------------------+
|london,financial,...|London's financia...|
|the,department,la...|The Department of...|
|mayor,gov,plan,sh...|MAYOR: GOV’S PLAN...|
|girardi,yankees,c...|Girardi on Yankee...|
|the,young,turks,h...|The Young Turks H...|
|emerge,market,inc...|Emerging markets ...|
|fool,try,develop,...|A Fool tries to d...|
|the,comedy,write,...|The comedy, writt...|
|another,call,rewr...|Another Call to R...|
|blake,shelton,gwe...|Blake Shelton & G...|
|read,nico,branham...|Read Nico Branham...|
|plus,xero,boss,re...|PLUS: Xero boss r...|
|awka-the,claim,go...|AWKA-THE claim by...|
|though,subways,bu...|Though subways an...|
|forget,not,christ...|‘Forgotten’? Not ...|
|leader,opposition...|Leader of the opp...|
|natalia,kill,say,...|Natalia Kills say...|
|superior,fall,2-1...|Superior falls to...|
|thom,patterson,cn...|By Thom Patterson...|
|finance,minister,...|EU finance

In [3]:
# Total number of rows in the loaded DataFrame (displayed as the cell output).
processed_df.count()

40954102

In [4]:
# Turn the comma-separated `cleaned_text` string into an array of tokens.
# This cell rebinds `processed_df`, so guard on the column's current type:
# when the cell is re-run the column is already an array, and applying
# F.split to it again would fail instead of being a no-op.
if dict(processed_df.dtypes)['cleaned_text'] == 'string':
    processed_df = processed_df.withColumn(
        'cleaned_text', F.split(F.col('cleaned_text'), ',')
    )
processed_df.show()

+--------------------+--------------------+
|        cleaned_text|       selected_text|
+--------------------+--------------------+
|[london, financia...|London's financia...|
|[the, department,...|The Department of...|
|[mayor, gov, plan...|MAYOR: GOV’S PLAN...|
|[girardi, yankees...|Girardi on Yankee...|
|[the, young, turk...|The Young Turks H...|
|[emerge, market, ...|Emerging markets ...|
|[fool, try, devel...|A Fool tries to d...|
|[the, comedy, wri...|The comedy, writt...|
|[another, call, r...|Another Call to R...|
|[blake, shelton, ...|Blake Shelton & G...|
|[read, nico, bran...|Read Nico Branham...|
|[plus, xero, boss...|PLUS: Xero boss r...|
|[awka-the, claim,...|AWKA-THE claim by...|
|[though, subways,...|Though subways an...|
|[forget, not, chr...|‘Forgotten’? Not ...|
|[leader, oppositi...|Leader of the opp...|
|[natalia, kill, s...|Natalia Kills say...|
|[superior, fall, ...|Superior falls to...|
|[thom, patterson,...|By Thom Patterson...|
|[finance, ministe...|EU finance

In [5]:
# Load the Word2Vec model that was saved under ROOT/models.
word2vec_path = f'{ROOT}/models/word2vec'
word2vec_model = Word2VecModel.load(word2vec_path)

In [6]:
# Apply the loaded model to the token arrays only (`selected_text` is left
# out of the projection), then preview the resulting DataFrame.
# NOTE(review): presumably the model's input column is `cleaned_text` - confirm
# against how the model was trained.
tokens_df = processed_df.select('cleaned_text')
vectorized_df = word2vec_model.transform(tokens_df)
vectorized_df.show()

+--------------------+--------------------+
|        cleaned_text|      output_vectors|
+--------------------+--------------------+
|[london, financia...|[-0.5109536331146...|
|[the, department,...|[-0.3147776111887...|
|[mayor, gov, plan...|[-5.5029392242431...|
|[girardi, yankees...|[0.12310679753621...|
|[the, young, turk...|[-0.2074893119434...|
|[emerge, market, ...|[-0.4438817477361...|
|[fool, try, devel...|[-0.2903663915349...|
|[the, comedy, wri...|[0.14530817537822...|
|[another, call, r...|[-0.0227802122632...|
|[blake, shelton, ...|[0.2999736661076895]|
|[read, nico, bran...|[0.0738221023763929]|
|[plus, xero, boss...|[-0.2142722297992...|
|[awka-the, claim,...|[-0.0676146328735...|
|[though, subways,...|[-0.0927275374531...|
|[forget, not, chr...|[0.20289693609811...|
|[leader, oppositi...|[-0.0717920884490...|
|[natalia, kill, s...|[0.05694128102105...|
|[superior, fall, ...|[-0.0910834123690...|
|[thom, patterson,...|[-0.2146621572605...|
|[finance, ministe...|[-0.493275

In [7]:
# Print the column names and types of the transformed DataFrame.
vectorized_df.printSchema()

root
 |-- cleaned_text: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- output_vectors: vector (nullable = true)



In [None]:
# Persist the transformed DataFrame as JSON under ROOT, replacing any
# previous result at the same location.
# NOTE(review): `output_vectors` is a vector-typed column - confirm that its
# JSON representation is what downstream readers expect.
vectors_out_path = f'{ROOT}/combined/combined_vectors_result'
(
    vectorized_df.write
    .mode('Overwrite')
    .json(vectors_out_path)
)