In [1]:
# Install Java, Spark, and Findspark
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [1 In0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Get:4 https://developer.download.nvidia.com/comp

In [2]:
# I have yelp 100k dataset loaded into my google drive
# this allows me to access that data from inside spark
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Start a SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("word2vec").getOrCreate()

In [4]:
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml.feature import Word2Vec
import string

schema = StructType([StructField("text", StringType(), True)])

In [5]:
# This URL is on my gdrive only! I loaded it there from a download from the internet
url = "/content/gdrive/MyDrive/Cleaned_Tweets_030821_AGP.csv"
df = spark.read.schema(schema).csv(url, sep=",", header=False)

In [6]:
df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                        |
+----------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                        |
|Australia  Manufacture Covid19 Vaccine  give it   Citizens for free of cost AFP quotes Prime Minister                       |
|CovidVaccine"                                                                                                               |
|CoronavirusVaccine CoronaVaccine CovidVaccine Australia is doing very good                                                  |
|Deaths due  COVID19 in Affected Countries                                                                     

In [7]:
# punctuation removal due to relatively small datasize
def remove_punctuation(txt):
  return "".join(l if l not in string.punctuation else "" for l in txt)

In [8]:
from pyspark.sql.functions import col, udf

remove_punctuation_udf = udf(remove_punctuation, StringType())
remove_punctuation_udf

<function __main__.remove_punctuation>

In [9]:
df = df.withColumn("clean_text", remove_punctuation_udf(col("text")))
df.show()

+--------------------+--------------------+
|                text|          clean_text|
+--------------------+--------------------+
|                text|                text|
|Australia  Manufa...|Australia  Manufa...|
|       CovidVaccine"|        CovidVaccine|
|CoronavirusVaccin...|CoronavirusVaccin...|
|Deaths due  COVID...|Deaths due  COVID...|
|          Read More |          Read More |
|                   "|                    |
|   Stay safe  di ...|   Stay safe  di ...|
|  This is what pa...|  This is what pa...|
|The Multisystem I...|The Multisystem I...|
|               The "|                The |
| Well lets qualif...| Well lets qualif...|
|Most countries wi...|Most countries wi...|
|DNA  zooms up cha...|DNA  zooms up cha...|
|Biocon Executive ...|Biocon Executive ...|
|            its over|            its over|
|Covid19Millionare...|Covid19Millionare...|
|corona CovidVaccine"| corona CovidVaccine|
|Great news s vacc...|Great news s vacc...|
|    Pharmaceutical "|     Pharm

In [10]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml import Pipeline

# "Creating pipeline..."
tokenizer = Tokenizer(inputCol="clean_text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='features')

pipeline = Pipeline(stages=[tokenizer, stopremove])

# "Training model..."
pipeline_stg = pipeline.fit(df)
final_df = pipeline_stg.transform(df)
final_df.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|          clean_text|          token_text|            features|
+--------------------+--------------------+--------------------+--------------------+
|                text|                text|              [text]|              [text]|
|Australia  Manufa...|Australia  Manufa...|[australia, , man...|[australia, , man...|
|       CovidVaccine"|        CovidVaccine|      [covidvaccine]|      [covidvaccine]|
|CoronavirusVaccin...|CoronavirusVaccin...|[coronavirusvacci...|[coronavirusvacci...|
|Deaths due  COVID...|Deaths due  COVID...|[deaths, due, , c...|[deaths, due, , c...|
|          Read More |          Read More |        [read, more]|              [read]|
|                   "|                    |                  []|                  []|
|   Stay safe  di ...|   Stay safe  di ...|[, , , stay, safe...|[, , , stay, safe...|
|  This is what pa...|  This is what pa...|[, , this, 

In [11]:
# in class I had used vectorsize of 200 and max iterations of 2
word2vec = Word2Vec(
    vectorSize=300,
    seed=42,
    inputCol="features",
    outputCol="model"
).setMaxIter(3)
model = word2vec.fit(final_df)

In [12]:
model.getVectors().show(truncate=False)

+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
vecs = model.getVectors()

In [14]:
vecs.filter(vecs["word"] == "incident").show()

+--------+--------------------+
|    word|              vector|
+--------+--------------------+
|incident|[0.14139033854007...|
+--------+--------------------+



In [None]:
model.findSynonymsArray("hope", 50)

[('praying', 0.500312328338623),
 ('hoping', 0.45793357491493225),
 ('brings', 0.4556865692138672),
 ('greet', 0.45250147581100464),
 ('wishing', 0.4514220058917999),
 ('glimmer', 0.44977787137031555),
 ('ray', 0.4484940767288208),
 ('brighter', 0.4479285776615143),
 ('pray', 0.44383904337882996),
 ('sincerely', 0.4395686984062195),
 ('hopefully', 0.4384905695915222),
 ('honest', 0.43100762367248535),
 ('horizon', 0.4261099398136139),
 ('2021year', 0.4175177216529846),
 ('healthier', 0.41611096262931824),
 ('light', 0.40951186418533325),
 ('woods', 0.4037843942642212),
 ('thks', 0.3997972011566162),
 ('hahaha', 0.3987346291542053),
 ('inshaallah', 0.39487749338150024),
 ('goodbye', 0.39442649483680725),
 ('happiness', 0.39382195472717285),
 ('normalcy', 0.39160463213920593),
 ('hopeful', 0.3908766806125641),
 ('taste', 0.386140376329422),
 ('tunnel', 0.3825678825378418),
 ('wishes', 0.37954065203666687),
 ('feelings', 0.37901991605758667),
 ('wish', 0.3779437839984894),
 ('rocking', 0.

In [None]:
!pip install wordcloud



In [None]:
from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

In [None]:
moderna = model.findSynonymsArray("moderna", 100)
moderna

[('comirnaty', 0.6570561528205872),
 ('pfizerbiontech', 0.641274094581604),
 ('moderns', 0.6307011246681213),
 ('pfizervaccine', 0.6121554970741272),
 ('pfizercovidvaccine', 0.6095624566078186),
 ('modernavaccine', 0.5773386359214783),
 ('pfizer', 0.5756853818893433),
 ('modena', 0.5601515173912048),
 ('daycovidvaccine', 0.5587019324302673),
 ('notthrowingawaymyshot', 0.5530538558959961),
 ('post2nd', 0.5372170805931091),
 ('secondvaccine', 0.5257701277732849),
 ('wahoo', 0.5237321257591248),
 ('501pharmacy', 0.5215190052986145),
 ('implanted', 0.5143691301345825),
 ('2d', 0.5085158348083496),
 ('woohoo', 0.5054645538330078),
 ('thankyouscience', 0.5020851492881775),
 ('delaying', 0.5008731484413147),
 ('dosage', 0.5007596611976624),
 ('covidvac', 0.49036848545074463),
 ('modernas', 0.4900924861431122),
 ('covidvaccinated', 0.479920893907547),
 ('novovax', 0.47923246026039124),
 ('seconddose', 0.47578322887420654),
 ('stickittocovid', 0.4742739796638489),
 ('effica', 0.47414630651474),

In [None]:
pfizer = model.findSynonymsArray("pfizer", 100)
pfizer

[('pfizerbiontech', 0.6519420742988586),
 ('comirnaty', 0.6002109050750732),
 ('moderna', 0.5756853222846985),
 ('novovax', 0.5501601099967957),
 ('moderns', 0.5413220524787903),
 ('secondvaccine', 0.5261573791503906),
 ('bnt162b2', 0.5107253193855286),
 ('pfizervaccine', 0.4895603060722351),
 ('delaying', 0.483703076839447),
 ('pfizercovidvaccine', 0.48358893394470215),
 ('azoxford', 0.47107115387916565),
 ('effica', 0.46923503279685974),
 ('identical', 0.4667728543281555),
 ('65s', 0.4656617343425751),
 ('modernavaccine', 0.4652892053127289),
 ('modena', 0.4638868570327759),
 ('dosage', 0.4585341513156891),
 ('daycovidvaccine', 0.45676228404045105),
 ('substantially', 0.45426279306411743),
 ('ox', 0.45203855633735657),
 ('primer', 0.4502487778663635),
 ('phizer', 0.448318749666214),
 ('zenica', 0.4481217861175537),
 ('2d', 0.4477503001689911),
 ('astrazenecavaccine', 0.4467598497867584),
 ('biontech', 0.4428704082965851),
 ('covidvac', 0.431173712015152),
 ('twodose', 0.4275698661804

In [None]:
johnson = model.findSynonymsArray("johnson", 100)
johnson

[('johnsons', 0.8292554020881653),
 ('singleshot', 0.7203711867332458),
 ('singledose', 0.6794792413711548),
 ('oneshot', 0.6666111350059509),
 ('1shot', 0.643659234046936),
 ('onedose', 0.6319281458854675),
 ('boris', 0.5905872583389282),
 ('authorizes', 0.5809741020202637),
 ('pauses', 0.5757165551185608),
 ('ralph', 0.5592656135559082),
 ('janssen', 0.552723228931427),
 ('jampj', 0.5465896129608154),
 ('1dose', 0.5337861180305481),
 ('farage', 0.5323960781097412),
 ('stanley', 0.525837779045105),
 ('lilly', 0.5088249444961548),
 ('jampjs', 0.4949971139431),
 ('submits', 0.48676860332489014),
 ('advisers', 0.46180686354637146),
 ('effectcovidvaccine', 0.46108922362327576),
 ('endorses', 0.46092545986175537),
 ('eli', 0.45773378014564514),
 ('applied', 0.4577096998691559),
 ('plots', 0.4461324214935303),
 ('subsidiary', 0.4440677762031555),
 ('johnsonjohnson', 0.4402426779270172),
 ('vot', 0.4350687861442566),
 ('jnj', 0.43490445613861084),
 ('fda', 0.4321689307689667),
 ('paused', 0.

In [None]:
hope = model.findSynonymsArray("hope", 100)
hope

[('praying', 0.500312328338623),
 ('hoping', 0.45793357491493225),
 ('brings', 0.4556865692138672),
 ('greet', 0.45250147581100464),
 ('wishing', 0.4514220058917999),
 ('glimmer', 0.44977787137031555),
 ('ray', 0.4484940767288208),
 ('brighter', 0.4479285776615143),
 ('pray', 0.44383904337882996),
 ('sincerely', 0.4395686984062195),
 ('hopefully', 0.4384905695915222),
 ('honest', 0.43100762367248535),
 ('horizon', 0.4261099398136139),
 ('2021year', 0.4175177216529846),
 ('healthier', 0.41611096262931824),
 ('light', 0.40951186418533325),
 ('woods', 0.4037843942642212),
 ('thks', 0.3997972011566162),
 ('hahaha', 0.3987346291542053),
 ('inshaallah', 0.39487749338150024),
 ('goodbye', 0.39442649483680725),
 ('happiness', 0.39382195472717285),
 ('normalcy', 0.39160463213920593),
 ('hopeful', 0.3908766806125641),
 ('taste', 0.386140376329422),
 ('tunnel', 0.3825678825378418),
 ('wishes', 0.37954065203666687),
 ('feelings', 0.37901991605758667),
 ('wish', 0.3779437839984894),
 ('rocking', 0.

In [None]:
good = model.findSynonymsArray("good", 100)
good

[('great', 0.5856936573982239),
 ('bad', 0.5721865892410278),
 ('exciting', 0.51589435338974),
 ('newsbulletin', 0.4845781624317169),
 ('depressing', 0.4680946171283722),
 ('miserable', 0.45494866371154785),
 ('sky', 0.4437854588031769),
 ('bleak', 0.44320499897003174),
 ('surprisingly', 0.43422359228134155),
 ('awful', 0.433883398771286),
 ('bbc', 0.4329426884651184),
 ('presenter', 0.4284595549106598),
 ('cbc', 0.4252866208553314),
 ('nice', 0.42363297939300537),
 ('eunews', 0.4121701717376709),
 ('weary', 0.4093440771102905),
 ('gma', 0.40671616792678833),
 ('annoying', 0.4060322046279907),
 ('slightest', 0.40406790375709534),
 ('fantastic', 0.40390095114707947),
 ('goo', 0.4011977016925812),
 ('escilo', 0.4008265435695648),
 ('heartening', 0.4007836878299713),
 ('closeups', 0.3994161784648895),
 ('smach', 0.39906638860702515),
 ('epic', 0.39747923612594604),
 ('terrific', 0.39190807938575745),
 ('nubit', 0.3915402293205261),
 ('fabulous', 0.39029809832572937),
 ('imo', 0.3901559412

In [None]:
future = model.findSynonymsArray("future", 100)
future

[('poss', 0.49294596910476685),
 ('improvements', 0.4632216691970825),
 ('pemi', 0.43426916003227234),
 ('thrive', 0.426118940114975),
 ('defeat', 0.4218570590019226),
 ('ray', 0.4009418189525604),
 ('establish', 0.4006577432155609),
 ('beacon', 0.3996603786945343),
 ('returning', 0.3988704979419708),
 ('anytime', 0.3953809440135956),
 ('festive', 0.3907492756843567),
 ('compassion', 0.39061158895492554),
 ('foreseeable', 0.383004367351532),
 ('solidarity', 0.3825984597206116),
 ('sankranti', 0.38122251629829407),
 ('norm', 0.381041556596756),
 ('midst', 0.38063696026802063),
 ('relevance', 0.37894487380981445),
 ('everyo', 0.37874096632003784),
 ('savin', 0.37832385301589966),
 ('leftists', 0.37578701972961426),
 ('strive', 0.3756678104400635),
 ('prosperity', 0.37395766377449036),
 ('partial', 0.37342947721481323),
 ('buck', 0.3706914186477661),
 ('horizon', 0.37058529257774353),
 ('strengn', 0.3703593909740448),
 ('saf', 0.3674931824207306),
 ('contribute', 0.36740660667419434),
 ('

In [None]:
science = model.findSynonymsArray("science", 100)
science

[('provaccination', 0.4811190068721771),
 ('tee', 0.47122031450271606),
 ('igottheshot', 0.4462447464466095),
 ('scienceiscool', 0.4297068417072296),
 ('scienceisreal', 0.4268375337123871),
 ('scie', 0.4095132052898407),
 ('scientists', 0.4060291349887848),
 ('sciencematters', 0.4017610251903534),
 ('truly', 0.3969116508960724),
 ('communicar', 0.3880435824394226),
 ('scien', 0.38496917486190796),
 ('strongertoger', 0.3802069425582886),
 ('modern', 0.37446480989456177),
 ('scicomm', 0.37011468410491943),
 ('ingenuity', 0.3672982156276703),
 ('igotshot', 0.3651127815246582),
 ('vaccinessavelives', 0.3615151047706604),
 ('lauren', 0.35869330167770386),
 ('regain', 0.35825711488723755),
 ('thankful', 0.35805976390838623),
 ('goodness', 0.35148149728775024),
 ('fiction', 0.3514264225959778),
 ('familys', 0.3414199948310852),
 ('god', 0.34011372923851013),
 ('humbled', 0.34006646275520325),
 ('nursetwitter', 0.3386535942554474),
 ('imvaccinated', 0.337586909532547),
 ('grateful', 0.33691203

In [None]:
holiday = model.findSynonymsArray("holiday", 100)
holiday

[('wished', 0.48210036754608154),
 ('ticket', 0.4763231873512268),
 ('imagined', 0.4730503559112549),
 ('pin', 0.4717254936695099),
 ('movie', 0.46880045533180237),
 ('shirt', 0.4631127715110779),
 ('memories', 0.4619933068752289),
 ('valentinesday', 0.4604165554046631),
 ('holidays', 0.45675233006477356),
 ('newbeginnings', 0.4559170603752136),
 ('mcdonalds', 0.4476793110370636),
 ('gang', 0.4457070827484131),
 ('prosperous', 0.4430850148200989),
 ('ruin', 0.4407576322555542),
 ('realized', 0.4359463155269623),
 ('hoorah', 0.43545982241630554),
 ('cake', 0.4346703588962555),
 ('xmas', 0.434628427028656),
 ('semester', 0.43088123202323914),
 ('phography', 0.4301864802837372),
 ('dreaming', 0.4300714135169983),
 ('celebrate', 0.4280588924884796),
 ('mountains', 0.42542049288749695),
 ('drama', 0.42539775371551514),
 ('garings', 0.4237496554851532),
 ('christmas', 0.4202021360397339),
 ('rocking', 0.41852113604545593),
 ('wishes', 0.41664665937423706),
 ('bread', 0.4163958430290222),
 ('

In [None]:
alone = model.findSynonymsArray("alone", 100)
alone

[('dumbass', 0.48141735792160034),
 ('unnecessarily', 0.459983229637146),
 ('sink', 0.45474812388420105),
 ('leaving', 0.45267078280448914),
 ('bac', 0.43811917304992676),
 ('terrified', 0.4367775321006775),
 ('nobody', 0.4299369752407074),
 ('bored', 0.423705518245697),
 ('households', 0.42276984453201294),
 ('guessed', 0.4227248430252075),
 ('cali', 0.42262691259384155),
 ('socialize', 0.4171445071697235),
 ('kno', 0.4149831533432007),
 ('traveling', 0.4144546389579773),
 ('100k', 0.4139240086078644),
 ('gods', 0.4117472171783447),
 ('morons', 0.40860095620155334),
 ('guards', 0.40625444054603577),
 ('ditch', 0.40613898634910583),
 ('wanna', 0.4061344861984253),
 ('aren', 0.40580689907073975),
 ('shy', 0.4054259955883026),
 ('soc', 0.4038163423538208),
 ('400k', 0.4029030501842499),
 ('eat', 0.4018934667110443),
 ('fits', 0.4014336168766022),
 ('cuomos', 0.39981430768966675),
 ('indoors', 0.3989150822162628),
 ('ruin', 0.3987533450126648),
 ('ontarians', 0.3972642421722412),
 ('alarm

In [None]:
vaccine = model.findSynonymsArray("vaccine", 100)
vaccine

[('vacc', 0.4709358215332031),
 ('vaccines', 0.3917021155357361),
 ('vacci', 0.3789628744125366),
 ('covidvaccine', 0.3637239634990692),
 ('vaccin', 0.3484173119068146),
 ('appropriat', 0.3414127826690674),
 ('vac', 0.3318595588207245),
 ('jab', 0.3305227756500244),
 ('vaccination', 0.3293522596359253),
 ('throug', 0.3088783919811249),
 ('1stdose', 0.302866131067276),
 ('mychart', 0.2994568943977356),
 ('curative', 0.2980199158191681),
 ('2621', 0.297376811504364),
 ('appointmen', 0.29062628746032715),
 ('arranged', 0.2888718843460083),
 ('sandiego', 0.2851578891277313),
 ('sicker', 0.28472551703453064),
 ('tues', 0.28319641947746277),
 ('anticipate', 0.27911344170570374),
 ('feelgoodfriday', 0.27716583013534546),
 ('allowance', 0.2742919921875),
 ('javits', 0.27378347516059875),
 ('830', 0.26915672421455383),
 ('vacinne', 0.2687000632286072),
 ('onl', 0.26733341813087463),
 ('chal', 0.2653219401836395),
 ('danvers', 0.2642526626586914),
 ('225', 0.26044198870658875),
 ('yemen', 0.2604

In [None]:
scary = model.findSynonymsArray("scary", 100)
scary

[('fishy', 0.5966261625289917),
 ('spin', 0.5943763256072998),
 ('movies', 0.5915853381156921),
 ('stir', 0.5858093500137329),
 ('shock', 0.5845569968223572),
 ('opposite', 0.5844617486000061),
 ('depressing', 0.5834929943084717),
 ('fk', 0.5746005773544312),
 ('ranting', 0.5710521340370178),
 ('somewhere', 0.5650080442428589),
 ('actu', 0.5631411075592041),
 ('idiotic', 0.5604608654975891),
 ('couldve', 0.5584253668785095),
 ('shitting', 0.5503214001655579),
 ('inconvenient', 0.5482550263404846),
 ('trair', 0.5481576323509216),
 ('hmm', 0.5476829409599304),
 ('reinforcing', 0.5471763610839844),
 ('unfortunate', 0.5437905192375183),
 ('describing', 0.543411135673523),
 ('script', 0.5427531003952026),
 ('kinds', 0.5411144495010376),
 ('blessing', 0.5397617816925049),
 ('swineflu', 0.5368492007255554),
 ('laugh', 0.5353692770004272),
 ('entertaining', 0.5343003273010254),
 ('disheartening', 0.5326311588287354),
 ('headline', 0.5309947729110718),
 ('digging', 0.5303161144256592),
 ('absur

In [None]:
dead = model.findSynonymsArray("dead", 100)
dead

[('wound', 0.6373633146286011),
 ('stab', 0.6010900735855103),
 ('400k', 0.5433568358421326),
 ('spthebloodbath', 0.5037172436714172),
 ('202', 0.4964887797832489),
 ('december', 0.4963536560535431),
 ('137', 0.4916656017303467),
 ('killed', 0.45407551527023315),
 ('fainted', 0.44740381836891174),
 ('found', 0.44181913137435913),
 ('gibraltar', 0.4402710795402527),
 ('norway', 0.42367061972618103),
 ('vaccinedeath', 0.4220825135707855),
 ('grew', 0.4205002188682556),
 ('46', 0.4196164011955261),
 ('died', 0.4158477783203125),
 ('181', 0.4155276119709015),
 ('unusual', 0.4139809310436249),
 ('forcibly', 0.4101381003856659),
 ('53', 0.40900731086730957),
 ('28yearold', 0.4062897562980652),
 ('hankaaron', 0.406208872795105),
 ('swiss', 0.404727578163147),
 ('marathon', 0.40194037556648254),
 ('21', 0.40163129568099976),
 ('400000', 0.3982754349708557),
 ('graves', 0.3921254873275757),
 ('waterhouse', 0.3907744288444519),
 ('lap', 0.3890190124511719),
 ('fallen', 0.38762009143829346),
 ('b

In [None]:
death = model.findSynonymsArray("death", 100)
death

[('mortality', 0.5292860865592957),
 ('rises', 0.5261727571487427),
 ('labeled', 0.5158005952835083),
 ('329', 0.5151643753051758),
 ('hospitalisations', 0.513367235660553),
 ('ll', 0.5110110640525818),
 ('suspected', 0.5059399008750916),
 ('653', 0.5044009685516357),
 ('deaths', 0.5016180872917175),
 ('fatality', 0.49678486585617065),
 ('hospitalizations', 0.4953637421131134),
 ('501', 0.4901556968688965),
 ('22012021', 0.4872989058494568),
 ('norway', 0.48688289523124695),
 ('causes', 0.4862525463104248),
 ('incidence', 0.48567184805870056),
 ('complications', 0.484700083732605),
 ('cause', 0.48402246832847595),
 ('latent', 0.47883540391921997),
 ('shortly', 0.4752933084964752),
 ('survival', 0.4710102677345276),
 ('injuries', 0.47050559520721436),
 ('fudged', 0.46895164251327515),
 ('positivity', 0.4689134359359741),
 ('sever', 0.4667021632194519),
 ('unrelated', 0.46435198187828064),
 ('018', 0.4638267755508423),
 ('143', 0.4595738351345062),
 ('hospitalization', 0.4595073759555816

In [None]:
shot = model.findSynonymsArray("shot", 100)
shot

[('dose', 0.6954872012138367),
 ('hubby', 0.5677440762519836),
 ('jag', 0.5668014287948608),
 ('momma', 0.5578009486198425),
 ('round', 0.5366305112838745),
 ('vacinne', 0.5289687514305115),
 ('dosage', 0.5208116173744202),
 ('jab', 0.5170002579689026),
 ('poked', 0.5123833417892456),
 ('implanted', 0.5122433304786682),
 ('doze', 0.5081467032432556),
 ('woohoo', 0.5067422986030579),
 ('knocked', 0.5033774375915527),
 ('wahoo', 0.5019847750663757),
 ('24hours', 0.49884462356567383),
 ('daddy', 0.49304088950157166),
 ('seco', 0.4893454909324646),
 ('fauciouchie', 0.4867803752422333),
 ('dosis', 0.48235636949539185),
 ('yay', 0.47924453020095825),
 ('nanny', 0.47521689534187317),
 ('24hrs', 0.47028887271881104),
 ('jelly', 0.46591538190841675),
 ('48hrs', 0.4623539447784424),
 ('dads', 0.4589857757091522),
 ('hooray', 0.4579240679740906),
 ('mine', 0.45679771900177),
 ('poke', 0.4534945785999298),
 ('stabbed', 0.4522772431373596),
 ('bellletstalk', 0.45115968585014343),
 ('yeste', 0.44737

In [None]:
lucky = model.findSynonymsArray("lucky", 100)
lucky

[('fortunate', 0.7257573008537292),
 ('blessed', 0.6327465176582336),
 ('privileged', 0.6174687743186951),
 ('humbled', 0.5514963269233704),
 ('thankful', 0.5457369685173035),
 ('relieved', 0.5430518984794617),
 ('chuffed', 0.533189058303833),
 ('grateful', 0.5253987312316895),
 ('incredibly', 0.5093411803245544),
 ('elated', 0.5053091645240784),
 ('greatful', 0.5048177242279053),
 ('grat', 0.4840155243873596),
 ('appreciative', 0.48348110914230347),
 ('understatement', 0.48209521174430847),
 ('immensely', 0.4802990257740021),
 ('overjoyed', 0.4787628650665283),
 ('thrilled', 0.4766072928905487),
 ('snowed', 0.4745960831642151),
 ('gratefu', 0.46972355246543884),
 ('lighter', 0.46811196208000183),
 ('sked', 0.4671446681022644),
 ('unbelievably', 0.46139654517173767),
 ('sooooo', 0.46001940965652466),
 ('weirdly', 0.45979323983192444),
 ('proud', 0.4595033824443817),
 ('honored', 0.45317214727401733),
 ('delighted', 0.4529569149017334),
 ('overwhelmed', 0.4506203532218933),
 ('buzzing',

In [None]:
wordcloud = WordCloud().generate(alone)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

TypeError: ignored

In [None]:
w2v = model.transform(final_df)

In [None]:
w2v.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|          clean_text|          token_text|            features|               model|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|                text|              [text]|              [text]|[0.22215782105922...|
|Australia  Manufa...|Australia  Manufa...|[australia, , man...|[australia, , man...|[0.00505105259799...|
|       CovidVaccine"|        CovidVaccine|      [covidvaccine]|      [covidvaccine]|[-0.1073507741093...|
|CoronavirusVaccin...|CoronavirusVaccin...|[coronavirusvacci...|[coronavirusvacci...|[0.01087360344827...|
|Deaths due  COVID...|Deaths due  COVID...|[deaths, due, , c...|[deaths, due, , c...|[-0.0538873793557...|
|          Read More |          Read More |        [read, more]|              [read]|[-0.0483554415404...|
|                   "|               

In [None]:
w2v_clustering = w2v.select(
    "text", "model"
).withColumnRenamed("model", "features")

In [None]:
w2v_clustering.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.ml.feature import PCA
from pyspark.mllib.linalg import Vectors

pca = PCA(k=10, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(w2v_clustering)
result = model.transform(w2v_clustering)
result.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
result = result.select("text", "pcaFeatures")
w2v_clustering = result.withColumnRenamed("pcaFeatures", "features")
w2v_clustering.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                        |features                                                                                                                                                                                                       |
+----------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Trains a k-means model.
kmeans = KMeans().setK(7).setSeed(1)
km_model = kmeans.fit(w2v_clustering)

In [None]:
# Make predictions
predictions = km_model.transform(w2v_clustering)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator(distanceMeasure="cosine")


# silhouette score goes between -1, 1. 1 is better. Negative is bad
silhouette = evaluator.evaluate(predictions)
print("Silhouette with cosine distance = " + str(silhouette))

Silhouette with cosine distance = 0.16940588027118214


In [None]:
predictions.show()

+--------------------+--------------------+----------+
|                text|            features|prediction|
+--------------------+--------------------+----------+
|                text|[0.03122598254819...|         5|
|Australia  Manufa...|[0.09738639400465...|         4|
|       CovidVaccine"|[2.03635943364206...|         0|
|CoronavirusVaccin...|[0.69215245734151...|         3|
|Deaths due  COVID...|[0.19427422038203...|         4|
|          Read More |[-0.0210384173841...|         6|
|                   "|[0.0,0.0,0.0,0.0,...|         4|
|   Stay safe  di ...|[0.12410655136653...|         2|
|  This is what pa...|[-0.0643753518830...|         6|
|The Multisystem I...|[0.05338584554348...|         4|
|               The "|[0.0,0.0,0.0,0.0,...|         4|
| Well lets qualif...|[0.19085444089189...|         6|
|Most countries wi...|[-5.7805939870240...|         6|
|DNA  zooms up cha...|[0.29049361985843...|         2|
|Biocon Executive ...|[-0.0819690634713...|         4|
|         

In [None]:
predictions.filter("prediction=5").show(100, truncate=False)

+----------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+
|text                                                                                                            |features                                                                                                                                                                                                           |prediction|
+----------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------