# Importing all the important libraries

In [None]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below (named 'Coronavirus').
spark = (
    SparkSession.builder
    .appName("Coronavirus")
    .getOrCreate()
)

In [None]:
# Load the training tweets, taking column names from the header row and
# letting Spark infer column types.
# Tweets may contain line breaks inside quoted fields. Without multiLine=True
# Spark treats every physical line as a record and splits such a tweet into
# several bogus rows (e.g. a row whose UserName is "PLEASE" and whose
# Sentiment is null). escape='"' makes a doubled quote ("") inside a quoted
# field parse as a literal quote.
df = spark.read.csv(
    "Corona_NLP_train.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    quote='"',
    escape='"',
)

In [None]:
# Preview the first five rows of the raw load.
df.show(n=5)

+--------+------------+--------------------+----------+--------------------+---------+
|UserName|  ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|
+--------+------------+--------------------+----------+--------------------+---------+
|    3799|       48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|    3800|       48752|                  UK|16-03-2020|advice Talk to yo...| Positive|
|    3801|       48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|
|    3802|       48754|                null|16-03-2020|My food stock is ...|     null|
|  PLEASE| don't panic| THERE WILL BE EN...|      null|                null|     null|
+--------+------------+--------------------+----------+--------------------+---------+
only showing top 5 rows



In [None]:
# Display the column names of the loaded DataFrame.
df.columns

['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

In [None]:
# Print the raw DataFrame's shape as a (row count, column count) tuple.
print(
    (
        df.count(),
        len(df.columns),
    )
)

(68046, 6)


# Preparation of Data

In [None]:
from pyspark.sql.functions import length

In [None]:
# Add a Tweet_length column holding the length of each tweet's text.
df = df.withColumn("Tweet_length", length(df.OriginalTweet))

In [None]:
# Preview the first five rows again, now including Tweet_length.
df.show(n=5)

+--------+------------+--------------------+----------+--------------------+---------+------------+
|UserName|  ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------+------------+--------------------+----------+--------------------+---------+------------+
|    3799|       48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|    3800|       48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|
|    3801|       48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|         131|
|    3802|       48754|                null|16-03-2020|My food stock is ...|     null|          51|
|  PLEASE| don't panic| THERE WILL BE EN...|      null|                null|     null|        null|
+--------+------------+--------------------+----------+--------------------+---------+------------+
only showing top 5 rows



In [None]:
# The five sentiment labels that count as valid values of the Sentiment column.
sentiments = [
    "Positive",
    "Negative",
    "Neutral",
    "Extremely Positive",
    "Extremely Negative",
]

In [None]:
# Keep only the rows whose Sentiment is one of the expected labels.
data = df.filter(df["Sentiment"].isin(sentiments))

In [None]:
# Show the distinct Sentiment values that remain after filtering.
(
    data
    .select("Sentiment")
    .distinct()
    .show()
)

+------------------+
|         Sentiment|
+------------------+
|Extremely Negative|
|           Neutral|
|          Positive|
|          Negative|
|Extremely Positive|
+------------------+



In [None]:
# Number of distinct Sentiment values in the filtered data.
(
    data
    .select("Sentiment")
    .distinct()
    .count()
)

5

# Grouping by Sentiment

In [None]:
# Row count per Sentiment label.
(
    data
    .groupby("Sentiment")
    .count()
    .show()
)

+------------------+-----+
|         Sentiment|count|
+------------------+-----+
|Extremely Negative| 3751|
|           Neutral| 5224|
|          Positive| 7718|
|          Negative| 6857|
|Extremely Positive| 4412|
+------------------+-----+



In [None]:
# Preview the first five rows of the filtered data.
data.show(n=5)

+--------+----------+--------------------+----------+--------------------+---------+------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------------------+----------+--------------------+---------+------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|
|    3801|     48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|         131|
|    3804|     48756|ÜT: 36.319708,-82...|16-03-2020|As news of the re...| Positive|         249|
|    3805|     48757|35.926541,-78.753267|16-03-2020|"Cashier at groce...| Positive|         184|
+--------+----------+--------------------+----------+--------------------+---------+------------+
only showing top 5 rows



In [None]:
# Print the filtered DataFrame's shape as a (row count, column count) tuple.
print(
    (
        data.count(),
        len(data.columns),
    )
)

(27962, 7)


In [None]:
from pyspark.sql.functions import isnan,when,count,col
# For every column, count the rows whose value is NaN or null and show the
# counts as a single-row table (one output column per input column).
data.select(
    [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in data.columns
    ]
).show()

+--------+----------+--------+-------+-------------+---------+------------+
|UserName|ScreenName|Location|TweetAt|OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------+-------+-------------+---------+------------+
|       0|         0|    6152|      0|            0|        0|           0|
+--------+----------+--------+-------+-------------+---------+------------+



In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer,RegexTokenizer

In [None]:
from pyspark.sql.functions import col, udf
# The class is named IntegerType; "IntegerTy" does not exist in
# pyspark.sql.types, so the original import raised ImportError and none of
# the stages below were defined on a fresh kernel.
from pyspark.sql.types import IntegerType

# Text-feature stages; each one reads the column produced by the previous one:
# OriginalTweet -> token_text -> stop_tokens -> c_vec -> tf_idf.
tokenizer = Tokenizer(inputCol="OriginalTweet", outputCol="token_text")
stopremove = StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
countvec = CountVectorizer(inputCol="stop_tokens", outputCol="c_vec")
idf = IDF(inputCol="c_vec", outputCol="tf_idf")

# Converting labels to numeric: Sentiment strings become indices in "label".
labeltonum = StringIndexer(inputCol="Sentiment", outputCol="label")

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [None]:
# Combine the TF-IDF vector and the tweet length into one "features" vector.
cleaned = VectorAssembler(
    inputCols=["tf_idf", "Tweet_length"],
    outputCol="features",
)

In [None]:
from pyspark.ml.classification import NaiveBayes
# Naive Bayes classifier, left at its default parameters.
NB = NaiveBayes()

In [None]:
from pyspark.ml import Pipeline
# Data-preparation pipeline: index the labels, tokenize, drop stop words,
# count terms, apply IDF weighting, then assemble the final feature vector.
# `stages` is documented as a list of stages, so a list is passed rather
# than a tuple.
data_prep_pipelines = Pipeline(
    stages=[
        labeltonum,
        tokenizer,
        stopremove,
        countvec,
        idf,
        cleaned,
    ]
)

In [None]:
# Fit every stage of the preparation pipeline on the filtered data.
clnr = data_prep_pipelines.fit(data)

In [None]:
# Apply the fitted pipeline, adding the label, token and feature columns.
clean_data = clnr.transform(data)

In [None]:
# Preview the transformed data (top 20 rows, the default for show()).
clean_data.show(n=20)

+--------+----------+--------------------+----------+--------------------+------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|         Sentiment|Tweet_length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+--------+----------+--------------------+----------+--------------------+------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|           Neutral|         111|  2.0|[@menyrbie, @phil...|[@menyrbie, @phil...|(78305,[13231,408...|(78305,[13231,408...|(78306,[13231,408...|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...|          Positive|         237|  0.0|[advice, talk, t

In [None]:
# Keep only the two columns the classifier needs.
clean_data = clean_data.select("label", "features")

In [None]:
# Preview the label/features pairs (top 20 rows, the default for show()).
clean_data.show(n=20)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  2.0|(78306,[13231,408...|
|  0.0|(78306,[13,14,133...|
|  0.0|(78306,[8,14,37,7...|
|  0.0|(78306,[7,8,31,47...|
|  0.0|(78306,[3,6,18,60...|
|  0.0|(78306,[1,6,8,13,...|
|  1.0|(78306,[11,13,14,...|
|  2.0|(78306,[48,70,149...|
|  3.0|(78306,[13,14,23,...|
|  0.0|(78306,[8,10,23,5...|
|  0.0|(78306,[4,8,24,38...|
|  4.0|(78306,[1,4,9,11,...|
|  1.0|(78306,[4,21,44,7...|
|  3.0|(78306,[10,37,54,...|
|  1.0|(78306,[4,8,24,33...|
|  4.0|(78306,[1,7,11,36...|
|  1.0|(78306,[1,4,7,34,...|
|  2.0|(78306,[5,47,48,6...|
|  0.0|(78306,[8,12,23,2...|
|  1.0|(78306,[6,28,33,9...|
+-----+--------------------+
only showing top 20 rows



# Machine Learning Model

In [None]:
# 70/30 train/test split. The fixed seed makes the split, and therefore the
# reported metric, repeatable across runs instead of changing on every
# execution.
(training, testing) = clean_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
 # Train the Naive Bayes model on the training split.
 sentiment_predictor = NB.fit(training)

In [None]:
# Score the held-out split with the trained model.
test_results = sentiment_predictor.transform(testing)

In [None]:
# Preview the predictions (top 20 rows, the default for show()).
test_results.show(n=20)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(78306,[0,1,2,7,1...|[-1756.6982074189...|[1.75104162056079...|       4.0|
|  0.0|(78306,[0,1,2,7,1...|[-1845.2087339364...|[1.25846020903792...|       1.0|
|  0.0|(78306,[0,1,2,12,...|[-1149.2362737275...|[0.00826678745894...|       3.0|
|  0.0|(78306,[0,1,2,12,...|[-1162.3324078311...|[4.06967562562278...|       4.0|
|  0.0|(78306,[0,1,2,19,...|[-2015.8274302821...|[1.0,8.1359598226...|       0.0|
|  0.0|(78306,[0,1,2,25,...|[-1871.5124164119...|[4.59978559111855...|       1.0|
|  0.0|(78306,[0,1,2,26,...|[-1695.0539077263...|[7.00925147583317...|       3.0|
|  0.0|(78306,[0,1,2,29,...|[-1843.7324284291...|[1.58059757058517...|       4.0|
|  0.0|(78306,[0,1,2,38,...|[-1819.7053578243...|[6.46535247002894...|       4.0|
|  0.0|(78306,[0

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# MulticlassClassificationEvaluator defaults to metricName="f1", so the
# original code reported an F1 score under the name "accuracy". Request
# accuracy explicitly so the value matches the variable names and the
# message printed from it.
accuracy_evaluation = MulticlassClassificationEvaluator(metricName="accuracy")
acc = accuracy_evaluation.evaluate(test_results)

In [None]:
# Report the evaluation score computed above.
print("The Accuracy of the model is :>", acc)

The Accuracy of the model is :> 0.40263768590598215
