In [2]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('CoronavirusNLP').getOrCreate()

In [3]:
Coviddata=spark.read.csv('Corona_NLP_train.csv',header=True,inferSchema=True)

In [4]:
Coviddata.show(5)

+--------+------------+--------------------+----------+--------------------+---------+
|UserName|  ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|
+--------+------------+--------------------+----------+--------------------+---------+
|    3799|       48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|    3800|       48752|                  UK|16-03-2020|advice Talk to yo...| Positive|
|    3801|       48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|
|    3802|       48754|                null|16-03-2020|My food stock is ...|     null|
|  PLEASE| don't panic| THERE WILL BE EN...|      null|                null|     null|
+--------+------------+--------------------+----------+--------------------+---------+
only showing top 5 rows



In [4]:
Coviddata.columns

['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

In [5]:
print((Coviddata.count(),len(Coviddata.columns)))

(68046, 6)


# Data Preparation

In [6]:
from pyspark.sql.functions import length

In [7]:
Coviddata=Coviddata.withColumn('Tweet_length', length(Coviddata['OriginalTweet']))

In [8]:
Coviddata.show(5)

+--------+------------+--------------------+----------+--------------------+---------+------------+
|UserName|  ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------+------------+--------------------+----------+--------------------+---------+------------+
|    3799|       48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|    3800|       48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|
|    3801|       48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|         131|
|    3802|       48754|                null|16-03-2020|My food stock is ...|     null|          51|
|  PLEASE| don't panic| THERE WILL BE EN...|      null|                null|     null|        null|
+--------+------------+--------------------+----------+--------------------+---------+------------+
only showing top 5 rows



In [9]:
sentiments=['Positive','Negative','Neutral','Extremely Positive','Extremely Negative']

In [10]:
data=Coviddata.filter(Coviddata.Sentiment.isin(sentiments))

In [11]:
data.select('Sentiment').distinct().show()

+------------------+
|         Sentiment|
+------------------+
|Extremely Negative|
|           Neutral|
|          Positive|
|          Negative|
|Extremely Positive|
+------------------+



In [12]:
data.select('Sentiment').distinct().count()

5

In [13]:
data.groupby('Sentiment').count().show()

+------------------+-----+
|         Sentiment|count|
+------------------+-----+
|Extremely Negative| 3751|
|           Neutral| 5224|
|          Positive| 7718|
|          Negative| 6857|
|Extremely Positive| 4412|
+------------------+-----+



In [14]:
data.show(8)

+--------+----------+--------------------+----------+--------------------+---------+------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------------------+----------+--------------------+---------+------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|  Neutral|         111|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...| Positive|         237|
|    3801|     48753|           Vagabonds|16-03-2020|Coronavirus Austr...| Positive|         131|
|    3804|     48756|ÜT: 36.319708,-82...|16-03-2020|As news of the re...| Positive|         249|
|    3805|     48757|35.926541,-78.753267|16-03-2020|"Cashier at groce...| Positive|         184|
|    3807|     48759|     Atlanta, GA USA|16-03-2020|Due to COVID-19 o...| Positive|         280|
|    3808|     48760|    BHAVNAGAR,GUJRAT|16-03-2020|For corona preven...| Negative|         267|
|    3809|     48761

In [15]:
print((data.count(),len(data.columns)))

(27962, 7)


In [16]:
from pyspark.sql.functions import isnan,when,count,col
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]
   ).show()

+--------+----------+--------+-------+-------------+---------+------------+
|UserName|ScreenName|Location|TweetAt|OriginalTweet|Sentiment|Tweet_length|
+--------+----------+--------+-------+-------------+---------+------------+
|       0|         0|    6152|      0|            0|        0|           0|
+--------+----------+--------+-------+-------------+---------+------------+



# Text Processing using Tokenization, Stemming and TF-IDF

In [17]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer,RegexTokenizer

In [18]:
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

tokenizer=Tokenizer(inputCol="OriginalTweet", outputCol="token_text")
stopremove=StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
countvec=CountVectorizer(inputCol="stop_tokens", outputCol="c_vec")
idf=IDF(inputCol="c_vec", outputCol="tf_idf")

#Convert labels to numeric
Labels_to_num=StringIndexer(inputCol="Sentiment", outputCol="label")

In [20]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [21]:
cleanedtext=VectorAssembler(inputCols=["tf_idf","Tweet_length"], outputCol="features")

# Model Building

In [22]:
from pyspark.ml.classification import NaiveBayes
NaiveB= NaiveBayes()

In [23]:
from pyspark.ml import Pipeline
datapipeline= Pipeline(stages=(Labels_to_num, tokenizer, stopremove, countvec, idf, cleanedtext))

In [24]:
f_data=datapipeline.fit(data)

In [25]:
cleaned_data=f_data.transform(data)

In [26]:
cleaned_data.show()

+--------+----------+--------------------+----------+--------------------+------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|         Sentiment|Tweet_length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+--------+----------+--------------------+----------+--------------------+------------------+------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    3799|     48751|              London|16-03-2020|@MeNyrbie @Phil_G...|           Neutral|         111|  2.0|[@menyrbie, @phil...|[@menyrbie, @phil...|(78305,[14496,265...|(78305,[14496,265...|(78306,[14496,265...|
|    3800|     48752|                  UK|16-03-2020|advice Talk to yo...|          Positive|         237|  0.0|[advice, talk, t

In [27]:
cleaned_data=cleaned_data.select(['label', 'features'])

In [28]:
cleaned_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  2.0|(78306,[14496,265...|
|  0.0|(78306,[13,14,133...|
|  0.0|(78306,[8,14,37,7...|
|  0.0|(78306,[7,8,31,47...|
|  0.0|(78306,[3,6,18,60...|
|  0.0|(78306,[1,6,8,13,...|
|  1.0|(78306,[11,13,14,...|
|  2.0|(78306,[48,70,147...|
|  3.0|(78306,[13,14,23,...|
|  0.0|(78306,[8,10,23,5...|
|  0.0|(78306,[4,8,24,38...|
|  4.0|(78306,[1,4,9,11,...|
|  1.0|(78306,[4,21,44,7...|
|  3.0|(78306,[10,37,54,...|
|  1.0|(78306,[4,8,24,33...|
|  4.0|(78306,[1,7,11,36...|
|  1.0|(78306,[1,4,7,34,...|
|  2.0|(78306,[5,47,48,6...|
|  0.0|(78306,[8,12,23,2...|
|  1.0|(78306,[6,28,33,9...|
+-----+--------------------+
only showing top 20 rows



# Model Fitting

In [29]:
(training, testing)= cleaned_data.randomSplit([0.7,0.3])

In [30]:
predictor=NaiveB.fit(training)

In [31]:
p_test=predictor.transform(testing)

In [32]:
p_test.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(78306,[0,1,2,12,...|[-1143.5999899435...|[0.96914164087953...|       0.0|
|  0.0|(78306,[0,1,2,17,...|[-2252.3386087643...|[2.52482603752474...|       4.0|
|  0.0|(78306,[0,1,2,23,...|[-1342.7138712802...|[0.99999821901192...|       0.0|
|  0.0|(78306,[0,1,2,29,...|[-1846.2153710491...|[1.38582176477374...|       1.0|
|  0.0|(78306,[0,1,2,38,...|[-1837.1749904956...|[1.02531258613975...|       1.0|
|  0.0|(78306,[0,1,2,40,...|[-1037.3850439223...|[2.83672560973375...|       3.0|
|  0.0|(78306,[0,1,3,4,6...|[-1425.6934259789...|[2.77145331652950...|       1.0|
|  0.0|(78306,[0,1,3,4,1...|[-1378.6232334484...|[0.99997676755749...|       0.0|
|  0.0|(78306,[0,1,3,4,1...|[-1215.1431995339...|[3.94330683715480...|       4.0|
|  0.0|(78306,[0

In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [34]:
accuracy_evaluation= MulticlassClassificationEvaluator()
acc=accuracy_evaluation.evaluate(p_test)

In [35]:
print ("The Accuracy of the model is :>", acc)

The Accuracy of the model is :> 0.40452495881641615
