In [1]:
#Read the json FIle contain tweets and there labes 
#For train and test
from pyspark.sql import SparkSession
jobDir = "tweets1.json"
tweets = spark.read.json([jobDir])
tweets.count() #number of tweets in the file

7988

In [2]:
#select the object in json file
tweets = tweets.select("text", \
                     "Category" )

tweets.printSchema()

root
 |-- text: string (nullable = true)
 |-- Category: string (nullable = true)



In [3]:
from pyspark.ml.feature import RegexTokenizer,CountVectorizer
from pyspark.ml.classification import NaiveBayes
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
#convert a collection of text documents to vectors of token counts. 
countVectors = CountVectorizer(inputCol="words", outputCol="features")

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer
#StringIndexer encodes a string column of labels to a column of label indices.
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
#pipeline
pipeline = Pipeline(stages=[regexTokenizer, label_stringIdx, countVectors])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
dataset.show(10,10)

+----------+--------+----------+-----+----------+
|      text|Category|     words|label|  features|
+----------+--------+----------+-----+----------+
|والل عج...|     POS|[والل, ...|  2.0|(17070,...|
|انه رنا...|     POS|[انه, ر...|  2.0|(17070,...|
|رنامج ج...|     POS|[رنامج,...|  2.0|(17070,...|
|قمه روع...|     POS|[قمه, ر...|  2.0|(17070,...|
|جميل اش...|     POS|[جميل, ...|  2.0|(17070,...|
|عاش ايد...|     POS|[عاش, ا...|  2.0|(17070,...|
|برنامج ...|     POS|[برنامج...|  2.0|(17070,...|
|حلو وال...|     POS|[حلو, و...|  2.0|(17070,...|
|برنامج ...|     POS|[برنامج...|  2.0|(17070,...|
|رايع جد...|     POS|[رايع, ...|  2.0|(17070,...|
+----------+--------+----------+-----+----------+
only showing top 10 rows



In [5]:
#Partition Training & Test sets
#80% train ,20% test
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 6394
Test Dataset Count: 1594


In [6]:
# NaiveBayes Count Vector Features
nb = NaiveBayes(smoothing=1 , modelType="multinomial")
model = nb.fit(trainingData)
predictions = model.transform(testData)# model will make predictions and score on the test set
predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 100)

+----------------------------------------------------------------------------------------------------+--------+-------------------------------------------------------------+-----+----------+
|                                                                                                text|Category|                                                  probability|label|prediction|
+----------------------------------------------------------------------------------------------------+--------+-------------------------------------------------------------+-----+----------+
|          فقط جلس ود موقف رفض حوار رييس جماع حزب ملتزم موقف شارع رافض حوار دون ضمانا موقف جبه انقاذ |     NEG|   [0.49196183024734186,0.507523933392624,5.1423636003414E-4]|  0.0|       1.0|
|                                                                                  طوارء تحسب مظاهرا | NEUTRAL| [0.489363171296104,0.49483514881304347,0.015801679890852523]|  1.0|       1.0|
|                                            

In [7]:
# Show the accuracy 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.66882364915199


In [8]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

pl.filter(pl.label == pl.prediction).count() / pl.count()
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              780|
|       1.0|              369|
|       2.0|              445|
+----------+-----------------+

Model accuracy: 67.127%


In [9]:
#NaiveBayes using TF-IDF Features
from pyspark.ml.feature import HashingTF, IDF
#hashingTF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
#idf
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2) #minDocFreq: remove sparse terms
#pipeline
pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
#Partition Training & Test sets
#80% train ,20% test
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 20)

+--------------------+--------+--------------------+-----+----------+
|                text|Category|         probability|label|prediction|
+--------------------+--------+--------------------+-----+----------+
|انتم تريد يريد ال...| NEUTRAL|[0.47733571915763...|  1.0|       1.0|
|اسوان اذا قار نسب...| NEUTRAL|[0.43957728520581...|  1.0|       1.0|
|ندخل بء حرب اي دو...|     POS|[0.40077223999924...|  2.0|       1.0|
|قوه قوه بار الله ...|     POS|[0.36955896506318...|  2.0|       1.0|
|موشرا لد لء ضرب ع...| NEUTRAL|[0.34831800634019...|  1.0|       1.0|
|غداد تعين ركن زيد...| NEUTRAL|[0.34246581373144...|  1.0|       1.0|
|جماع اخو ارهاب قي...|     NEG|[0.33187391553347...|  0.0|       1.0|
|مستقبل تكن زياره ...|     NEG|[0.30086814253564...|  0.0|       1.0|
|منحو حجر وجد طايف...| NEUTRAL|[0.26702466936300...|  1.0|       1.0|
|كلام ركي منسوب ال...| NEUTRAL|[0.26258795015241...|  1.0|       1.0|
+--------------------+--------+--------------------+-----+----------+
only showing top 10 

In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print ("Model Accuracy: ", accuracy)

Model Accuracy:  0.6424062852209331


In [11]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

pl.filter(pl.label == pl.prediction).count() / pl.count()
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              742|
|       1.0|              419|
|       2.0|              433|
+----------+-----------------+

Model accuracy: 64.241%


In [12]:
#Cross-Validation
pipeline = Pipeline(stages=[regexTokenizer,countVectors, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)

nb = NaiveBayes(smoothing=1)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(nb.smoothing,[0.6, 0.8, 1.0])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=nb, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)


In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print ("Model Accuracy: ", accuracy)

Model Accuracy:  0.6691115761107667


In [14]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

pl.filter(pl.label == pl.prediction).count() / pl.count()
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              735|
|       1.0|              407|
|       2.0|              452|
+----------+-----------------+

Model accuracy: 66.939%
