In [1]:
#Read the json FIle contain tweets and there labes 
#For train and test
from pyspark.sql import SparkSession
jobDir = "tweets111.json"
tweets = spark.read.json([jobDir])
tweets.count() #number of tweets in the file

9988

In [2]:
#select the object in json file
tweets = tweets.select("text", \
                     "Category" )

tweets.printSchema()

root
 |-- text: string (nullable = true)
 |-- Category: string (nullable = true)



In [3]:
from pyspark.ml.feature import RegexTokenizer,CountVectorizer
from pyspark.ml.classification import NaiveBayes
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
#convert a collection of text documents to vectors of token counts. 
countVectors = CountVectorizer(inputCol="words", outputCol="features")

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer
#StringIndexer encodes a string column of labels to a column of label indices.
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
#pipeline
pipeline = Pipeline(stages=[regexTokenizer, label_stringIdx, countVectors])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
dataset.show(5,False)

+-------------------------------------------------+--------+-----------------------------------------------------------+-----+----------------------------------------------------------------------------------------------+
|text                                             |Category|words                                                      |label|features                                                                                      |
+-------------------------------------------------+--------+-----------------------------------------------------------+-----+----------------------------------------------------------------------------------------------+
|والل عجب عشان كتاب انجليز صعب كلم                |POS     |[والل, عجب, عشان, كتاب, انجليز, صعب, كلم]                  |1.0  |(19722,[47,92,176,419,588,1350,6837],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                           |
|انه رنامج رايع يترجم كلم قطع باقص سرع            |POS     |[انه, رنامج, رايع, يترجم, كلم, قطع, باقص, سرع]      

In [5]:
#Partition Training & Test sets
#80% train ,20% test
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 7957
Test Dataset Count: 2031


In [6]:
# NaiveBayes Count Vector Features
nb = NaiveBayes(smoothing=1 , modelType="multinomial")
model = nb.fit(trainingData)
predictions = model.transform(testData)# model will make predictions and score on the test set
predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(5,False)

+--------------------------------------------------------------------------------------------------------+--------+-------------------------------------------------------------+-----+----------+
|text                                                                                                    |Category|probability                                                  |label|prediction|
+--------------------------------------------------------------------------------------------------------+--------+-------------------------------------------------------------+-----+----------+
|احلي ينبسط 😔 💘                                                                                        |NEG     |[0.48608193125261606,0.5132334485666692,6.846201807147622E-4]|0.0  |1.0       |
|يحتاج تفهم دعم مساند اخذ ايد توفير مختص موهل تعامل معهم مدارس روضا ون …                                 |NEG     |[0.4762008480903491,0.5193832645538984,0.004415887355752598] |0.0  |1.0       |
|امور هامه لتنم زوج تفاعل ا

In [7]:
# Show the accuracy 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.6978025729777162


In [8]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

pl.filter(pl.label == pl.prediction).count() / pl.count()
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              952|
|       1.0|              685|
|       2.0|              394|
+----------+-----------------+

Model accuracy: 70.015%


In [9]:
#NaiveBayes using TF-IDF Features
from pyspark.ml.feature import HashingTF, IDF
#hashingTF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
#idf
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2) #minDocFreq: remove sparse terms
#pipeline
pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
#Partition Training & Test sets
#80% train ,20% test
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(5,False)

+------------------------------------------------------------------------------------+--------+---------------------------------------------------------------+-----+----------+
|text                                                                                |Category|probability                                                    |label|prediction|
+------------------------------------------------------------------------------------+--------+---------------------------------------------------------------+-----+----------+
|📮 يعرف يتمني خير   🥀                                                              |NEG     |[0.49799508256428204,0.502004917433727,1.9910373682703385E-12] |0.0  |1.0       |
|اسوان رييس تظن كذب يضركبالعكسالكذب اذا تكرر كثاف يودي الي تشويش اذهان ناس مقال كامل |NEUTRAL |[0.45819918873525817,0.5418008111672028,9.753889349239166E-11] |2.0  |1.0       |
|لهم حسب تضيق حياه  وان منتصر يغلب وجع  لهم عون نجات افقد حيله 💭                    |POS     |[0.43158695499527694,0.

In [10]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print ("Model Accuracy: ", accuracy)

Model Accuracy:  0.658453687693364


In [11]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

pl.filter(pl.label == pl.prediction).count() / pl.count()
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              879|
|       1.0|              637|
|       2.0|              515|
+----------+-----------------+

Model accuracy: 65.633%


In [12]:
#Cross-Validation
pipeline = Pipeline(stages=[regexTokenizer,countVectors, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)

nb = NaiveBayes(smoothing=1)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(nb.smoothing,[0.6, 0.8, 1.0])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=nb, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)


In [13]:
predictions.filter(predictions['prediction'] == 0) \
    .select("text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(5,False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+------------------------------------------------------------------+-----+----------+
|text                                                                                                                                                                                                             |Category|probability                                                       |label|prediction|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+------------------------------------------------------------------+-----+----------+
|نتعلم نصيحه نتعلم اخطاء ننضج مرور عمر مرور اوغاد حيا ماهو كلام يباع دورا تنميه بشرية

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print ("Model Accuracy: ", accuracy)

Model Accuracy:  0.6940235476582404


In [15]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

pl.filter(pl.label == pl.prediction).count() / pl.count()
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              871|
|       1.0|              675|
|       2.0|              485|
+----------+-----------------+

Model accuracy: 69.325%
