In [1]:
# Data ingestion and extraction
from pyspark.sql import SparkSession

# Obtain the session explicitly instead of relying on a `spark` global injected
# by the kernel; getOrCreate() returns the active session when one already exists,
# so this is safe both in a plain Python kernel and in a PySpark shell.
spark = SparkSession.builder.getOrCreate()

# Path of the JSON input (a single file, despite the variable's name).
jobDir = "tweets1.json"
tweets = spark.read.json([jobDir])

# Number of records loaded.
tweets.count()

7992

In [2]:
# Narrow the frame to the two columns used downstream:
# the tweet text and its Category value.
tweets = tweets.select("text", "Category")

# Display the resulting schema.
tweets.printSchema()

root
 |-- text: string (nullable = true)
 |-- Category: string (nullable = true)



In [3]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import NaiveBayes

# Tokenizer stage: reads the "text" column and writes the token list to "words".
# No pattern is passed, so RegexTokenizer's default settings apply.
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
)

# Count-vector stage: reads "words" and writes term-count vectors to "features".
countVectors = CountVectorizer(
    inputCol="words",
    outputCol="features",
)




In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Encode the string "Category" column as a numeric "label" column.
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")

# Feature pipeline: tokenize -> count-vectorize -> index the labels.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        countVectors,
        label_stringIdx,
    ]
)

# The pipeline is fit on the complete `tweets` frame (the train/test split
# happens in a later cell), then applied to that same frame.
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)

# Preview 10 rows, truncating each cell to 30 characters.
dataset.show(n=10, truncate=30)

+------------------------------+--------+------------------------------+------------------------------+-----+
|                          text|Category|                         words|                      features|label|
+------------------------------+--------+------------------------------+------------------------------+-----+
|والل عجب عشان كتاب انجليز ص...|     POS|[والل, عجب, عشان, كتاب, انج...|(17072,[68,109,147,351,644,...|  2.0|
|انه مفيد جدان انا اتعلم كثي...|     POS|[انه, مفيد, جدان, انا, اتعل...|(17072,[19,22,126,3800,9802...|  2.0|
|انه رنامج رايع يترجم كلم قط...|     POS|[انه, رنامج, رايع, يترجم, ك...|(17072,[19,147,242,251,1087...|  2.0|
|رنامج جميل جدا يترجم كلم جم...|     POS|[رنامج, جميل, جدا, يترجم, ك...|(17072,[82,127,147,166,242,...|  2.0|
|قمه روعه الف شكر تقدير مجهو...|     POS|[قمه, روعه, الف, شكر, تقدير...|(17072,[82,97,103,398,635,1...|  2.0|
|      جميل اشخاص ايجد انجلزيه |     POS|  [جميل, اشخاص, ايجد, انجلزيه]|(17072,[166,881,14206,16489...|  2.0|
|         

In [5]:
# Partition the data into training (70%) and test (30%) sets.
# The seed is fixed so the split is reproducible.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)

# Report the size of each partition.
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))


Training Dataset Count: 5654
Test Dataset Count: 2335


In [6]:
# Naive Bayes trained on the count-vector features.
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)

# Show up to 10 test rows that were predicted as class 1.0, cells cut at 100 chars.
# NOTE(review): "probability" is a vector column, so this sort is over the whole
# vector rather than P(class 1); the displayed rows appear to be ordered by the
# first component — confirm this is the ranking that was intended.
(
    predictions
    .filter(predictions["prediction"] == 1)
    .select("text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=100)
)

+----------------------------------------------------------------------------------------------------+--------+--------------------------------------------------------------+-----+----------+
|                                                                                                text|Category|                                                   probability|label|prediction|
+----------------------------------------------------------------------------------------------------+--------+--------------------------------------------------------------+-----+----------+
|مضحك صفح اسلاميه جري ورا كلام استاذه مافيش اصل كتاب تاليف اسماعيل الام اسم عرب يغز اندلسشكلكو وحش...|     NEG|  [0.4952558690488377,0.5003486842382693,0.004395446712893042]|  0.0|       1.0|
|                     قصد مو ضروره خالف طيف معنا افضليه لك مخالف الهم تكون حال دخول طيف اخر استدع ا… | NEUTRAL| [0.49439186988298844,0.5000805562303184,0.005527573886693169]|  1.0|       1.0|
| ملاحق باسم يوسف زملاء اعلام تهم تعرف ا

In [7]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# labelCol and metricName are written out at their default values so it is
# visible that the number reported below is an F1 score, not an accuracy.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.6642816494568803

In [8]:
# Naive Bayes using TF-IDF features
from pyspark.ml.feature import HashingTF, IDF

# Hash the tokens in "words" into term-frequency vectors of 10000 buckets.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=10000,
)

# Rescale the raw term frequencies by inverse document frequency.
# minDocFreq=5 is the minimum document frequency the IDF stage applies to terms.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
    minDocFreq=5,
)

# Same flow as the count-vector run: build features on all tweets, then split.
# Note that this rebinds pipeline, pipelineFit, dataset, trainingData, testData,
# nb, model and predictions.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        hashingTF,
        idf,
        label_stringIdx,
    ]
)
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)

nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)

# Show up to 10 test rows predicted as class 1.0, cells cut at 30 chars.
(
    predictions
    .filter(predictions["prediction"] == 1)
    .select("text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

+------------------------------+--------+------------------------------+-----+----------+
|                          text|Category|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|جميل ان يكتشف حسن نوا تجا ظ...| NEUTRAL|[0.4977393044568139,0.50226...|  1.0|       1.0|
|اسكت وله بلا هبل شو جاب دوا...|     NEG|[0.4875593706290305,0.51244...|  0.0|       1.0|
| رطم حني مليان زياد ماش يدلدق |     POS|[0.472567594696991,0.527432...|  2.0|       1.0|
|عاجل مياد نت معلوما موكد عم...| NEUTRAL|[0.4691478048626397,0.53085...|  1.0|       1.0|
|يعن يعقل ان ينتخب شعب مصر ر...| NEUTRAL|[0.4560640377961834,0.54393...|  1.0|       1.0|
|الي ارد معر وجه حق دبر مذبح...| NEUTRAL|[0.44985451214263983,0.5501...|  1.0|       1.0|
|عالم يتج تقليل منتظم مستمر ...|     NEG|[0.4483661588382104,0.55163...|  0.0|       1.0|
|كلم هيب محتاج تتشال قاموس ع...|     NEG|[0.4360785502762834,0.56392...|  0.0|       1.0|
|لء حساب ي

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the current `predictions` frame. labelCol and metricName are written
# out at their default values, which makes explicit that this is an F1 score.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.6266986748408847

In [11]:
# Cross-validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Rebuild the count-vector features and the seeded 70/30 split inside this cell,
# so the cross-validation does not depend on whichever feature set an earlier
# cell last bound to `dataset`, `trainingData` and `testData`.
pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

nb = NaiveBayes(smoothing=1)

# Create the evaluator BEFORE it is handed to CrossValidator. Previously it was
# only defined at the end of this cell, so the cell worked solely because an
# earlier cell had left an `evaluator` variable in the kernel.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

# Candidate smoothing values. `smoothing` is a float param, so the grid uses
# float literals rather than ints.
paramGrid = (ParamGridBuilder()
             .addGrid(nb.smoothing, [1.0, 3.0, 5.0])
             .build())

# 5-fold cross-validation over the grid. The seed pins the fold assignment,
# in line with the seeded randomSplit above.
cv = CrossValidator(estimator=nb,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5,
                    seed=100)
cvModel = cv.fit(trainingData)

# Score the held-out test set with the model selected by cross-validation.
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)

0.6642816494568803

In [12]:
# Keep just the true label and the predicted label for the summaries below.
pl = predictions.select(["label", "prediction"])

In [13]:
# Row count per true label value.
pl.groupBy("label").agg({"label": "count"}).show()

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|        1085|
|  1.0|         600|
|  2.0|         650|
+-----+------------+



In [14]:
# Row count per predicted label value.
pl.groupBy("prediction").agg({"prediction": "count"}).show()

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             1171|
|       1.0|              515|
|       2.0|              649|
+----------+-----------------+



In [16]:
# Accuracy: share of rows whose predicted label equals the true label.
acc = pl.where(pl["label"] == pl["prediction"]).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100))


Model accuracy: 66.767%
