In [1]:
#Read the json FIle contain tweets and there labes 
#For train and test
from pyspark.sql import SparkSession
jobDir = "tweets111.json"
tweets = spark.read.json([jobDir])
tweets.count() #number of tweets in the file

9988

In [2]:
#select the object in json file
tweets = tweets.select("text", \
                     "Category" )

tweets.printSchema()

root
 |-- text: string (nullable = true)
 |-- Category: string (nullable = true)



In [5]:
#Model Pipeline
from pyspark.ml.feature import RegexTokenizer,CountVectorizer
from pyspark.ml.classification import LinearSVC, OneVsRest
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
#convert a collection of text documents to vectors of token counts. 
countVectors = CountVectorizer(inputCol="words", outputCol="features")

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer
#StringIndexer encodes a string column of labels to a column of label indices.
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
#pipeline
pipeline = Pipeline(stages=[regexTokenizer, label_stringIdx, countVectors])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
dataset.show(5,False)

+-------------------------------------------------+--------+-----------------------------------------------------------+-----+----------------------------------------------------------------------------------------------+
|text                                             |Category|words                                                      |label|features                                                                                      |
+-------------------------------------------------+--------+-----------------------------------------------------------+-----+----------------------------------------------------------------------------------------------+
|والل عجب عشان كتاب انجليز صعب كلم                |POS     |[والل, عجب, عشان, كتاب, انجليز, صعب, كلم]                  |1.0  |(19722,[47,92,176,419,588,1350,6837],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                           |
|انه رنامج رايع يترجم كلم قطع باقص سرع            |POS     |[انه, رنامج, رايع, يترجم, كلم, قطع, باقص, سرع]      

In [7]:
#Partition Training & Test sets
#80% train ,20% test
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 7957
Test Dataset Count: 2031


In [8]:
# LinearSVC , OneVsRest Count Vector Features
lr = LinearSVC(maxIter=10, tol=1E-3, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(trainingData)

# score the model on test data.
predictions = ovrModel.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","label","prediction") \
    .show(5,False)

+---------------------------------------------------------------------------------------------------------+--------+-----+----------+
|text                                                                                                     |Category|label|prediction|
+---------------------------------------------------------------------------------------------------------+--------+-----+----------+
| ابدء صباح مفتاح  نيه طيبه 💕 فه مفتاح باب رزق  كلم طيبه 💕 فه مفتاح باب قلوب  يوم جميل باذ الله رب نس … |POS     |1.0  |1.0       |
| ان تجد روحء تشب بكل شي                                                                                  |POS     |1.0  |1.0       |
| مسا قهو ☕ 🎉 💫 💕 عشاق قهوه 😍 💛                                                                      |POS     |1.0  |1.0       |
| وقت تحميل تعال 💕                                                                                       |POS     |1.0  |1.0       |
|( انت  وبس ) ذكري رحيل     الله يرحم رحم واسعه يجعل مثو اعال الجن    

In [9]:
# Show the accuracy 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.7131535469691566


In [10]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              974|
|       1.0|              675|
|       2.0|              382|
+----------+-----------------+

Model accuracy: 71.590%


In [19]:
#LinearSVC , OneVsRest using TF-IDF Features

from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)

lr = LinearSVC(maxIter=10, tol=1E-3, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(trainingData)

# score the model on test data.
predictions = ovrModel.transform(testData)

predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","label","prediction") \
    .show(5,False)

+---------------------------------------------------------------------------------------------------------+--------+-----+----------+
|text                                                                                                     |Category|label|prediction|
+---------------------------------------------------------------------------------------------------------+--------+-----+----------+
|   دايم  اذا كذب خلك واثق تتراجع لان يكذب انفس يصدق )                                                    |POS     |1.0  |1.0       |
| ابدء صباح مفتاح  نيه طيبه 💕 فه مفتاح باب رزق  كلم طيبه 💕 فه مفتاح باب قلوب  يوم جميل باذ الله رب نس … |POS     |1.0  |1.0       |
| ان تجد روحء تشب بكل شي                                                                                  |POS     |1.0  |1.0       |
| مسا قهو ☕ 🎉 💫 💕 عشاق قهوه 😍 💛                                                                      |POS     |1.0  |1.0       |
| وقت تحميل تعال 💕                                                   

In [12]:
# Show the accuracy 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.6887541535337911


In [13]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              961|
|       1.0|              668|
|       2.0|              402|
+----------+-----------------+

Model accuracy: 69.129%


In [14]:
#Cross-Validation
pipeline = Pipeline(stages=[regexTokenizer,countVectors, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)

lr = LinearSVC(maxIter=10, tol=1E-6, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5])
 #            .addGrid(lr.regParam, [10.0,1.0,0.1,0.01]) 
              .build())
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=ovr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
             
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)

In [18]:
predictions.filter(predictions['prediction'] == 0) \
    .select("text","Category","label","prediction") \
    .show(20,False)

+-----------------------------------------------------------------------------------+--------+-----+----------+
|text                                                                               |Category|label|prediction|
+-----------------------------------------------------------------------------------+--------+-----+----------+
|                                                                                   |NEG     |0.0  |0.0       |
|                                                                                   |NEG     |0.0  |0.0       |
|                                                                                   |NEG     |0.0  |0.0       |
|                                                                                   |NEG     |0.0  |0.0       |
|                                                                                   |NEG     |0.0  |0.0       |
|                                                                                   |NEUTRAL |2.0  |0.0 

In [16]:
# Show the accuracy 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.7111088564461715


In [17]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             1004|
|       1.0|              661|
|       2.0|              366|
+----------+-----------------+

Model accuracy: 71.492%
