In [1]:
#Read the json FIle contain tweets and there labes 
#For train and test
from pyspark.sql import SparkSession
jobDir = "tweets1.json"
tweets = spark.read.json([jobDir])
tweets.count() #number of tweets in the file

7988

In [2]:
#select the object in json file
tweets = tweets.select("text", \
                     "Category" )

tweets.printSchema()

root
 |-- text: string (nullable = true)
 |-- Category: string (nullable = true)



In [3]:
#Model Pipeline
from pyspark.ml.feature import RegexTokenizer,CountVectorizer
from pyspark.ml.classification import LinearSVC, OneVsRest
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
#convert a collection of text documents to vectors of token counts. 
countVectors = CountVectorizer(inputCol="words", outputCol="features")

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer
#StringIndexer encodes a string column of labels to a column of label indices.
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
#pipeline
pipeline = Pipeline(stages=[regexTokenizer, label_stringIdx, countVectors])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
dataset.show(10,10)

+----------+--------+----------+-----+----------+
|      text|Category|     words|label|  features|
+----------+--------+----------+-----+----------+
|والل عج...|     POS|[والل, ...|  2.0|(17070,...|
|انه رنا...|     POS|[انه, ر...|  2.0|(17070,...|
|رنامج ج...|     POS|[رنامج,...|  2.0|(17070,...|
|قمه روع...|     POS|[قمه, ر...|  2.0|(17070,...|
|جميل اش...|     POS|[جميل, ...|  2.0|(17070,...|
|عاش ايد...|     POS|[عاش, ا...|  2.0|(17070,...|
|برنامج ...|     POS|[برنامج...|  2.0|(17070,...|
|حلو وال...|     POS|[حلو, و...|  2.0|(17070,...|
|برنامج ...|     POS|[برنامج...|  2.0|(17070,...|
|رايع جد...|     POS|[رايع, ...|  2.0|(17070,...|
+----------+--------+----------+-----+----------+
only showing top 10 rows



In [5]:
#Partition Training & Test sets
#80% train ,20% test
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 6394
Test Dataset Count: 1594


In [6]:
# LinearSVC , OneVsRest Count Vector Features
lr = LinearSVC(maxIter=10, tol=1E-3, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(trainingData)

# score the model on test data.
predictions = ovrModel.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","label","prediction") \
    .show(n = 10, truncate = 30)

+------------------------------+--------+-----+----------+
|                          text|Category|label|prediction|
+------------------------------+--------+-----+----------+
|«ونظر موسي الي شمال جبال بن...| NEUTRAL|  1.0|       1.0|
|«ونظر موسي الي شمال جبال بن...| NEUTRAL|  1.0|       1.0|
|اب هرير رض الله عنه رسول ال...| NEUTRAL|  1.0|       1.0|
|ابار سعاد مستشار بامار منطق...|     POS|  2.0|       1.0|
|ابار سعاد مستشار بامار منطق...|     POS|  2.0|       1.0|
|ابراهيم عيسي سيح تهم ازدراء...| NEUTRAL|  1.0|       1.0|
|      اتالان مهتم جلب ريستانت | NEUTRAL|  1.0|       1.0|
|اتحاد يتج لني زياد مسابق دو...| NEUTRAL|  1.0|       1.0|
|                   اتفضل منظر | NEUTRAL|  1.0|       1.0|
|اتمني ارسال لجن مستشفي حقل ...|     NEG|  0.0|       1.0|
+------------------------------+--------+-----+----------+
only showing top 10 rows



In [7]:
# Show the accuracy 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.6749778571927301


In [8]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              807|
|       1.0|              358|
|       2.0|              429|
+----------+-----------------+

Model accuracy: 67.754%


In [9]:
#LinearSVC , OneVsRest using TF-IDF Features

from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)

lr = LinearSVC(maxIter=10, tol=1E-3, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(trainingData)

# score the model on test data.
predictions = ovrModel.transform(testData)

predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","label","prediction") \
    .show(n = 10, truncate = 30) 

+------------------------------+--------+-----+----------+
|                          text|Category|label|prediction|
+------------------------------+--------+-----+----------+
|2 باب ازهر شيخ كنيسه دعامت ...|     NEG|  0.0|       1.0|
|«ونظر موسي الي شمال جبال بن...| NEUTRAL|  1.0|       1.0|
|«ونظر موسي الي شمال جبال بن...| NEUTRAL|  1.0|       1.0|
|اب هرير رض الله عنه رسول ال...| NEUTRAL|  1.0|       1.0|
|ابار سعاد مستشار بامار منطق...|     POS|  2.0|       1.0|
|ابار سعاد مستشار بامار منطق...|     POS|  2.0|       1.0|
|ابراهيم عيسي سيح تهم ازدراء...| NEUTRAL|  1.0|       1.0|
|ابو فتوح في دويقه كان تحكم ...|     NEG|  0.0|       1.0|
|      اتالان مهتم جلب ريستانت | NEUTRAL|  1.0|       1.0|
|اتحاد يتج لني زياد مسابق دو...| NEUTRAL|  1.0|       1.0|
+------------------------------+--------+-----+----------+
only showing top 10 rows



In [10]:
# Show the accuracy 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.6479211940236164


In [11]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              781|
|       1.0|              400|
|       2.0|              413|
+----------+-----------------+

Model accuracy: 64.931%


In [None]:
#Cross-Validation
pipeline = Pipeline(stages=[regexTokenizer,countVectors, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)

lr = LinearSVC(maxIter=10, tol=1E-6, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5])
 #            .addGrid(lr.regParam, [10.0,1.0,0.1,0.01]) 
              .build())
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=ovr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
             
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)

In [101]:
# Show the accuracy 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.6717565926374811


In [102]:
pl = predictions.select("label", "prediction")

print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              828|
|       1.0|              348|
|       2.0|              418|
+----------+-----------------+

Model accuracy: 67.503%
