In [94]:
from pyspark.sql import SparkSession
jobDir = "tweets1.json"
tweets = spark.read.json([jobDir])
tweets.count()

7992

In [95]:
tweets = tweets.select("text", \
                     "Category" )

tweets.printSchema()

root
 |-- text: string (nullable = true)
 |-- Category: string (nullable = true)



In [96]:
#Model Pipeline
from pyspark.ml.feature import RegexTokenizer,CountVectorizer
from pyspark.ml.classification import LinearSVC, OneVsRest
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words")

countVectors = CountVectorizer(inputCol="words", outputCol="features")



In [97]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
dataset.show(10,250)

+-------------------------------------------------+--------+-----------------------------------------------------------+----------------------------------------------------------------------------------------------+-----+
|                                             text|Category|                                                      words|                                                                                      features|label|
+-------------------------------------------------+--------+-----------------------------------------------------------+----------------------------------------------------------------------------------------------+-----+
|               والل عجب عشان كتاب انجليز صعب كلم |     POS|                  [والل, عجب, عشان, كتاب, انجليز, صعب, كلم]|                          (17072,[68,109,147,351,644,1456,6074],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|  2.0|
|               انه مفيد جدان انا اتعلم كثير اشيء |     POS|                  [انه, مفيد, جدان, انا, اتعلم, كثير

In [98]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 5654
Test Dataset Count: 2335


In [110]:
lr = LinearSVC(maxIter=10, tol=1E-6, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(trainingData)

# score the model on test data.
predictions = ovrModel.transform(testData)
predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","label","prediction") \
    .show(n = 10, truncate = 30)

+------------------------------+--------+-----+----------+
|                          text|Category|label|prediction|
+------------------------------+--------+-----+----------+
|2 باب ازهر شيخ كنيسه دعامت ...|     NEG|  0.0|       1.0|
|2 متحدث عسكر يعترف قيام جيش...| NEUTRAL|  1.0|       1.0|
|«ونظر موسي الي شمال جبال بن...| NEUTRAL|  1.0|       1.0|
|«ونظر موسي الي شمال جبال بن...| NEUTRAL|  1.0|       1.0|
|                  اا سلام روح |     NEG|  0.0|       1.0|
|اب هرير رض الله عنه رسول ال...| NEUTRAL|  1.0|       1.0|
|ابار سعاد مستشار بامار منطق...|     POS|  2.0|       1.0|
|ابار سعاد مستشار بامار منطق...|     POS|  2.0|       1.0|
|   ابانوب سمير مصاب طلق خرطوش | NEUTRAL|  1.0|       1.0|
|         ابعد عنهم انتاش قدهم | NEUTRAL|  1.0|       1.0|
+------------------------------+--------+-----+----------+
only showing top 10 rows



In [109]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6444058429485806

In [107]:
'''from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

lr = LinearSVC(maxIter=10, tol=1E-6, fitIntercept=True)
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(trainingData)

# score the model on test data.
predictions = ovrModel.transform(testData)

predictions.filter(predictions['prediction'] == 1) \
    .select("text","Category","label","prediction") \
    .show(n = 10, truncate = 30) '''

'from pyspark.ml.feature import HashingTF, IDF\nhashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)\nidf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms\npipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf, label_stringIdx])\npipelineFit = pipeline.fit(tweets)\ndataset = pipelineFit.transform(tweets)\n(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)\n\nlr = LinearSVC(maxIter=10, tol=1E-6, fitIntercept=True)\n# instantiate the One Vs Rest Classifier.\novr = OneVsRest(classifier=lr)\n\n# train the multiclass model.\novrModel = ovr.fit(trainingData)\n\n# score the model on test data.\npredictions = ovrModel.transform(testData)\n\npredictions.filter(predictions[\'prediction\'] == 1)     .select("text","Category","label","prediction")     .show(n = 10, truncate = 30) '

In [108]:
 '''from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions) '''

0.6444058429485806

In [103]:
pl = predictions.select("label", "prediction")

In [104]:

pl.groupby('label').agg({'label': 'count'}).show()

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|        1085|
|  1.0|         600|
|  2.0|         650|
+-----+------------+



In [105]:
pl.groupby('prediction').agg({'prediction': 'count'}).show()

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             1144|
|       1.0|              575|
|       2.0|              616|
+----------+-----------------+



In [106]:
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100))

Model accuracy: 64.582%
