In [1]:
# Read the JSON file that contains the tweets and their labels.
# The same file is used for both training and testing.
from pyspark.sql import SparkSession

# Bind `spark` explicitly instead of relying on the kernel to inject it:
# getOrCreate() hands back the already-running session when there is one,
# and starts a new one otherwise, so this cell also works on a fresh kernel.
spark = SparkSession.builder.getOrCreate()

jobDir = "tweets1.json"
tweets = spark.read.json([jobDir])
tweets.count()  # number of tweets in the file

7988

In [2]:
# Keep only the two JSON fields the rest of the notebook works with.
selected_columns = ["text", "Category"]
tweets = tweets.select(*selected_columns)

# Print the resulting column names and types.
tweets.printSchema()

root
 |-- text: string (nullable = true)
 |-- Category: string (nullable = true)



In [3]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import CountVectorizer, RegexTokenizer

# Tokenizer stage: reads the "text" column and writes a "words" column.
regexTokenizer = RegexTokenizer(outputCol="words", inputCol="text")

# Count-vectorizer stage: turns each "words" token list into a "features"
# vector of token counts.
countVectors = CountVectorizer(outputCol="features", inputCol="words")

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# StringIndexer encodes the string "Category" column as numeric indices
# in a new "label" column.
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")

# Feature pipeline: tokenize, index the labels, then count-vectorize.
feature_stages = [regexTokenizer, label_stringIdx, countVectors]
pipeline = Pipeline(stages=feature_stages)

# NOTE(review): the pipeline is fitted on ALL tweets, before the train/test
# split performed in a later cell, so the fitted stages have also seen the
# rows that end up in the test set.
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)

# Preview 10 rows, truncating every cell to 10 characters.
dataset.show(n=10, truncate=10)

+----------+--------+----------+-----+----------+
|      text|Category|     words|label|  features|
+----------+--------+----------+-----+----------+
|والل عج...|     POS|[والل, ...|  2.0|(17070,...|
|انه رنا...|     POS|[انه, ر...|  2.0|(17070,...|
|رنامج ج...|     POS|[رنامج,...|  2.0|(17070,...|
|قمه روع...|     POS|[قمه, ر...|  2.0|(17070,...|
|جميل اش...|     POS|[جميل, ...|  2.0|(17070,...|
|عاش ايد...|     POS|[عاش, ا...|  2.0|(17070,...|
|برنامج ...|     POS|[برنامج...|  2.0|(17070,...|
|حلو وال...|     POS|[حلو, و...|  2.0|(17070,...|
|برنامج ...|     POS|[برنامج...|  2.0|(17070,...|
|رايع جد...|     POS|[رايع, ...|  2.0|(17070,...|
+----------+--------+----------+-----+----------+
only showing top 10 rows



In [5]:
# Split the featurized dataset into training (80%) and test (20%) sets.
# The fixed seed keeps the split the same from run to run.
trainingData, testData = dataset.randomSplit([0.8, 0.2], seed=100)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 6394
Test Dataset Count: 1594


In [6]:
# Logistic regression trained on the count-vector features.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

# Score the held-out test set; adds the prediction/probability columns.
predictions = lrModel.transform(testData)

# Inspect the rows predicted as class 0, highest probability first.
predicted_class0 = predictions.filter(predictions['prediction'] == 0)
(
    predicted_class0
    .select("text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=10)
)

+----------+--------+-----------+-----+----------+
|      text|Category|probability|label|prediction|
+----------+--------+-----------+-----+----------+
|😂 سلاح...|     NEG| [0.9996...|  0.0|       0.0|
|اكر واح...|     NEG| [0.9898...|  0.0|       0.0|
|هدول ما...|     POS| [0.9876...|  2.0|       0.0|
|اسخف من...|     NEG| [0.9719...|  0.0|       0.0|
|تعلم ان...|     NEG| [0.9698...|  0.0|       0.0|
|تعلم ان...|     NEG| [0.9698...|  0.0|       0.0|
|استغرب ...|     NEG| [0.9681...|  0.0|       0.0|
|متعب عن...|     NEG| [0.9670...|  0.0|       0.0|
|“بطل تف...|     NEG| [0.9649...|  0.0|       0.0|
|ست زينب...|     NEG| [0.9625...|  0.0|       0.0|
+----------+--------+-----------+-----+----------+
only showing top 10 rows



In [7]:
# Score the count-vector model on the test-set predictions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName="f1": evaluate() returns an F1 score, NOT plain accuracy.
# `evaluator` is also reused by the cross-validation cells further down.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

# The variable keeps its historical name `accuracy`, but it holds the F1
# score; the printed label now says so instead of claiming "Accuracy".
accuracy = evaluator.evaluate(predictions)
print("Model F1 score: ", accuracy)

Model Accuracy:  0.6743941550173962


In [8]:
# Compare true labels with predictions on the test set.
pl = predictions.select("label", "prediction")

# Number of test rows per true label.
print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

# Number of test rows per predicted label.
print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

# Accuracy = share of rows whose prediction equals the label.
# (The ratio used to be computed twice, with the first result thrown away,
# which cost two extra Spark count jobs; it is now computed once.)
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              943|
|       1.0|              289|
|       2.0|              362|
+----------+-----------------+

Model accuracy: 68.256%


In [9]:
# Logistic regression trained on TF-IDF features instead of raw counts.
from pyspark.ml.feature import HashingTF, IDF

# Hash the "words" tokens into a 10000-slot "rawFeatures" vector.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
# Re-weight "rawFeatures" into "features"; minDocFreq=5 is meant to drop sparse terms.
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

# Rebuild the dataset with the TF-IDF stages (replaces the earlier
# pipeline / dataset / split variables of the count-vector run).
tfidf_stages = [regexTokenizer, hashingTF, idf, label_stringIdx]
pipeline = Pipeline(stages=tfidf_stages)
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
trainingData, testData = dataset.randomSplit([0.8, 0.2], seed=100)

# Same logistic-regression settings as the count-vector model.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Inspect the rows predicted as class 0, highest probability first.
predicted_class0 = predictions.filter(predictions['prediction'] == 0)
(
    predicted_class0
    .select("text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=100)
)


+----------------------------------------------------------------------------------------------------+--------+---------------------------------------------------------------+-----+----------+
|                                                                                                text|Category|                                                    probability|label|prediction|
+----------------------------------------------------------------------------------------------------+--------+---------------------------------------------------------------+-----+----------+
|😂 سلاح موجود مقدم دستور ليش الكل عم يقول انو قدم استقال خارجما اي دليل ا… سلاح موجود مقدم دستور ...|     NEG|[0.9998595001194233,1.1238710338103115E-4,2.811277719581919E-5]|  0.0|       0.0|
|يقول تطورا خطير منطقه وكء اكثرم مليون شهيد عراق سور عشرا الوف يمن ليبي لسط تدمير حلب موصل تطورا س...|     NEG|  [0.9883705526327038,0.00477643783829598,0.006853009529000155]|  0.0|       0.0|
|ايا عمر فاروق عود جيوش فرس تنهي تام

In [10]:
# Score the TF-IDF model on the test-set predictions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName="f1": evaluate() returns an F1 score, NOT plain accuracy.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="f1")

# The variable keeps its historical name `accuracy`, but it holds the F1
# score; the printed label now says so instead of claiming "Accuracy".
accuracy = evaluator.evaluate(predictions)
print("Model F1 score: ", accuracy)

Model Accuracy:  0.6444852381016305


In [11]:
# Compare true labels with predictions on the test set.
pl = predictions.select("label", "prediction")

# Number of test rows per true label.
print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

# Number of test rows per predicted label.
print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

# Accuracy = share of rows whose prediction equals the label.
# (The ratio used to be computed twice, with the first result thrown away,
# which cost two extra Spark count jobs; it is now computed once.)
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              928|
|       1.0|              316|
|       2.0|              350|
+----------+-----------------+

Model accuracy: 65.307%


In [None]:
# Cross-validation of the logistic regression on count-vector features.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Rebuild the count-vector dataset and the 80/20 split.
count_stages = [regexTokenizer, countVectors, label_stringIdx]
pipeline = Pipeline(stages=count_stages)
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
trainingData, testData = dataset.randomSplit([0.8, 0.2], seed=100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Parameter grid searched by the cross-validator.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.regParam, [0.1, 0.3, 0.5])  # regularization parameter
grid_builder = grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])  # elastic net parameter (Ridge = 0)
paramGrid = grid_builder.build()

# 5-fold cross-validator; `evaluator` comes from an earlier cell.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)
cvModel = cv.fit(trainingData)

# Score the test set with the model returned by cross-validation.
predictions = cvModel.transform(testData)

In [88]:
# Cross-validation of the logistic regression on count-vector features,
# searching over regParam, maxIter and elasticNetParam.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Rebuild the count-vector dataset and the 80/20 split.
pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Parameter grid searched by the cross-validator (3 x 3 x 3 combinations).
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5])
             .addGrid(lr.maxIter, [10, 20, 50])
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])
             .build())

# The estimator must be `lr`: the grid tunes lr's params, and trainingData
# has already been through the feature pipeline. (`pipeline` contains no
# classifier, so it cannot produce the predictions the evaluator scores.)
# `evaluator` comes from an earlier cell.
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

# Fit the validator defined in THIS cell; the cell used to call `cv.fit`,
# a leftover name from another cell, so this grid was never actually used.
cvModel = crossval.fit(trainingData)

# Score the test set with the model returned by cross-validation.
predictions = cvModel.transform(testData)

In [112]:
# Score the cross-validated model on the test-set predictions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The evaluator's default metric is F1; spell it out so the reported number
# is not mistaken for plain accuracy.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="f1")

# The variable keeps its historical name `accuracy`, but it holds the F1
# score; the printed label now says so instead of claiming "Accuracy".
accuracy = evaluator.evaluate(predictions)
print("Model F1 score: ", accuracy)

Model Accuracy:  0.6782760772380454


In [90]:
# Compare true labels with predictions on the test set.
pl = predictions.select("label", "prediction")

# Number of test rows per true label.
print("the label from our dataset") 
pl.groupby('label').agg({'label': 'count'}).show()

# Number of test rows per predicted label.
print("the label from test") 
pl.groupby('prediction').agg({'prediction': 'count'}).show()

# Accuracy = share of rows whose prediction equals the label.
# (The ratio used to be computed twice, with the first result thrown away,
# which cost two extra Spark count jobs; it is now computed once.)
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100)) 

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         728|
|  1.0|         412|
|  2.0|         454|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|              882|
|       1.0|              328|
|       2.0|              384|
+----------+-----------------+

Model accuracy: 68.319%


In [116]:
# Persist the most recently fitted logistic-regression model to disk.
lrModel.write().save("myModelPath4")

In [118]:
# Persist the cross-validated model to disk.
cvModel.write().save("vModel2")

In [119]:
from pyspark.ml.tuning import  CrossValidatorModel

In [120]:
# Reload the cross-validated model from the path it was saved to.
# The notebook saves cvModel to "vModel2"; the old path "vModel" is not
# written anywhere in this notebook.
sameCVModel = CrossValidatorModel.load("vModel2")