In [9]:
# Read the JSON file that contains the tweets and their labels.
# The same file is used for both training and testing.
from pyspark.sql import SparkSession

# Obtain the session explicitly rather than relying on a `spark` variable that
# only exists when the environment pre-creates one. getOrCreate() hands back
# the already-running session when there is one.
spark = SparkSession.builder.getOrCreate()

jobDir = "tweets111.json"
tweets = spark.read.json([jobDir])
tweets.count()  # number of tweets in the file

9988

In [10]:
# Keep only the two fields used downstream: the tweet text and its category.
tweets = tweets.select("text", "Category")

# Print the resulting schema as a quick sanity check.
tweets.printSchema()

root
 |-- text: string (nullable = true)
 |-- Category: string (nullable = true)



In [11]:
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# Tokenizer stage: reads the "text" column and writes the tokens to "words".
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
)

# Count-vector stage: converts each token list in "words" into a vector of
# token counts stored in "features".
countVectors = CountVectorizer(
    inputCol="words",
    outputCol="features",
)

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# StringIndexer encodes the string "Category" column as numeric indices
# in a new "label" column.
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")

# Feature pipeline: tokenise -> index labels -> count-vectorise.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        label_stringIdx,
        countVectors,
    ]
)

# Fit the pipeline on ALL tweets and transform them.
# NOTE(review): the fit happens before the train/test split, so the fitted
# stages have seen the rows that later end up in the test set.
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)

# Preview five rows without truncating column contents.
dataset.show(5, False)

+-------------------------------------------------+--------+-----------------------------------------------------------+-----+----------------------------------------------------------------------------------------------+
|text                                             |Category|words                                                      |label|features                                                                                      |
+-------------------------------------------------+--------+-----------------------------------------------------------+-----+----------------------------------------------------------------------------------------------+
|والل عجب عشان كتاب انجليز صعب كلم                |POS     |[والل, عجب, عشان, كتاب, انجليز, صعب, كلم]                  |1.0  |(19722,[47,92,176,419,588,1350,6837],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                           |
|انه رنامج رايع يترجم كلم قطع باقص سرع            |POS     |[انه, رنامج, رايع, يترجم, كلم, قطع, باقص, سرع]      

In [13]:
# Partition the featurised data: 80% for training, 20% for testing.
# A fixed seed is passed to randomSplit.
trainingData, testData = dataset.randomSplit([0.8, 0.2], seed=100)

print("Training Dataset Count: {}".format(trainingData.count()))
print("Test Dataset Count: {}".format(testData.count()))

Training Dataset Count: 7957
Test Dataset Count: 2031


In [14]:
# Logistic regression trained on the count-vector features.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

# Score the held-out test rows with the fitted model.
predictions = lrModel.transform(testData)

# Show five test rows predicted as class 0, ordered by the "probability"
# column in descending order, without truncating column contents.
(
    predictions
    .filter(predictions["prediction"] == 0)
    .select("text", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(5, False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------+-----+----------+
|text                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |probabili

In [15]:
# Report the F1 score of the count-vector model on the test set.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
# The evaluator is configured with metricName="f1", so the value below is an
# F1 score, not an accuracy. The variable name `accuracy` is kept only so any
# code that reads it keeps working; the printed label states the real metric.
accuracy = evaluator.evaluate(predictions)
print("Model F1 score: ", accuracy)

Model Accuracy:  0.7026545276911003


In [16]:
# Compare the true-label and predicted-label distributions on the test set,
# then compute plain accuracy.
pl = predictions.select("label", "prediction")

# Row count per true label.
print("the label from our dataset")
pl.groupby('label').agg({'label': 'count'}).show()

# Row count per predicted label.
print("the label from test")
pl.groupby('prediction').agg({'prediction': 'count'}).show()

# Accuracy = rows where prediction equals label / all rows.
# Computed once: the earlier duplicate of this expression discarded its result
# and only triggered two extra Spark jobs.
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100))

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             1157|
|       1.0|              600|
|       2.0|              274|
+----------+-----------------+

Model accuracy: 71.147%


In [17]:
# Logistic regression trained on TF-IDF features.
from pyspark.ml.feature import HashingTF, IDF

# Hash the tokens in "words" into a 10000-dimensional vector ("rawFeatures").
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=10000,
)
# Apply IDF weighting to "rawFeatures"; minDocFreq=5 removes sparse terms.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
    minDocFreq=5,
)

# Rebuild the feature pipeline with the TF-IDF stages, fit it on all tweets
# and transform them.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        hashingTF,
        idf,
        label_stringIdx,
    ]
)
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)

# Same 80/20 split and seed as used for the count-vector model.
trainingData, testData = dataset.randomSplit([0.8, 0.2], seed=100)

# Train and score with the same logistic regression settings.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Show five test rows predicted as class 0, ordered by "probability"
# descending, without truncating column contents.
(
    predictions
    .filter(predictions["prediction"] == 0)
    .select("text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(5, False)
)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+--------------------------------------------------------------+-----+----------+
|text                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |

In [18]:
# Report the F1 score of the TF-IDF model on the test set.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",metricName="f1")
# metricName="f1" means this value is an F1 score, not an accuracy. The
# variable name `accuracy` is kept only so any code that reads it keeps
# working; the printed label states the real metric.
accuracy = evaluator.evaluate(predictions)
print ("Model F1 score: ", accuracy)

Model Accuracy:  0.6729132795659876


In [19]:
# Compare the true-label and predicted-label distributions on the test set,
# then compute plain accuracy.
pl = predictions.select("label", "prediction")

# Row count per true label.
print("the label from our dataset")
pl.groupby('label').agg({'label': 'count'}).show()

# Row count per predicted label.
print("the label from test")
pl.groupby('prediction').agg({'prediction': 'count'}).show()

# Accuracy = rows where prediction equals label / all rows.
# Computed once: the earlier duplicate of this expression discarded its result
# and only triggered two extra Spark jobs.
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100))

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             1151|
|       1.0|              597|
|       2.0|              283|
+----------+-----------------+

Model accuracy: 68.390%


In [20]:
# Cross-validated logistic regression on the count-vector features.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Rebuild the count-vector feature pipeline, fit it on all tweets and
# transform them.
pipeline = Pipeline(stages=[regexTokenizer,countVectors, label_stringIdx])
pipelineFit = pipeline.fit(tweets)
dataset = pipelineFit.transform(tweets)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Parameter grid for cross-validation: 3 x 3 = 9 candidate settings.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())

# 5-fold CrossValidator.
# `evaluator` is not created in this cell; it must already exist from an
# earlier evaluation cell.
# The seed pins the fold assignment so repeated runs select the same model,
# matching the seeded train/test split above.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5,
                    seed=100)
cvModel = cv.fit(trainingData)

# Score the held-out test rows with the fitted cross-validation model.
predictions = cvModel.transform(testData)

In [21]:
# Show five test rows predicted as class 0, ordered by the "probability"
# column in descending order, without truncating column contents.
(
    predictions
    .filter(predictions["prediction"] == 0)
    .select("text", "Category", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(5, False)
)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+---------------------------------------------------------------+-----+----------+
|text                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [22]:
# Report the F1 score of the cross-validated model on the test set.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# The metric is spelled out instead of left to the evaluator's default, so the
# reported number is unambiguous.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="f1")
# This value is an F1 score, not an accuracy. The variable name `accuracy` is
# kept only so any code that reads it keeps working; the printed label states
# the real metric.
accuracy = evaluator.evaluate(predictions)
print ("Model F1 score: ", accuracy)

Model Accuracy:  0.7104643740264259


In [23]:
# Compare the true-label and predicted-label distributions on the test set,
# then compute plain accuracy.
pl = predictions.select("label", "prediction")

# Row count per true label.
print("the label from our dataset")
pl.groupby('label').agg({'label': 'count'}).show()

# Row count per predicted label.
print("the label from test")
pl.groupby('prediction').agg({'prediction': 'count'}).show()

# Accuracy = rows where prediction equals label / all rows.
# Computed once: the earlier duplicate of this expression discarded its result
# and only triggered two extra Spark jobs.
acc = pl.filter(pl.label == pl.prediction).count() / pl.count()
print("Model accuracy: %.3f%%" % (acc * 100))

the label from our dataset
+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|         940|
|  1.0|         639|
|  2.0|         452|
+-----+------------+

the label from test
+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             1081|
|       1.0|              616|
|       2.0|              334|
+----------+-----------------+

Model accuracy: 71.590%
