In [1]:
import findspark
findspark.init()
import pyspark 
import warnings
from pyspark.sql.functions import *
from pyspark.ml.feature import HashingTF, CountVectorizer
from pyspark.ml.feature import IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
from pyspark.sql import SparkSession

# Configure the session builder (master "local", application name below),
# then obtain the session via getOrCreate().
session_builder = SparkSession.builder.master("local").appName("Naive bayes Model")
spark = session_builder.getOrCreate()

In [3]:
######## Import labeled data (corpus)
# The corpus is read as a ';'-delimited CSV whose first row is the header;
# column types are inferred by Spark.
# The path uses a forward slash so the literal contains no backslash escape
# and resolves the same way on every platform.
df = (
    spark.read
    .option("header", "true")
    .option("delimiter", ";")
    .option("inferSchema", "true")
    .csv("Output/data_corpus.csv")
)
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [4]:
# Keep two columns: "Text" renamed to "sentence", and "label" cast to int.
selected_expressions = ["Text as sentence ", "cast(label as int) as label"]
sentenceData = df.selectExpr(*selected_expressions)

In [5]:
# Discard every row that contains a null value.
null_handler = sentenceData.na
sentenceData = null_handler.drop()

In [6]:
# Replace each comma in the text with a space before tokenization.
comma_free_sentence = regexp_replace('sentence', ',', ' ')
sentenceData = sentenceData.withColumn('sentence', comma_free_sentence)

In [7]:
# Report the dataset shape as (number of rows, number of columns).
row_count = sentenceData.count()
column_count = len(sentenceData.columns)
print((row_count, column_count))

(10904, 2)


In [8]:
########## Partition training & validation sets
# 80/20 random split with a fixed seed.
train_set, val_set = sentenceData.randomSplit(weights=[0.8, 0.2], seed=2000)

### TF-IDF

In [9]:
# TF-IDF pipeline: tokenize -> hashed term frequencies (2**16 buckets)
# -> IDF weighting (minDocFreq=5) -> Naive Bayes (smoothing=1).
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
hashtf = HashingTF(
    inputCol="words",
    outputCol='tf',
    numFeatures=2 ** 16,
)
idf = IDF(
    inputCol='tf',
    outputCol="features",
    minDocFreq=5,
)
nb = NaiveBayes(smoothing=1)
tfidf_stages = [tokenizer, hashtf, idf, nb]
pipeline = Pipeline(stages=tfidf_stages)

In [10]:
### Fit the TF-IDF pipeline on the training split and apply it to both splits
pipelineFit = pipeline.fit(train_set)
# Validation predictions are kept for the metric cells that follow.
predictions_tf = pipelineFit.transform(val_set)
# Preview the first rows of the transformed training split.
train_df = pipelineFit.transform(train_set)
train_df.show(n=5)

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|            sentence|label|               words|                  tf|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    غبي     امس ا...|    0|[, , , , غبي, , ,...|(65536,[2049,2488...|(65536,[2049,2488...|[-347.75393870900...|[0.99504370273935...|       0.0|
|    للاسف ـ بعض ـ...|    0|[, , , , للاسف, ـ...|(65536,[12736,146...|(65536,[12736,146...|[-167.69766584039...|[0.99999998732112...|       0.0|
|   بعدين هالعالم ...|    0|[, , , بعدين, هال...|(65536,[1390,2168...|(65536,[1390,2168...|[-401.99481174627...|[0.80530086483193...|       0.0|
|   تراي ألغيت حسا...|    0|[, , , تراي, ألغي...|(65536,[2356,6840...|(65536,[2356,6840...|[-490.46329807627...|[2.50975535212321.

In [11]:
### Evaluation of Model

In [12]:
# F1 score of the TF-IDF model on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1_tf = evaluator.evaluate(predictions_tf)
f1_tf_message = "F1 score  = {0:.2%}".format(f1_tf)
print(f1_tf_message)

F1 score  = 81.31%


In [13]:
# Weighted precision of the TF-IDF model on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    labelCol="label",
    predictionCol="prediction",
)
precision_tf = evaluator.evaluate(predictions_tf)
precision_tf_message = "Precision = {0:.2%}".format(precision_tf)
print(precision_tf_message)

Precision = 82.00%


In [14]:
# Weighted recall of the TF-IDF model on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol="label",
    predictionCol="prediction",
)
recall_tf = evaluator.evaluate(predictions_tf)
recall_tf_message = "Recall = {0:.2%}".format(recall_tf)
print(recall_tf_message)

Recall = 81.44%


### Bag of words

In [15]:
## Build the bag-of-words pipeline
# tokenize -> term counts (vocabulary capped at 2**16) -> Naive Bayes (smoothing=1)
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
cv = CountVectorizer(
    inputCol="words",
    outputCol='features',
    vocabSize=2 ** 16,
)
nb = NaiveBayes(smoothing=1)
bow_stages = [tokenizer, cv, nb]
pipeline = Pipeline(stages=bow_stages)

In [16]:
## Fit the bag-of-words pipeline on the training split and apply it to both splits
pipelineFit = pipeline.fit(train_set)
# Validation predictions are kept for the metric cells that follow.
predictions_cv = pipelineFit.transform(val_set)
# Preview the first rows of the transformed training split.
train_df = pipelineFit.transform(train_set)
train_df.show(n=5)

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|            sentence|label|               words|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|    غبي     امس ا...|    0|[, , , , غبي, , ,...|(33713,[1,6,31,37...|[-126.94576332948...|[0.95589386375133...|       0.0|
|    للاسف ـ بعض ـ...|    0|[, , , , للاسف, ـ...|(33713,[1,5,26,92...|[-49.793908211457...|[0.94939841822302...|       0.0|
|   بعدين هالعالم ...|    0|[, , , بعدين, هال...|(33713,[1,21,25,1...|[-149.78066391473...|[0.98148174156400...|       0.0|
|   تراي ألغيت حسا...|    0|[, , , تراي, ألغي...|(33713,[1,10,15,1...|[-153.13229815384...|[0.36174043228756...|       2.0|
|   عزازي ومزعل فر...|    0|[, , , عزازي, ومز...|(33713,[1,127,131...|[-60.668158619948...|[0.66093369190816...|       0.0|
+-------

In [17]:
## Evaluation of the bag-of-words model
# F1 score on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1_cv = evaluator.evaluate(predictions_cv)
f1_cv_message = "F1 score = {0:.2%}".format(f1_cv)
print(f1_cv_message)

F1 score = 84.75%


In [18]:
# Weighted precision of the bag-of-words model on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    labelCol="label",
    predictionCol="prediction",
)
precision_cv = evaluator.evaluate(predictions_cv)
precision_cv_message = "Precision = {0:.2%}".format(precision_cv)
print(precision_cv_message)

Precision = 86.23%


In [19]:
# Weighted recall of the bag-of-words model on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol="label",
    predictionCol="prediction",
)
recall_cv = evaluator.evaluate(predictions_cv)
recall_cv_message = "Recall = {0:.2%}".format(recall_cv)
print(recall_cv_message)

Recall = 84.93%


### N-gram models

In [20]:
from pyspark.ml.feature import NGram

In [21]:
## Build the bigram pipeline
# tokenize -> 2-grams -> n-gram counts (vocabulary capped at 2**16)
# -> Naive Bayes (smoothing=1)
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
ngram = NGram(
    inputCol="words",
    outputCol="ngrams",
    n=2,
)
cv = CountVectorizer(
    inputCol="ngrams",
    outputCol='features',
    vocabSize=2 ** 16,
)
nb = NaiveBayes(smoothing=1)
ngram_stages = [tokenizer, ngram, cv, nb]
pipeline = Pipeline(stages=ngram_stages)

In [22]:
## Fit the bigram pipeline on the training split and apply it to both splits
pipelineFit = pipeline.fit(train_set)
# Validation predictions are kept for the metric cells that follow.
predictions_ng = pipelineFit.transform(val_set)
train_df = pipelineFit.transform(train_set)

In [23]:
# Preview five transformed training rows as a pandas frame.
# The limit is applied on the Spark side so only those rows (including their
# feature vectors) are collected to the driver, not the whole training split.
train_df.limit(5).toPandas()

Unnamed: 0,sentence,label,words,ngrams,features,rawPrediction,probability,prediction
0,غبي امس الشوط الاول كان مليون خطأ سواه...,0,"[, , , , غبي, , , , , امس, الشوط, الاول, كان, ...","[ , , , غبي, غبي , , , , امس, امس الشوط...","(6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-138.67972596430948, -148.68885111115392, -14...","[0.9848509991684783, 4.430601701027925e-05, 0....",0.0
1,للاسف ـ بعض ـ البشر,0,"[, , , , للاسف, ـ, بعض, ـ, البشر]","[ , , , للاسف, للاسف ـ, ـ بعض, بعض ـ, ـ البشر]","(3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-47.27187546443951, -51.141152012413265, -49....","[0.8762638452251387, 0.018290662629250255, 0.1...",0.0
2,بعدين هالعالم كله خطا وحنا صح رايح تكتشفين ...,0,"[, , , بعدين, هالعالم, كله, خطا, وحنا, صح, راي...","[ , , بعدين, بعدين هالعالم, هالعالم كله, كله...","(2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-97.03067181241558, -103.98411083719797, -101...","[0.990280565232438, 0.0009460586418517492, 0.0...",0.0
3,تراي ألغيت حسابي تجربة اسبوع كفاية تخليني...,0,"[, , , تراي, ألغيت, حسابي, , , تجربة, اسبوع, ك...","[ , , تراي, تراي ألغيت, ألغيت حسابي, حسابي ,...","(3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-154.80930360421678, -165.56950339404435, -15...","[0.9929492762715695, 2.1078110197880356e-05, 0...",0.0
4,عزازي ومزعل فرحان افضل منه,0,"[, , , عزازي, ومزعل, فرحان, افضل, منه]","[ , , عزازي, عزازي ومزعل, ومزعل فرحان, فرحان...","(2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-43.60853133280692, -46.626094110156544, -44....","[0.7359144954835317, 0.03600115999483327, 0.22...",0.0


In [24]:
## Evaluation of the bigram model
# F1 score on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1_ng = evaluator.evaluate(predictions_ng)
f1_ng_message = "F1 score  = {0:.2%}".format(f1_ng)
print(f1_ng_message)

F1 score  = 71.15%


In [25]:
# Weighted precision of the bigram model on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    labelCol="label",
    predictionCol="prediction",
)
precision_ng = evaluator.evaluate(predictions_ng)
precision_ng_message = "Precision  = {0:.2%}".format(precision_ng)
print(precision_ng_message)

Precision  = 77.10%


In [27]:
# Weighted recall of the bigram model on the validation predictions
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol="label",
    predictionCol="prediction",
)
recall_ng = evaluator.evaluate(predictions_ng)
recall_ng_message = "Recall  = {0:.2%}".format(recall_ng)
print(recall_ng_message)

Recall  = 71.94%


In [None]:
# Print Results 

In [29]:
import pandas as pd
import numpy as np

# Raw scores per model, in the order F1 / precision / recall.
l_tf = [f1_tf, precision_tf, recall_tf]
l_cv = [f1_cv, precision_cv, recall_cv]
l_ng = [f1_ng, precision_ng, recall_ng]


def _as_percent(scores):
    """Format fractions as percentages with exactly three decimals.

    A fixed-width format keeps trailing zeros (e.g. '71.150%'), so every
    cell of the summary table shows the same number of decimals.
    """
    return ["{0:.3%}".format(score) for score in scores]


l_tf_set = _as_percent(l_tf)
l_cv_set = _as_percent(l_cv)
l_ng_set = _as_percent(l_ng)

# One column per model, one row per metric; displayed transposed so that
# each model is a row.
index = ['F1 score', 'Precision', 'Recall']
metrics = pd.DataFrame(
    {
        'TF': l_tf_set,
        'bag of words': l_cv_set,
        'N-grams': l_ng_set,
    },
    index=index,
)
metrics.transpose()

Unnamed: 0,F1 score,Precision,Recall
TF,81.306%,82.002%,81.436%
bag of words,84.754%,86.233%,84.932%
N-grams,71.15%,77.103%,71.941%
