# Bayesian_Inference

The dataset can be downloaded from [UCI Machine Learning Respository](https://archive.ics.uci.edu/ml/machine-learning-databases/00228/)

## Instantiate a spark session

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SpamClassifier').getOrCreate()

## Load text data

In [4]:
# Load data and rename column
df = spark.read.option("header", "false") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv("SMSSpamCollection.txt") \
    .withColumnRenamed("_c0", "class") \
    .withColumnRenamed("_c1", "text")

df.limit(10).show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
+-----+--------------------+



## Pipeline Stages

In [5]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import NaiveBayes

In [6]:
stages = []
# 1. clean data and tokenize sentences using RegexTokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", pattern="\\W+")
stages += [regexTokenizer]

# 2. CountVectorize the data
cv = CountVectorizer(inputCol="tokens", outputCol="token_features", minDF=2.0)#, vocabSize=3, minDF=2.0
stages += [cv]

# 3. Convert the labels to numerical values using binariser
indexer = StringIndexer(inputCol="class", outputCol="label")
stages += [indexer]

# 4. Vectorise features using vectorassembler
vecAssembler = VectorAssembler(inputCols=['token_features'], outputCol="features")
stages += [vecAssembler]

[print('\n', stage) for stage in stages]


 RegexTokenizer_d4ae0ba9e0da

 CountVectorizer_ce863a6732fe

 StringIndexer_3d2ef229b0c5

 VectorAssembler_330cb9a0ff40


[None, None, None, None]

## Fit Pipeline

In [7]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
data = pipeline.fit(df).transform(df)

## Split dataset into train and test

In [8]:
train, test = data.randomSplit([0.7, 0.3], seed = 2018)

## Naive Bayes Implementation

In [9]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(train)

## Prediction

In [10]:
predictions = model.transform(test)
# Select results to view
predictions.limit(10).select("label", "prediction", "probability").show(truncate=False)

+-----+----------+------------------------------------------+
|label|prediction|probability                               |
+-----+----------+------------------------------------------+
|0.0  |0.0       |[0.9999996176179956,3.823820044882337E-7] |
|0.0  |0.0       |[0.9972054995602091,0.002794500439790882] |
|0.0  |0.0       |[0.9999999999978098,2.190326444063966E-12]|
|0.0  |0.0       |[0.9999999999999538,4.607804951342392E-14]|
|0.0  |0.0       |[0.999999999880886,1.1911406870203127E-10]|
|0.0  |0.0       |[0.999688852925206,3.1114707479388615E-4] |
|0.0  |0.0       |[0.9999999098737272,9.012627286140461E-8] |
|0.0  |0.0       |[0.9999950690131734,4.930986826665776E-6] |
|0.0  |0.0       |[0.9999795625725587,2.043742744135259E-5] |
|0.0  |0.0       |[0.9999063364041348,9.366359586510845E-5] |
+-----+----------+------------------------------------------+



## Model Evaluation

In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print ("Test Area Under ROC: ", accuracy)

Test Area Under ROC:  0.972052252090383


- Use <b>MulticlassClassificationEvaluator</b> to calculate the <b>f1_score</b>.

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

eval_f1 = MulticlassClassificationEvaluator(predictionCol="prediction")



f1score = eval_f1.evaluate(predictions)
print ("f1_score is: ", f1score)

f1_score is:  0.9894791154468984
