In [None]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.pipeline import PipelineModel

In [None]:
# Reuse the active SparkContext if one already exists in this kernel.
# Calling SparkContext() a second time (e.g. when this cell is re-run)
# raises a ValueError because only one context may be active at a time;
# getOrCreate() makes the cell safe to execute repeatedly.
sc = SparkContext.getOrCreate()
# SQLContext wraps the context and provides the DataFrame reader used below.
sqlContext = SQLContext(sc)



In [None]:
# Explicit schema for the CSV input: an integer `label` column followed by
# a string `text` column (both nullable, which is the StructField default).
customSchema = (
    StructType()
    .add("label", IntegerType())
    .add("text", StringType())
)

In [None]:
# Read the validation CSV, treating its first row as a header and applying
# the explicit schema instead of letting Spark infer column types.
df_val = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .schema(customSchema)
    .load('cleaned_twitter_validation.csv')
)

In [None]:
# Load the previously saved pipeline model from 'pipelineFit' and apply it to
# the validation documents. Nothing is fitted in this cell: the loaded model
# is only used to transform df_val.
pipeline = PipelineModel.load('pipelineFit')
dataset_val = pipeline.transform(df_val)
# Preview the first five transformed rows.
dataset_val.show(5)

+-----+--------------------+--------------------+--------------------+
|label|                text|              tokens|            features|
+-----+--------------------+--------------------+--------------------+
|    0|mentioned faceboo...|[mentioned, faceb...|(13363,[3,16,23,2...|
|    2|bbc news amazon b...|[bbc, news, amazo...|(13363,[2,34,138,...|
|    1|why pay  word  fu...|[why, pay, word, ...|(13363,[90,265,69...|
|    1|csgo matchmaking ...|[csgo, matchmakin...|(13363,[0,115,262...|
|    2|now  president sl...|[now, president, ...|(13363,[7,32,143,...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Load the saved cross-validation model from 'cvModel1'.
cvModel = CrossValidatorModel.load('cvModel1')
# Score the pipeline-transformed validation set; later cells read the
# 'probability' and 'prediction' columns from this result.
predictions = cvModel.transform(dataset_val)

In [None]:
# Display ten rows that were predicted as class 0, ordered by the
# 'probability' column in descending order; text is truncated at 110 chars.
# NOTE(review): 'probability' is a vector column, so this relies on Spark's
# ordering of the whole vector. The captured output suggests it is driven by
# the first (class-0) entry, but confirm before reusing for other classes.
(
    predictions
    .filter(predictions['prediction'] == 0)
    .select("text", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=110)
)

+--------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+-----+----------+
|                                                                                                          text|                                                                           probability|label|prediction|
+--------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+-----+----------+
|words gratitude trevorphilipsstanaccount love   gta tumblr fandom particular   creating  talented  beautifu...|   [0.9988016127239734,8.530927674802259E-6,6.289466453168379E-4,5.609097030348915E-4]|    0|       0.0|
|the nigeria national team   ranked  29th best team  world  3rd africa  latest fifa world rankings  its  fir...| [0.9981446665691251

In [None]:
# Evaluate the predictions against the 'label' column. labelCol and
# metricName are written out with the evaluator's documented defaults, so it
# is explicit that the value displayed by this cell is the F1 score.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.9037828332690099