In [None]:
!pip install pyspark
!unzip cvModel1.zip
!unzip pipelineFit.zip

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=0ac105f8c300e49bc4d44387504a3bf1244f840d95eb207e5dc9bc5e04daed54
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
Archive:  cvModel1.zip
   creating: cvModel1/
   creating: cvModel1/bestModel/
   creating: cvModel1/bestModel/data/
 extracting: cvModel1/bestModel/data/.part-00000-e0e6fbd7-dd48-4880-82b5-6e2637cd0c07-c000.snappy.parquet.crc  
 extracting: cvModel1/bestModel/data/._SUCCESS

In [None]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import col
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, Word2Vec, Tokenizer, StopWordsRemover
from pyspark.ml.classification import LogisticRegression, NaiveBayes, LinearSVC, OneVsRest, MultilayerPerceptronClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.pipeline import PipelineModel

In [None]:
# Reuse the active SparkContext when one already exists. Calling the bare
# constructor a second time (e.g. when this cell is re-run) raises an error,
# because only one SparkContext may be active per process.
sc = SparkContext.getOrCreate()
# SQL entry point wrapped around that context; used below to read the CSV.
sqlContext = SQLContext(sc)



In [None]:
# Explicit two-column schema for the validation CSV: an integer "label"
# followed by a string "text" (both nullable, the StructField default).
customSchema = (
    StructType()
    .add(StructField("label", IntegerType()))
    .add(StructField("text", StringType()))
)

In [None]:
# Read the validation CSV (first row is a header) using the explicit schema
# instead of letting Spark infer column types.
df_val = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .schema(customSchema)
    .load('cleaned_twitter_validation.csv')
)

In [None]:
# Feature-extraction stage definitions. None of these objects is referenced by
# the cells visible in this notebook, which apply the saved 'pipelineFit'
# model instead; they are kept in case other cells rely on them.

# Two alternative tokenizers, both writing to the "tokens" column.
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="tokens",
    pattern=r" +",
)
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="tokens",
)

# Alternative vectorizers that turn "tokens" into a "features" column.
countVectors = CountVectorizer(
    inputCol="tokens",
    outputCol="features",
    vocabSize=15000,
    minDF=5,
)
word2Vec = Word2Vec(
    vectorSize=100,
    minCount=0,
    maxIter=20,
    inputCol="tokens",
    outputCol="features",
)

# TF-IDF pair: hashed term frequencies go to "rawFeatures", then IDF rescales
# them into "features".
hashingTF = HashingTF(
    inputCol="tokens",
    outputCol="rawFeatures",
    numFeatures=15000,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
    minDocFreq=10,  # minDocFreq: remove sparse terms
)

In [None]:
# Load the already-fitted feature pipeline saved under 'pipelineFit' and apply
# it to the validation rows. Nothing is fitted here: the validation data is
# only transformed by the saved model.
pipeline = PipelineModel.load('pipelineFit')
dataset_val = pipeline.transform(df_val)
# Preview the first five transformed rows.
dataset_val.show(5)

+-----+--------------------+--------------------+--------------------+
|label|                text|              tokens|            features|
+-----+--------------------+--------------------+--------------------+
|    0|mentioned faceboo...|[mentioned, faceb...|(13363,[3,16,23,2...|
|    2|bbc news amazon b...|[bbc, news, amazo...|(13363,[2,34,138,...|
|    1|why pay  word  fu...|[why, pay, word, ...|(13363,[90,265,69...|
|    1|csgo matchmaking ...|[csgo, matchmakin...|(13363,[0,115,262...|
|    2|now  president sl...|[now, president, ...|(13363,[7,32,143,...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Load the saved cross-validation model from the 'cvModel1' directory.
cvModel = CrossValidatorModel.load('cvModel1')
# Score the featurised validation set; the following cells read the
# "prediction" and "probability" columns from the resulting DataFrame.
predictions = cvModel.transform(dataset_val)

In [None]:
# Show ten rows that were predicted as class 0, ordered by the "probability"
# column in descending order, with long cells truncated to 110 characters.
(
    predictions
    .filter(col("prediction") == 0)
    .select("text", "probability", "label", "prediction")
    .orderBy(col("probability").desc())
    .show(n=10, truncate=110)
)

+--------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+-----+----------+
|                                                                                                          text|                                                                           probability|label|prediction|
+--------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+-----+----------+
|words gratitude trevorphilipsstanaccount love   gta tumblr fandom particular   creating  talented  beautifu...|   [0.9988016127239734,8.530927674802259E-6,6.289466453168379E-4,5.609097030348915E-4]|    0|       0.0|
|the nigeria national team   ranked  29th best team  world  3rd africa  latest fifa world rankings  its  fir...| [0.9981446665691251

In [None]:
# Score the predictions against the "label" column. The label column and the
# F1 metric are the evaluator's defaults, written out so the number displayed
# below is self-describing.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.9037828332690099