In [41]:
import pandas as pd

from pyspark.ml import Pipeline
# from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import *

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, udf
from pyspark.sql.types import ArrayType, IntegerType,  StringType

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
# from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import StringIndexer

# from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils

from pyspark.sql.types import IntegerType,BooleanType
from pyspark.ml.classification import RandomForestClassifier
import time
# from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.pipeline import PipelineModel

import matplotlib.pyplot as plt  
from sklearn.datasets import make_classification
from sklearn.metrics import multilabel_confusion_matrix

# Obtain the notebook-wide Spark session (an existing one is reused if present).
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

## Import and Clean Data from Subset Data File

In [42]:
# Load the subset CSV, taking column names from its header line.
# No schema is inferred here, so every column arrives as a string.
df = spark.read.option("header", True).csv("./jordan_subset")

In [43]:
# Peek at the first five rows of the raw frame.
df.show(n=5)

+------------+-------+-----------+---+---------+----------+------------+----------------------+--------------------+-------------+-------+--------------+------+-----+--------+------------------+-----+------------+--------------------+-------------+------+----------------+----------+
|subreddit_id|    _c0|created_utc|ups|  link_id|      name|score_hidden|author_flair_css_class|   author_flair_text|    subreddit|     id|removal_reason|gilded|downs|archived|            author|score|retrieved_on|                body|distinguished|edited|controversiality| parent_id|
+------------+-------+-----------+---+---------+----------+------------+----------------------+--------------------+-------------+-------+--------------+------+-----+--------+------------------+-----+------------+--------------------+-------------+------+----------------+----------+
|    t5_2qh33|5027245| 1430702078|  1|t3_34rftl|t1_cqxhekl|           0|                    NA|                  NA|        funny|cqxhekl|          

In [44]:
# Number of rows loaded from the subset file.
df.count()

4871463

In [45]:
# List the available column names before choosing which ones to keep.
df.columns

['subreddit_id',
 '_c0',
 'created_utc',
 'ups',
 'link_id',
 'name',
 'score_hidden',
 'author_flair_css_class',
 'author_flair_text',
 'subreddit',
 'id',
 'removal_reason',
 'gilded',
 'downs',
 'archived',
 'author',
 'score',
 'retrieved_on',
 'body',
 'distinguished',
 'edited',
 'controversiality',
 'parent_id']

In [46]:
# Narrow the frame to the subreddit id, the vote/score fields and the comment text.
df2 = df.select(
    'subreddit_id',
    'ups',
    'gilded',
    'score_hidden',
    'downs',
    'score',
    'controversiality',
    'body',
)
# Show the column types of the narrowed frame.
df2.dtypes

[('subreddit_id', 'string'),
 ('ups', 'string'),
 ('gilded', 'string'),
 ('score_hidden', 'string'),
 ('downs', 'string'),
 ('score', 'string'),
 ('controversiality', 'string'),
 ('body', 'string')]

In [47]:
# https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
# All selected columns are strings at this point, so cast the numeric fields
# to integers and the score_hidden flag to a boolean. Replacing a column via
# withColumn keeps its position, so the column order is unchanged.
for int_col in ('ups', 'downs', 'score', 'controversiality', 'gilded'):
    df2 = df2.withColumn(int_col, df2[int_col].cast(IntegerType()))
df2 = df2.withColumn('score_hidden', df2['score_hidden'].cast(BooleanType()))

# fillna(0) only fills numeric columns; columns of a non-matching type are
# ignored. Fill the boolean flag and the text column with type-matched
# defaults as well, so no nulls reach the Tokenizer / VectorAssembler stages.
df2 = (
    df2
    .fillna(0)
    .fillna(False, subset=['score_hidden'])
    .fillna('', subset=['body'])
)
df2.dtypes

[('subreddit_id', 'string'),
 ('ups', 'int'),
 ('gilded', 'int'),
 ('score_hidden', 'boolean'),
 ('downs', 'int'),
 ('score', 'int'),
 ('controversiality', 'int'),
 ('body', 'string')]

In [48]:
# Sanity-check the values produced by the boolean cast of score_hidden.
df2.select('score_hidden').distinct().show()

+------------+
|score_hidden|
+------------+
|        true|
|       false|
+------------+



## Split Data and Create Pipeline

In [49]:
# Split to training and test.
# A fixed seed on both the 1% sample and the split makes the resulting
# partitions (and every metric derived from them) reproducible after a
# kernel restart.
# NOTE(review): the weights send ~30% of the sample to training and ~70% to
# testing -- confirm this ratio is intended and not swapped.
SEED = 42
(trainingData, testData) = (
    df2
    .sample(fraction=.01, seed=SEED)
    .randomSplit([0.3, 0.7], seed=SEED)
)

In [50]:
# Size of the training split.
trainingData.count()

14388

In [51]:
# Size of the test split.
testData.count()

34223

In [52]:
# Stage 1: split the comment text in `body` into a `words` token array.
tk = Tokenizer(
    inputCol="body",
    outputCol="words",
)

In [53]:
# Stage 2: drop stop words from `words`, writing the result to `filtered`.
sw = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

In [55]:
# Stage 3: embed the filtered tokens as a 3-dimensional vector per comment.
# minCount=0 sets no minimum token frequency for the vocabulary.
w2v = Word2Vec(
    vectorSize=3,
    minCount=0,
    inputCol="filtered",
    outputCol="word2vec",
)

In [56]:
# Stage 4: encode the subreddit_id string as the numeric label `sr_id_num`
# that the classifier trains on.
# The indexer is fitted on the training split only; handleInvalid="skip"
# drops test rows whose subreddit was never seen during fitting instead of
# raising an error in transform (the default "error" behaviour).
si = StringIndexer(
    inputCol="subreddit_id",
    outputCol="sr_id_num",
    handleInvalid="skip",
)

In [57]:
# Stage 5: combine the vote/score fields and the Word2Vec embedding into a
# single `features` vector column.
feats = [
    'ups',
    'gilded',
    'score_hidden',
    'downs',
    'score',
    'controversiality',
    'word2vec',
]
assembler = VectorAssembler(
    inputCols=feats,
    outputCol="features",
)


In [58]:
# Stage 6: random forest that predicts the indexed subreddit label from the
# assembled feature vector (all other hyperparameters left at defaults).
rf = RandomForestClassifier(
    labelCol='sr_id_num',
    featuresCol='features',
)

## Run Pipeline and Evaluate

In [61]:
# Chain the stages in dependency order: tokenise -> remove stop words ->
# embed -> index label -> assemble features -> classify.
pipeline = Pipeline(
    stages=[
        tk,
        sw,
        w2v,
        si,
        assembler,
        rf,
    ]
)
# Fit every stage on the training split, then score the held-out split.
model = pipeline.fit(trainingData)
prediction = model.transform(testData)

In [62]:
# Load previous model
# model = PipelineModel.load('./jordan_model20210802-114416')
# prediction = model.transform(testData)

In [64]:
# Inspect the first five scored rows, including the intermediate stage columns.
prediction.show(n=5)

+------------+---+------+------------+-----+-----+----------------+--------------------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+----------+
|subreddit_id|ups|gilded|score_hidden|downs|score|controversiality|                body|               words|            filtered|            word2vec|sr_id_num|            features|       rawPrediction|         probability|prediction|
+------------+---+------+------------+-----+-----+----------------+--------------------+--------------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+----------+
|    t5_2cneq|  1|     0|       false|    0|    1|               0|"I'm a far-leftis...|["i'm, a, far-lef...|["i'm, far-leftis...|[0.13782385766506...|     23.0|[1.0,0.0,0.0,0.0,...|[3.65621952968655...|[0.18281097648432...|       0.0|
|    t5_2cneq|  1|     0|       false|    0|    1|      

In [65]:
# Save the fitted pipeline model; the timestamp suffix gives each save its
# own path.
timestr = time.strftime("%Y%m%d-%H%M%S")
model.save(f"jordan_model{timestr}")

In [66]:
# Build an RDD of (prediction, label) rows -- prediction first, label second.
predictionAndLabels = prediction.select('prediction', 'sr_id_num').rdd
# Preview a few pairs.
predictionAndLabels.take(5)

[Row(prediction=0.0, sr_id_num=23.0),
 Row(prediction=0.0, sr_id_num=23.0),
 Row(prediction=0.0, sr_id_num=23.0),
 Row(prediction=0.0, sr_id_num=23.0),
 Row(prediction=0.0, sr_id_num=23.0)]

In [67]:
# Wrap the (prediction, sr_id_num) pairs in an evaluator for multiclass metrics.
metrics = MulticlassMetrics(predictionAndLabels)

In [68]:
# top_threads = spark.read.csv("./top_sr_sorted_jordan", header=True)

In [69]:
# top_threads.collect()

In [70]:
# https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html

# Overall accuracy followed by the weighted metrics reported by the evaluator.
print(f"accuracy = {metrics.accuracy}")
print(f"Weighted recall = {metrics.weightedRecall}")
print(f"Weighted precision = {metrics.weightedPrecision}")
print(f"Weighted F(1) Score = {metrics.weightedFMeasure()}")
print(f"Weighted F(0.5) Score = {metrics.weightedFMeasure(beta=0.5)}")
print(f"Weighted false positive rate = {metrics.weightedFalsePositiveRate}")

accuracy = 0.15615229524004326
Weighted recall = 0.15615229524004326
Weighted precision = 0.03864890988047767
Weighted F(1) Score = 0.04332548231139575
Weighted F(0.5) Score = 0.03178716857704093
Weighted false positive rate = 0.15524501110366945


In [71]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.multilabel_confusion_matrix.html

# Collect only the label and prediction columns to the driver as pandas.
pred_pd = prediction.select('sr_id_num', 'prediction').toPandas()

# One 2x2 matrix per class, laid out as [[TN, FP], [FN, TP]] (one-vs-rest).
conf_matrix = multilabel_confusion_matrix(
    y_true=pred_pd['sr_id_num'],
    y_pred=pred_pd['prediction'],
)
conf_matrix

array([[[  176, 28705],
        [   18,  5324]],

       [[32426,     0],
        [ 1797,     0]],

       [[32882,    13],
        [ 1327,     1]],

       [[33183,     2],
        [ 1038,     0]],

       [[33222,     0],
        [ 1001,     0]],

       [[33281,     0],
        [  942,     0]],

       [[33165,    16],
        [ 1042,     0]],

       [[33289,    45],
        [  886,     3]],

       [[33487,     7],
        [  729,     0]],

       [[33491,     4],
        [  728,     0]],

       [[33540,     0],
        [  683,     0]],

       [[33615,    26],
        [  580,     2]],

       [[33711,     1],
        [  511,     0]],

       [[33709,     0],
        [  514,     0]],

       [[33621,     9],
        [  591,     2]],

       [[33742,     0],
        [  481,     0]],

       [[33665,     0],
        [  558,     0]],

       [[33598,     0],
        [  625,     0]],

       [[33701,     0],
        [  522,     0]],

       [[33823,     0],
        [  400,     0]],

