In [1]:
import pandas as pd

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import *
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Word2Vec

from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, udf
from pyspark.sql.types import ArrayType, IntegerType,  StringType
from pyspark.sql.types import BooleanType

# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Load the comment dump as CSV, taking column names from the first row.
# No schema is supplied, so every column comes back as a string.
df = spark.read.option("header", True).csv("./jordan_subset")

In [3]:
# Peek at the first five rows to sanity-check the load.
df.show(n=5)

+------------+-------+-----------+---+---------+----------+------------+----------------------+--------------------+-------------+-------+--------------+------+-----+--------+------------------+-----+------------+--------------------+-------------+------+----------------+----------+
|subreddit_id|    _c0|created_utc|ups|  link_id|      name|score_hidden|author_flair_css_class|   author_flair_text|    subreddit|     id|removal_reason|gilded|downs|archived|            author|score|retrieved_on|                body|distinguished|edited|controversiality| parent_id|
+------------+-------+-----------+---+---------+----------+------------+----------------------+--------------------+-------------+-------+--------------+------+-----+--------+------------------+-----+------------+--------------------+-------------+------+----------------+----------+
|    t5_2qh33|5027245| 1430702078|  1|t3_34rftl|t1_cqxhekl|           0|                    NA|                  NA|        funny|cqxhekl|          

In [4]:
# Total number of rows in the loaded DataFrame.
df.count()

4871463

In [5]:
# List every column name so the ones used for modelling can be picked out.
df.columns

['subreddit_id',
 '_c0',
 'created_utc',
 'ups',
 'link_id',
 'name',
 'score_hidden',
 'author_flair_css_class',
 'author_flair_text',
 'subreddit',
 'id',
 'removal_reason',
 'gilded',
 'downs',
 'archived',
 'author',
 'score',
 'retrieved_on',
 'body',
 'distinguished',
 'edited',
 'controversiality',
 'parent_id']

In [6]:
# Keep only the label source (subreddit_id), the numeric/flag metadata and the
# raw comment text; everything else is dropped before modelling.
df2 = df.select(
    'subreddit_id',
    'ups',
    'gilded',
    'score_hidden',
    'downs',
    'score',
    'controversiality',
    'body',
)
df2.dtypes

[('subreddit_id', 'string'),
 ('ups', 'string'),
 ('gilded', 'string'),
 ('score_hidden', 'string'),
 ('downs', 'string'),
 ('score', 'string'),
 ('controversiality', 'string'),
 ('body', 'string')]

In [7]:
# The CSV was read without a schema, so every selected column is a string.
# Cast the numeric metadata to int and the hidden-score flag to boolean.
# Casting reference:
# https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
INT_COLUMNS = ['ups', 'downs', 'score', 'controversiality', 'gilded']
for int_col in INT_COLUMNS:
    df2 = df2.withColumn(int_col, df2[int_col].cast(IntegerType()))
df2 = df2.withColumn('score_hidden', df2['score_hidden'].cast(BooleanType()))

# fillna(0) only touches numeric columns; boolean and string columns are
# ignored. Fill those explicitly so a null flag or a null comment body cannot
# reach the downstream feature stages (VectorAssembler rejects nulls by
# default, and the Tokenizer is not null-safe).
df2 = df2.fillna(0).fillna({'score_hidden': False, 'body': ''})
df2.dtypes

[('subreddit_id', 'string'),
 ('ups', 'int'),
 ('gilded', 'int'),
 ('score_hidden', 'boolean'),
 ('downs', 'int'),
 ('score', 'int'),
 ('controversiality', 'int'),
 ('body', 'string')]

In [8]:
# Confirm the boolean cast produced only the expected values.
df2.select('score_hidden').distinct().show()

+------------+
|score_hidden|
+------------+
|        true|
|       false|
+------------+



In [20]:
# Split to training and test.
# Both the subsample and the split are random; a fixed seed keeps the rows in
# each set stable across kernel restarts (for the same input data), so counts
# and accuracy can be reproduced.
SEED = 42
SAMPLE_FRACTION = 0.01  # work on roughly 1% of the rows to keep fitting fast
# NOTE(review): 30% train / 70% test is kept from the original code; confirm
# the weights were not meant to be the other way round.
(trainingData, testData) = (
    df2.sample(fraction=SAMPLE_FRACTION, seed=SEED)
       .randomSplit([0.3, 0.7], seed=SEED)
)

In [21]:
# Number of rows that ended up in the training split.
trainingData.count()

14473

In [22]:
# Number of rows that ended up in the test split.
testData.count()

34010

In [23]:
# Stage 1: turn the raw comment text ("body") into a list of tokens ("words").
tk = Tokenizer(
    inputCol="body",
    outputCol="words",
)

In [24]:
# Stage 2: remove stop words from "words" (default stop-word list, none is
# passed here) and write the remaining tokens to "filtered".
sw = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

In [25]:
# Term-count stage over the filtered tokens. It is defined here but is not
# included in `feats` or in the pipeline stages further down.
# NOTE(review): vocabSize=3 limits the vocabulary to three terms; confirm that
# is intended before wiring this stage back in.
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="counted",
    vocabSize=3,
    minDF=2.0,
)

In [26]:
# Stage 3: embed the filtered tokens with Word2Vec. vectorSize=3 gives a
# three-dimensional vector per comment; minCount=0 keeps every token in the
# vocabulary regardless of how rarely it occurs.
w2v = Word2Vec(
    vectorSize=3,
    minCount=0,
    inputCol="filtered",
    outputCol="word2vec",
)

In [27]:
# Stage 4: map each subreddit_id string to a numeric index ("sr_id_num").
# This stage is required: the random forest uses sr_id_num as its label column
# and needs a numeric label.
#
# handleInvalid="skip" drops rows whose subreddit_id is null or was not seen
# while fitting. The default ("error") aborts transform() instead, which can
# happen whenever a rare subreddit lands only in the test split.
si = StringIndexer(
    inputCol="subreddit_id",
    outputCol="sr_id_num",
    handleInvalid="skip",
)

In [28]:
# Stage 5: pack the model inputs into a single "features" vector.
# Inputs are the numeric/boolean comment metadata plus the Word2Vec embedding;
# the CountVectorizer output ('counted') is left out.
feats = [
    'ups',
    'gilded',
    'score_hidden',
    'downs',
    'score',
    'controversiality',
    'word2vec',
]
assembler = VectorAssembler(inputCols=feats, outputCol="features")


In [29]:
# Stage 6: random forest that predicts the indexed subreddit label
# ("sr_id_num") from the assembled "features" vector.
# RandomForestClassifier is imported in the notebook's top import cell.
rf = RandomForestClassifier(labelCol='sr_id_num', featuresCol='features')

In [30]:
# Chain the stages in order: tokenize -> drop stop words -> Word2Vec ->
# index the label -> assemble features -> random forest.
# The CountVectorizer stage (cv) is not part of the chain.
pipeline = Pipeline(
    stages=[tk, sw, w2v, si, assembler, rf],
)

# Fit every stage on the training split, then score the held-out split.
model = pipeline.fit(trainingData)
prediction = model.transform(testData)

In [31]:
# Score the held-out predictions against the indexed subreddit label.
# MulticlassClassificationEvaluator is imported in the notebook's top import cell.
evaluator = MulticlassClassificationEvaluator(
    labelCol="sr_id_num",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(prediction)

# End the cell on the value so the notebook actually displays the result.
accuracy

In [32]:
# from pyspark.mllib.linalg import Vectors
# from pyspark.mllib.regression import LabeledPoint

# transformed_df = df.rdd.map(lambda row: LabeledPoint(row[0], Vectors.dense(row[0:-1])))

In [33]:
# https://stackoverflow.com/questions/32556178/create-labeledpoints-from-spark-dataframe-in-python
# (vec.select(col("outcome_column").alias("label"), col("features"))
#   .rdd
#   .map(lambda row: LabeledPoint(row.label, row.features)))

# w2v_data.dtypes

In [34]:
# ohe = OneHotEncoder(inputCol="sr_id_num", outputCol="subr_ohe")  
# ohe_fit = ohe.fit(si_data)
# ohe_data = ohe_fit.transform(si_data)

In [None]:
# Train a RandomForest model.  #david
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
# model = RandomForest.trainClassifier(vec_rdd, numClasses=1000, categoricalFeaturesInfo={1:2,2:2},
#                                      numTrees=10, featureSubsetStrategy="auto",
#                                      impurity='gini', maxDepth=3, maxBins=32)

In [None]:
# vec_rdd = vec.rdd.map(tuple).take(5)
# features = vec.select(feats).rdd.map(tuple)
# rdd = vec.select(['subreddit_id','ups','gilded','score_hidden','downs','score','controversiality','counted','word2vec']).rdd.map(lambda x: LabeledPoint(x[0],x[1:]))
# LabeledPoint(label, features)


# def parsePoint(line):
#     values = [float(x) for x in line.split(' ')]
#     return LabeledPoint(values[0], values[1:])

# parsedData = data.map(parsePoint)

# model_data = si_data.select(['ups','gilded','score_hidden','downs','score','controversiality','counted','word2vec','sr_id_num'])
# model_data = si_data.select(['ups','gilded','score_hidden','downs','score','controversiality','sr_id_num'])
# rdd = model_data.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

# model_data.dtypes
# model_data.show(5)
# rdd.take(3)