In [1]:
import pandas as pd

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import *

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, udf
from pyspark.sql.types import ArrayType, IntegerType,  StringType

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import StringIndexer

from pyspark.mllib.tree import RandomForest
from pyspark.mllib.util import MLUtils

spark = SparkSession.builder.getOrCreate()

In [2]:
df = spark.read.csv("./jordan_subset", header=True)

In [3]:
df.show(5)

+------------+-------+-----------+---+---------+----------+------------+----------------------+-----------------+---------------+-------+--------------+------+-----+--------+---------------+-----+------------+--------------------+-------------+------+----------------+----------+
|subreddit_id|    _c0|created_utc|ups|  link_id|      name|score_hidden|author_flair_css_class|author_flair_text|      subreddit|     id|removal_reason|gilded|downs|archived|         author|score|retrieved_on|                body|distinguished|edited|controversiality| parent_id|
+------------+-------+-----------+---+---------+----------+------------+----------------------+-----------------+---------------+-------+--------------+------+-----+--------+---------------+-----+------------+--------------------+-------------+------+----------------+----------+
|    t5_2te6p|3345542| 1430609172|  4|t3_34mhlr|t1_cqwgwou|           0|                    NA|               NA|WaltDisneyWorld|cqwgwou|            NA|     0| 

In [4]:
df.count()

8799960

In [5]:
df.columns

['subreddit_id',
 '_c0',
 'created_utc',
 'ups',
 'link_id',
 'name',
 'score_hidden',
 'author_flair_css_class',
 'author_flair_text',
 'subreddit',
 'id',
 'removal_reason',
 'gilded',
 'downs',
 'archived',
 'author',
 'score',
 'retrieved_on',
 'body',
 'distinguished',
 'edited',
 'controversiality',
 'parent_id']

In [6]:
df2 = df.select(['subreddit_id','ups','gilded','score_hidden','downs','score','controversiality','body'])
df2.dtypes

[('subreddit_id', 'string'),
 ('ups', 'string'),
 ('gilded', 'string'),
 ('score_hidden', 'string'),
 ('downs', 'string'),
 ('score', 'string'),
 ('controversiality', 'string'),
 ('body', 'string')]

In [7]:
from pyspark.sql.types import IntegerType,BooleanType
# https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
# data_df = data_df.withColumn("Plays", data_df["Plays"].cast(IntegerType()))
df2 = df2.withColumn('ups',df2['ups'].cast(IntegerType()))
df2 = df2.withColumn('downs',df2['downs'].cast(IntegerType()))
df2 = df2.withColumn('score',df2['score'].cast(IntegerType()))
df2 = df2.withColumn('controversiality',df2['controversiality'].cast(IntegerType()))
df2 = df2.withColumn('gilded',df2['gilded'].cast(IntegerType()))
df2 = df2.withColumn('score_hidden',df2['score_hidden'].cast(BooleanType()))
df2 = df2.fillna(0)
df2.dtypes

[('subreddit_id', 'string'),
 ('ups', 'int'),
 ('gilded', 'int'),
 ('score_hidden', 'boolean'),
 ('downs', 'int'),
 ('score', 'int'),
 ('controversiality', 'int'),
 ('body', 'string')]

In [8]:
df2.select(['score_hidden']).distinct().show()

+------------+
|score_hidden|
+------------+
|        true|
|       false|
+------------+



In [9]:
# subset 

In [10]:
# Split to training and test
(trainingData, testData) = df2.sample(.0001).randomSplit([0.7, 0.3])

In [11]:
tk = Tokenizer(inputCol="body", outputCol="words")
# tk_data = tk.transform(trainingData)
# tk_data.select('body','words').show(5)

In [12]:
sw = StopWordsRemover(inputCol="words", outputCol="filtered")
# sw_data = sw.transform(tk_data)
# sw_data.select('body','words','filtered').show(5)

In [13]:
# temp = sw_data.sample(.0001)
# temp.count()

In [14]:
# temp.dtypes

In [15]:
cv = CountVectorizer(inputCol="filtered", outputCol="counted", vocabSize=3, minDF=2.0)
# cv_fit = cv.fit(temp)
# cv_data = cv_fit.transform(temp)
# cv_data.select('body','words','filtered','counted').show(5)

In [16]:
w2v =  Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
# w2v_fit = w2v.fit(cv_data)
# w2v_data = w2v_fit.transform(cv_data)
# w2v_data.select('body','words','filtered',"word2vec").show(5)

In [17]:
si = StringIndexer(inputCol="subreddit_id", outputCol="sr_id_num") # maybe I don't need this.
# si_model = si.fit(w2v_data)
# si_data = si_model.transform(w2v_data)

# si_data.show(5)
# # for each level, count freq. val=0 for most freq, then 1, ...
# stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex") #- converts to numeric
# model = stringIndexer.fit(df)
# indexed = model.transform(df)
# indexed.show()

In [18]:
# from pyspark.mllib.linalg import Vectors
# from pyspark.mllib.regression import LabeledPoint

# transformed_df = df.rdd.map(lambda row: LabeledPoint(row[0], Vectors.dense(row[0:-1])))

In [19]:
# https://stackoverflow.com/questions/32556178/create-labeledpoints-from-spark-dataframe-in-python
# (vec.select(col("outcome_column").alias("label"), col("features"))
#   .rdd
#   .map(lambda row: LabeledPoint(row.label, row.features)))

# w2v_data.dtypes

In [20]:
# vec_rdd = vec.rdd.map(tuple).take(5)
# features = vec.select(feats).rdd.map(tuple)
# rdd = vec.select(['subreddit_id','ups','gilded','score_hidden','downs','score','controversiality','counted','word2vec']).rdd.map(lambda x: LabeledPoint(x[0],x[1:]))
# LabeledPoint(label, features)


# def parsePoint(line):
#     values = [float(x) for x in line.split(' ')]
#     return LabeledPoint(values[0], values[1:])

# parsedData = data.map(parsePoint)

# model_data = si_data.select(['ups','gilded','score_hidden','downs','score','controversiality','counted','word2vec','sr_id_num'])
# model_data = si_data.select(['ups','gilded','score_hidden','downs','score','controversiality','sr_id_num'])
# rdd = model_data.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

# model_data.dtypes
# model_data.show(5)
# rdd.take(3)

In [21]:
# Train a RandomForest model.  #david
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
# model = RandomForest.trainClassifier(vec_rdd, numClasses=1000, categoricalFeaturesInfo={1:2,2:2},
#                                      numTrees=10, featureSubsetStrategy="auto",
#                                      impurity='gini', maxDepth=3, maxBins=32)

In [22]:
feats =  ['ups','gilded','score_hidden','downs','score','controversiality','counted','word2vec']
# feats =  ['ups','gilded','score_hidden','downs','score','controversiality']
assembler = VectorAssembler(inputCols=feats, outputCol="features")
# assembler_data = assembler.transform(si_data)

# assembler_data.show(5)


In [23]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol='sr_id_num', featuresCol='features')
# rf_model = rf.fit(assembler_data)

In [24]:
pipeline = Pipeline(stages=[tk,sw,cv,w2v,si,assembler,rf])
# pipeline = Pipeline(stages=[tk,sw,si,ohe,vecAs])
model = pipeline.fit(trainingData)
prediction = model.transform(testData)

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="sr_id_num", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
# evaluator = MulticlassClassificationEvaluator(
#     labelCol=“n_index”, predictionCol=“prediction”, metricName=“accuracy”)
# accuracy = evaluator.evaluate(predictions)

Py4JJavaError: An error occurred while calling o564.evaluate.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 39.0 failed 1 times, most recent failure: Lost task 4.0 in stage 39.0 (TID 727, udc-ba33-13c7, executor driver): org.apache.spark.SparkException: Failed to execute user defined function(StringIndexerModel$$Lambda$3497/0x000000084142c840: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Unseen label: t5_2qh3r. To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:405)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:390)
	... 19 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2164)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.confusions$lzycompute(MulticlassMetrics.scala:61)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.confusions(MulticlassMetrics.scala:52)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.tpByClass$lzycompute(MulticlassMetrics.scala:78)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.tpByClass(MulticlassMetrics.scala:76)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.accuracy$lzycompute(MulticlassMetrics.scala:188)
	at org.apache.spark.mllib.evaluation.MulticlassMetrics.accuracy(MulticlassMetrics.scala:188)
	at org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator.evaluate(MulticlassClassificationEvaluator.scala:179)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(StringIndexerModel$$Lambda$3497/0x000000084142c840: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: org.apache.spark.SparkException: Unseen label: t5_2qh3r. To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:405)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:390)
	... 19 more


In [None]:
# ohe = OneHotEncoder(inputCol="sr_id_num", outputCol="subr_ohe")  
# ohe_fit = ohe.fit(si_data)
# ohe_data = ohe_fit.transform(si_data)