In [1]:
path = '/project/ds5559/r-slash-group8/sample.csv'

In [2]:
import pandas as pd
from pyspark.sql import SparkSession
import numpy as np
import re
import nltk
from pyspark.sql.functions import *
from pyspark.sql.types import ArrayType, IntegerType,  StringType, TimestampType

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
df_full = spark.read.csv(path,  inferSchema=True, header = True)

In [5]:
df_full.show(5)

+--------------------------------+-----------+----+------------+---------+----------+------------+----------------------+-----------------+---------+-------+--------------+------+-----+--------+------------+-----+------------+--------------------+-------------+------+----------------+----------+
|                             _c0|created_utc| ups|subreddit_id|  link_id|      name|score_hidden|author_flair_css_class|author_flair_text|subreddit|     id|removal_reason|gilded|downs|archived|      author|score|retrieved_on|                body|distinguished|edited|controversiality| parent_id|
+--------------------------------+-----------+----+------------+---------+----------+------------+----------------------+-----------------+---------+-------+--------------+------+-----+--------+------------+-----+------------+--------------------+-------------+------+----------------+----------+
|                               1| 1430438400|   4|    t5_378oi|t3_34di91|t1_cqug90g|           0|           

In [6]:
df = df_full.drop('_c0','subreddit_id','link_id','name','score_hidden','author_flair_css_class', \
        'author_flair_text','archived','retrieved_on', 'edited','controversiality','parent_id','ups', \
       'downs', 'removal_reason', 'distinguished')
# keep score, subreddit, id, created_utc, author, body

df=df.withColumn("score",df.score.cast(IntegerType()))
df=df.withColumn("gilded",df.gilded.cast(IntegerType()))
df=df.withColumn("created_utc",from_unixtime(df.created_utc))
df=df.withColumnRenamed("id", "comment_id")

In [7]:
df.show(5)
df.printSchema()

+-------------------+---------+----------+------+------------+-----+--------------------+
|        created_utc|subreddit|comment_id|gilded|      author|score|                body|
+-------------------+---------+----------+------+------------+-----+--------------------+
|2015-05-01 00:00:00|soccer_jp|   cqug90g|     0|       rx109|    4|                くそ|
|               null|     null|      null|  null|        null| null|                null|
|               null|     null|      null|  null|        null| null|                null|
|2015-05-01 00:00:00|      nba|   cqug90h|     0|   WyaOfWade|    4|gg this one's ove...|
|2015-05-01 00:00:00| politics|   cqug90i|     0|Wicked_Truth|    0|Are you really im...|
+-------------------+---------+----------+------+------------+-----+--------------------+
only showing top 5 rows

root
 |-- created_utc: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- comment_id: string (nullable = true)
 |-- gilded: integer (nullable = tru

In [8]:
df=df.filter(df['body'].isNotNull())
df=df.filter(df['subreddit'].isNotNull())
df=df.filter(df['comment_id'].isNotNull())

In [9]:
df.show()

+-------------------+------------------+----------+------+---------------+-----+--------------------+
|        created_utc|         subreddit|comment_id|gilded|         author|score|                body|
+-------------------+------------------+----------+------+---------------+-----+--------------------+
|2015-05-01 00:00:00|         soccer_jp|   cqug90g|     0|          rx109|    4|                くそ|
|2015-05-01 00:00:00|               nba|   cqug90h|     0|      WyaOfWade|    4|gg this one's ove...|
|2015-05-01 00:00:00|          politics|   cqug90i|     0|   Wicked_Truth|    0|Are you really im...|
|2015-05-01 00:00:00|         AskReddit|   cqug90j|     0|       jesse9o3|    3|No one has a Euro...|
|2015-05-01 00:00:00|         AskReddit|   cqug90k|     0| beltfedshooter|    3|"That the kid ""....|
|2015-05-01 00:00:00|        bloodborne|   cqug90l|     0|     Rubenticus|    1|Haha, i was getti...|
|2015-05-01 00:00:00|     relationships|   cqug90m|     0|silverraven1189|    6|Afte

In [10]:
df.count()

10002410

In [11]:
df_reduced = df.filter((df['subreddit'] == 'AskReddit') | (df['subreddit'] == 'nfl'))
df_reduced = df_reduced.withColumn('subreddit_bin',when(df['subreddit'] == 'AskReddit',0).otherwise(1))
df_reduced.show()

+-------------------+---------+----------+------+--------------+-----+--------------------+-------------+
|        created_utc|subreddit|comment_id|gilded|        author|score|                body|subreddit_bin|
+-------------------+---------+----------+------+--------------+-----+--------------------+-------------+
|2015-05-01 00:00:00|AskReddit|   cqug90j|     0|      jesse9o3|    3|No one has a Euro...|            0|
|2015-05-01 00:00:00|AskReddit|   cqug90k|     0|beltfedshooter|    3|"That the kid ""....|            0|
|2015-05-01 00:00:00|AskReddit|   cqug90z|     0| InterimFatGuy|    5|                NSFL|            0|
|2015-05-01 00:00:01|AskReddit|   cqug91c|     0|   JuanTutrego|    1|I'm a guy and I h...|            0|
|2015-05-01 00:00:01|AskReddit|   cqug91e|     0|   dcblackbelt|  101|Mid twenties male...|            0|
|2015-05-01 00:00:02|AskReddit|   cqug920|     0| TheDoorsShirt|    1|Fran Drescher lau...|            0|
|2015-05-01 00:00:02|AskReddit|   cqug921|    

In [12]:
df_reduced.count()

894729

In [13]:
df_train, df_test = df_reduced.randomSplit([0.75,0.25])

In [14]:
df_train.count()

671189

In [15]:
df_train.groupBy('subreddit_bin').count().show()

+-------------+------+
|subreddit_bin| count|
+-------------+------+
|            1|103800|
|            0|567389|
+-------------+------+



In [16]:
df_test.groupBy('subreddit_bin').count().show()

+-------------+------+
|subreddit_bin| count|
+-------------+------+
|            1| 34855|
|            0|188685|
+-------------+------+



In [17]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *
from pyspark.ml.classification import LogisticRegression

In [18]:
tok = Tokenizer(inputCol="body", outputCol="body_tokens")
rem = StopWordsRemover(inputCol="body_tokens", outputCol="tokens_filtered")
htf = HashingTF(numFeatures = 4096, inputCol="tokens_filtered", outputCol="tf") 
w2v = Word2Vec(inputCol="body_tokens", outputCol="w2v") 
va = VectorAssembler(inputCols=["tf", "w2v"], outputCol="features")
lr = LogisticRegression(labelCol='subreddit_bin', featuresCol='features', maxIter=10, regParam=0.01)

pipeline = Pipeline(stages=[tok, rem, htf, w2v, va, lr])

In [19]:
model = pipeline.fit(df_train)

In [21]:
prediction = model.transform(df_test)
prediction.show()

+-------------------+---------+----------+------+--------------+-----+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        created_utc|subreddit|comment_id|gilded|        author|score|                body|subreddit_bin|         body_tokens|     tokens_filtered|                  tf|                 w2v|            features|       rawPrediction|         probability|prediction|
+-------------------+---------+----------+------+--------------+-----+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|2015-05-01 00:00:00|AskReddit|   cqug90j|     0|      jesse9o3|    3|No one has a Euro...|            0|[no, one, has, a,...|[one, european, a...|(4096,[99,1343,17...|[0.11656655858016...|(4196,[99,1343,17..

In [31]:
prediction.groupBy('prediction').count().show()

+----------+------+
|prediction| count|
+----------+------+
|       0.0|205459|
|       1.0| 18061|
+----------+------+



In [39]:
predictionsAndLabels = prediction.drop('created_utc', 'subreddit', 'comment_id', 'gilded', 'author', 'score', 'body', 'body_tokens', 'tokens_filtered', 'rawPrediction', 'probability', 'tf', 'w2v', 'features')

In [45]:
predictionsAndLabels = predictionsAndLabels.withColumnRenamed('subreddit_bin', 'label')
predictionsAndLabels = predictionsAndLabels.withColumn('label', predictionsAndLabels.label.cast('float'))
predictionsAndLabels = predictionsAndLabels.withColumn('prediction', predictionsAndLabels.prediction.cast('float'))
predictionsAndLabels.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows



In [46]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [47]:
rdd = predictionsAndLabels.rdd.map(tuple)

In [48]:
metrics = MulticlassMetrics(rdd)

In [49]:
metrics.confusionMatrix().toArray()

array([[185928.,  19345.],
       [  2757.,  15510.]])

Model biased toward predicting /r/Askreddit, could be tuned.