In [1]:
# path to data- first 10,000,000 rows sampled from large reddit dataset
path = '/project/ds5559/r-slash-group8/sample.csv'

In [2]:
import pandas as pd # data management
from pyspark.sql import SparkSession # paralellization
import numpy as np # math
import re # regex
from pyspark.sql.functions import * # pyspark sql functions
from pyspark.sql.types import ArrayType, IntegerType,  StringType, TimestampType # pyspark sql cast data types
from pyspark.ml import Pipeline # pipeline of ML steps
from pyspark.ml.feature import * # ML features
from pyspark.ml.classification import LogisticRegression # ML regression model
from pyspark.mllib.evaluation import MulticlassMetrics # Model evaluation

In [3]:
# start spark session
spark = SparkSession.builder.getOrCreate()

In [4]:
# read in data
df_full = spark.read.csv(path, inferSchema=True, header = True)

In [5]:
# preview data
df_full.show(5)

+--------------------------------+-----------+----+------------+---------+----------+------------+----------------------+-----------------+---------+-------+--------------+------+-----+--------+------------+-----+------------+--------------------+-------------+------+----------------+----------+
|                             _c0|created_utc| ups|subreddit_id|  link_id|      name|score_hidden|author_flair_css_class|author_flair_text|subreddit|     id|removal_reason|gilded|downs|archived|      author|score|retrieved_on|                body|distinguished|edited|controversiality| parent_id|
+--------------------------------+-----------+----+------------+---------+----------+------------+----------------------+-----------------+---------+-------+--------------+------+-----+--------+------------+-----+------------+--------------------+-------------+------+----------------+----------+
|                               1| 1430438400|   4|    t5_378oi|t3_34di91|t1_cqug90g|           0|           

In [6]:
# remove irrelevant columns
df = df_full.drop('_c0','subreddit_id','link_id','name','score_hidden','author_flair_css_class', \
        'author_flair_text','archived','retrieved_on', 'edited','parent_id','ups', \
       'downs', 'removal_reason', 'distinguished')
# keep score, subreddit, id, created_utc, author, body, gilded, controversiality

df=df.withColumn("score",df.score.cast(IntegerType()))
df=df.withColumn("gilded",df.gilded.cast(IntegerType()))
df=df.withColumn("created_utc",from_unixtime(df.created_utc))
df=df.withColumnRenamed("id", "comment_id")

In [7]:
# preview reduced data
df.show(5)
df.printSchema()

+-------------------+---------+----------+------+------------+-----+--------------------+----------------+
|        created_utc|subreddit|comment_id|gilded|      author|score|                body|controversiality|
+-------------------+---------+----------+------+------------+-----+--------------------+----------------+
|2015-05-01 00:00:00|soccer_jp|   cqug90g|     0|       rx109|    4|                くそ|            null|
|               null|     null|      null|  null|        null| null|                null|            null|
|               null|     null|      null|  null|        null| null|                null|            null|
|2015-05-01 00:00:00|      nba|   cqug90h|     0|   WyaOfWade|    4|gg this one's ove...|               0|
|2015-05-01 00:00:00| politics|   cqug90i|     0|Wicked_Truth|    0|Are you really im...|               0|
+-------------------+---------+----------+------+------------+-----+--------------------+----------------+
only showing top 5 rows

root
 |-- crea

In [8]:
# filter out rows with missing values in essential columns
df=df.filter(df['body'].isNotNull())
df=df.filter(df['subreddit'].isNotNull())
df=df.filter(df['comment_id'].isNotNull())

In [9]:
# preview clean data
df.show()

+-------------------+------------------+----------+------+---------------+-----+--------------------+----------------+
|        created_utc|         subreddit|comment_id|gilded|         author|score|                body|controversiality|
+-------------------+------------------+----------+------+---------------+-----+--------------------+----------------+
|2015-05-01 00:00:00|         soccer_jp|   cqug90g|     0|          rx109|    4|                くそ|            null|
|2015-05-01 00:00:00|               nba|   cqug90h|     0|      WyaOfWade|    4|gg this one's ove...|               0|
|2015-05-01 00:00:00|          politics|   cqug90i|     0|   Wicked_Truth|    0|Are you really im...|               0|
|2015-05-01 00:00:00|         AskReddit|   cqug90j|     0|       jesse9o3|    3|No one has a Euro...|               0|
|2015-05-01 00:00:00|         AskReddit|   cqug90k|     0| beltfedshooter|    3|"That the kid ""....|               0|
|2015-05-01 00:00:00|        bloodborne|   cqug90l

In [10]:
# total records remaining in dataset: 
df.count()

10002410

In [11]:
# choose only the two most popular subreddits
df_reduced = df.filter((df['subreddit'] == 'AskReddit') | (df['subreddit'] == 'leagueoflegends'))
# encode subreddit as a binary variable
df_reduced = df_reduced.withColumn('subreddit_bin',when(df['subreddit'] == 'AskReddit',0).otherwise(1))
df_reduced.show()

+-------------------+---------------+----------+------+------------------+-----+--------------------+--------------------+-------------+
|        created_utc|      subreddit|comment_id|gilded|            author|score|                body|    controversiality|subreddit_bin|
+-------------------+---------------+----------+------+------------------+-----+--------------------+--------------------+-------------+
|2015-05-01 00:00:00|      AskReddit|   cqug90j|     0|          jesse9o3|    3|No one has a Euro...|                   0|            0|
|2015-05-01 00:00:00|      AskReddit|   cqug90k|     0|    beltfedshooter|    3|"That the kid ""....|                   0|            0|
|2015-05-01 00:00:00|      AskReddit|   cqug90z|     0|     InterimFatGuy|    5|                NSFL|                   0|            0|
|2015-05-01 00:00:01|leagueoflegends|   cqug919|     0|    SenpaiOniichan|    1|well i think new ...|                null|            1|
|2015-05-01 00:00:01|      AskReddit|   c

In [12]:
# Total records between the two most popular subreddits
df_reduced.count()

950528

These two subreddits account for almost 10% of all the data; not a bad sample size!

In [13]:
# Train/Test split
df_train, df_test = df_reduced.randomSplit([0.75,0.25])

In [14]:
df_train.count()

712005

In [15]:
# Breakdown of test set by subreddit
df_train.groupBy('subreddit_bin').count().show()

+-------------+------+
|subreddit_bin| count|
+-------------+------+
|            1|145672|
|            0|566333|
+-------------+------+



Certainly an imbalance between the two classes.

In [16]:
df_test.groupBy('subreddit_bin').count().show()

+-------------+------+
|subreddit_bin| count|
+-------------+------+
|            1| 48782|
|            0|189741|
+-------------+------+



The train and test sets show a similar ratio; performance may be improved later on by downsampling the training set.

In [17]:
# Build up pipeline for modeling

# Separate documents into tokens
tok = Tokenizer(inputCol="body", outputCol="body_tokens")
# Remove stopwords
rem = StopWordsRemover(inputCol="body_tokens", outputCol="tokens_filtered")
# Reduce feature count using hashing function
htf = HashingTF(numFeatures = 4096, inputCol="tokens_filtered", outputCol="tf") 
# Vectorize features
w2v = Word2Vec(inputCol="body_tokens", outputCol="w2v") 
# Assemble feature column
va = VectorAssembler(inputCols=["tf", "w2v"], outputCol="features")
# Apply logistic regression
lr = LogisticRegression(labelCol='subreddit_bin', featuresCol='features', maxIter=10, regParam=0.01)

pipeline = Pipeline(stages=[tok, rem, htf, w2v, va, lr])

In [18]:
# Fit model to train data
model = pipeline.fit(df_train)

In [19]:
# Make predictions for test data using model
prediction = model.transform(df_test)
# Check out format of dataframe with predictions
prediction.show()

+-------------------+---------------+----------+------+---------------+-----+--------------------+----------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|        created_utc|      subreddit|comment_id|gilded|         author|score|                body|controversiality|subreddit_bin|         body_tokens|     tokens_filtered|                  tf|                 w2v|            features|       rawPrediction|         probability|prediction|
+-------------------+---------------+----------+------+---------------+-----+--------------------+----------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|2015-05-01 00:00:00|      AskReddit|   cqug90z|     0|  InterimFatGuy|    5|                NSFL|               0|            0|       

In [20]:
# The predictions again seem to follow a fairly similar ratio to the actual data!
prediction.groupBy('prediction').count().show()

+----------+------+
|prediction| count|
+----------+------+
|       0.0|210457|
|       1.0| 28066|
+----------+------+



In [21]:
# reduce the predictions to only the necessary columns
predictionsAndLabels = prediction.drop('created_utc', 'subreddit', 'comment_id', 'gilded', 'controversiality', 'author', 'score', 'body', 'body_tokens', 'tokens_filtered', 'rawPrediction', 'probability', 'tf', 'w2v', 'features')

In [22]:
# Format the predictions correctly
predictionsAndLabels = predictionsAndLabels.withColumnRenamed('subreddit_bin', 'label')
predictionsAndLabels = predictionsAndLabels.withColumn('label', predictionsAndLabels.label.cast('float'))
predictionsAndLabels = predictionsAndLabels.withColumn('prediction', predictionsAndLabels.prediction.cast('float'))
predictionsAndLabels.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows



In [23]:
# Convert the predictions to RDD
rdd = predictionsAndLabels.rdd.map(tuple)

In [24]:
# Run metrics on predictions
metrics = MulticlassMetrics(rdd)

In [25]:
metrics.confusionMatrix().toArray()

array([[185195.,  25262.],
       [  4546.,  23520.]])

In [26]:
metrics.accuracy

0.8750309194501159

In [27]:
metrics.weightedPrecision

0.9179263590174914

In [28]:
metrics.weightedRecall

0.875030919450116

In [29]:
metrics.weightedFMeasure()

0.8886404725156227

In [30]:
metrics.weightedFMeasure(beta=0.5)

0.904788745567817

In [31]:
metrics.weightedFalsePositiveRate

0.15704028448399157

Model biased toward predicting /r/Askreddit, could be a result of imbalance in classes in the training set. Will try downsampling.

In [32]:
def downsample(df, target, positive_label, negative_label):

    # Split the data by class
    positive_df = df.filter(df[target] == positive_label)
    negative_df = df.filter(df[target] == negative_label)
    
    # Count the observations by class
    positive_count = positive_df.count()
    negative_count = negative_df.count()
    
    # Reduce the larger dataset by the appropriate ratio
    if positive_count > negative_count:
        positive_df = positive_df.sample(True, negative_count/positive_count)
    else:
        negative_df = negative_df.sample(True, positive_count/negative_count)
        
    # Recombine into a full dataset
    df_b = positive_df
    df_b = df_b.union(negative_df)
    
    return df_b

In [33]:
# Downsample the data
df_train_ds = downsample(df_train, 'subreddit_bin', 1, 0)

In [34]:
# Verify the new counts
df_train_ds.groupBy('subreddit_bin').count().show()

+-------------+------+
|subreddit_bin| count|
+-------------+------+
|            1|145672|
|            0|145393|
+-------------+------+



Nice! Much closer

In [35]:
# The same pipeline can be reused... process follows...
model_ds = pipeline.fit(df_train_ds)

In [36]:
prediction_ds = model_ds.transform(df_test)

In [37]:
prediction_ds.groupBy('prediction').count().show()

+----------+------+
|prediction| count|
+----------+------+
|       0.0|166070|
|       1.0| 72453|
+----------+------+



In [38]:
predictionsAndLabels_ds = prediction_ds.drop('created_utc', 'subreddit', 'comment_id', 'gilded', 'controversiality', 'author', 'score', 'body', 'body_tokens', 'tokens_filtered', 'rawPrediction', 'probability', 'tf', 'w2v', 'features')

In [39]:
predictionsAndLabels_ds = predictionsAndLabels_ds.withColumnRenamed('subreddit_bin', 'label')
predictionsAndLabels_ds = predictionsAndLabels_ds.withColumn('label', predictionsAndLabels_ds.label.cast('float'))
predictionsAndLabels_ds = predictionsAndLabels_ds.withColumn('prediction', predictionsAndLabels_ds.prediction.cast('float'))

In [40]:
rdd_ds = predictionsAndLabels_ds.rdd.map(tuple)

In [41]:
metrics_ds = MulticlassMetrics(rdd_ds)

In [42]:
metrics_ds.confusionMatrix().toArray()

array([[151830.,  14240.],
       [ 37911.,  34542.]])

In [43]:
metrics_ds.accuracy

0.7813586111192631

In [44]:
metrics_ds.weightedPrecision

0.7722179203921324

In [45]:
metrics_ds.weightedRecall

0.7813586111192631

In [46]:
metrics_ds.weightedFMeasure()

0.767286632177368

In [47]:
metrics_ds.weightedFMeasure(beta=0.5)

0.7674473102424092

In [48]:
metrics_ds.weightedFalsePositiveRate

0.3903551401706386

Less biased toward predicting /r/AskReddit but performed much worse overall. Maybe a more distinguishing feature? Anecdotally, gold and awards have always been given often on AskReddit for especially good or helpful answers, more than most other subreddits. Maybe would provide a good distinction.

In [49]:
# Create a new pipeline step that includes the gilded column.
va2 = VectorAssembler(inputCols=["tf", "w2v", "gilded"], outputCol="features")
# And assemble a new pipeline
pipeline2 = Pipeline(stages=[tok, rem, htf, w2v, va2, lr])

In [50]:
# Create a new model using the training data, repeat same process
model_more_feats = pipeline2.fit(df_train_ds)
prediction_more_feats = model_more_feats.transform(df_test)

In [51]:
predictionsAndLabels_more_feats = prediction_more_feats.drop('created_utc', 'subreddit', 'comment_id', 'gilded', 'controversiality','author', 'score', 'body', 'body_tokens', 'tokens_filtered', 'rawPrediction', 'probability', 'tf', 'w2v', 'features')

In [52]:
predictionsAndLabels_more_feats = predictionsAndLabels_more_feats.withColumnRenamed('subreddit_bin', 'label')
predictionsAndLabels_more_feats = predictionsAndLabels_more_feats.withColumn('label', predictionsAndLabels_more_feats.label.cast('float'))
predictionsAndLabels_more_feats = predictionsAndLabels_more_feats.withColumn('prediction', predictionsAndLabels_more_feats.prediction.cast('float'))
predictionsAndLabels_more_feats.show()

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       0.0|
+-----+----------+
only showing top 20 rows



In [53]:
rdd_more_feats = predictionsAndLabels_more_feats.rdd.map(tuple)

In [54]:
metrics_more_feats = MulticlassMetrics(rdd_more_feats)

In [None]:
metrics_more_feats.confusionMatrix().toArray()

In [None]:
metrics_more_feats.accuracy

In [None]:
metrics_more_feats.weightedPrecision

In [None]:
metrics_more_feats.weightedRecall

In [None]:
metrics_more_feats.weightedFMeasure()

In [None]:
metrics_more_feats.weightedFMeasure(beta=0.5)

In [None]:
metrics_more_feats.weightedFalsePositiveRate

In [None]:
# Save notebook as PDF document
!jupyter nbconvert --to pdf `pwd`/*.ipynb