# DS 5110 Group Project
Team: Alexandra Cathcart (adc6fs), Benjamin Feciura (bmf3bw), Jeremey Donovan (jdd5dw), Jordan Hiatt (jdh2e)

Original data: https://www.kaggle.com/reddit/reddit-comments-may-2015


## Includes & Spark Setup

In [41]:
import pandas as pd

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import *

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, lower, size, split, udf, when
from pyspark.sql.types import ArrayType, IntegerType,  StringType, StructType

spark = SparkSession.builder.getOrCreate()


## Data Import and Pre-Processing

### Data Import

In [42]:
# Import the reddit data
full_path = '/project/ds5559/r-slash-group8/sample.csv'

df = spark.read.csv(full_path,  inferSchema=True, header = True)

In [43]:
# Import the Bad Word data
schema = StructType().add("badWord",StringType(),True)
dfBW=spark.read.format("csv").schema(schema).load('bad_words.csv')
#  dfBW.show(5)  # not showing since words are quite vulgar

# Also create in list format
listBW=list(dfBW.select('badWord').toPandas()['badWord']) 
# listBW


In [44]:
# Create a regex with all the bad words
# if there is an issue, try \\\\b instead
listBW=list(map(lambda line: "\\b" + line + "\\b",listBW))
delim='|'
strBW=delim.join(listBW)



### Filtering

In [45]:
# Drop unneeded cols from dataframe
df=df.drop('_c0','created_utc','subreddit_id','link_id','name','score_hidden','author_flair_css_class', 'gilded', \
        'author_flair_text','id','archived','retrieved_on', 'edited','controversiality','parent_id','score')

# convert integer cols (ups, downs, and gilded) to integers
# Note: we could have done this by defining a schema before the csv read
df=df.withColumn("ups",df.ups.cast(IntegerType()))
df=df.withColumn("downs",df.downs.cast(IntegerType()))
#df=df.withColumn("gilded",df.gilded.cast(IntegerType()))  # Removed gilded since not used in this analysis

# Confirm new schema
df.printSchema()
df.show(5)

root
 |-- ups: integer (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- removal_reason: string (nullable = true)
 |-- downs: integer (nullable = true)
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- distinguished: string (nullable = true)

+----+---------+--------------+-----+------------+--------------------+-------------+
| ups|subreddit|removal_reason|downs|      author|                body|distinguished|
+----+---------+--------------+-----+------------+--------------------+-------------+
|   4|soccer_jp|            NA|    0|       rx109|                くそ|         null|
|null|     null|          null| null|        null|                null|         null|
|   0|     null|          null| null|        null|                null|         null|
|   4|      nba|            NA|    0|   WyaOfWade|gg this one's ove...|           NA|
|   0| politics|            NA|    0|Wicked_Truth|Are you really im...|           NA|
+----+---------+------------

In [46]:
# Count the number of rows before removing NA
df.count()
# There are 15,317,725 rows

15317725

In [47]:
# Remove rows where up, down, or body is null. We  do this since inference of these values is not applicable
df=df.filter(df['ups'].isNotNull())
df=df.filter(df['downs'].isNotNull())
df=df.filter(df['body'].isNotNull())

df.show(5)

+---+---------+--------------+-----+--------------+--------------------+-------------+
|ups|subreddit|removal_reason|downs|        author|                body|distinguished|
+---+---------+--------------+-----+--------------+--------------------+-------------+
|  4|soccer_jp|            NA|    0|         rx109|                くそ|         null|
|  4|      nba|            NA|    0|     WyaOfWade|gg this one's ove...|           NA|
|  0| politics|            NA|    0|  Wicked_Truth|Are you really im...|           NA|
|  3|AskReddit|            NA|    0|      jesse9o3|No one has a Euro...|           NA|
|  3|AskReddit|            NA|    0|beltfedshooter|"That the kid ""....|           NA|
+---+---------+--------------+-----+--------------+--------------------+-------------+
only showing top 5 rows



In [48]:
# Remove rows where the author was '[deleted]' 
df=df.filter(df['author']!='[deleted]')

# Remove author "0"
df=df.filter(df['author']!='0')


# Remove rows where the author was 'AutoModerator'
# see https://www.reddit.com/wiki/automoderator
df=df.filter(df['author']!='AutoModerator')

In [49]:
# Count the number of rows AFTER removing NA
df.count()
# There now 9,229,025 rows

9226090

### Binning & Feature Engineering

In [50]:
# Lowercase all body text
df=df.withColumn('body',lower(col('body')))

In [51]:
# Even though we dropped the column, adding score back into dataframe by computing it
df=df.withColumn('score',df['ups']-df['downs'])
df=df.withColumn("score",df.score.cast(IntegerType()))
df.show(5)

+---+---------+--------------+-----+--------------+--------------------+-------------+-----+
|ups|subreddit|removal_reason|downs|        author|                body|distinguished|score|
+---+---------+--------------+-----+--------------+--------------------+-------------+-----+
|  4|soccer_jp|            NA|    0|         rx109|                くそ|         null|    4|
|  4|      nba|            NA|    0|     WyaOfWade|gg this one's ove...|           NA|    4|
|  0| politics|            NA|    0|  Wicked_Truth|are you really im...|           NA|    0|
|  3|AskReddit|            NA|    0|      jesse9o3|no one has a euro...|           NA|    3|
|  3|AskReddit|            NA|    0|beltfedshooter|"that the kid ""....|           NA|    3|
+---+---------+--------------+-----+--------------+--------------------+-------------+-----+
only showing top 5 rows



In [52]:
# Determine a scoreSentiment as either postive, neutral, or negative.
# This will be our response variable

# Drop scoreSentiment if it already exists
df=df.drop('scoreSentiment')

# Set up bucketizer
splits = [-float("inf"), -0.1,0.1, float("inf")]
bkt = Bucketizer(splits=splits, inputCol="score", outputCol="scoreSentiment")

# Transform to add scoreSentiment: 0=negative; 1=neutral; 2=positive.
df=bkt.transform(df)

# !!! Cannot shift to -1,0,1 since LR must start with 0 !!!
# To make things more clear, shift to -1=negative; 0=neutral; 1=positive
#df=df.withColumn("scoreSentiment", \
#                 when(df['scoreSentiment']==0,-1) \
#                 .when(df['scoreSentiment']==1,0) \
#                 .otherwise(1)
#                ) 

df.show(2)



+---+---------+--------------+-----+---------+--------------------+-------------+-----+--------------+
|ups|subreddit|removal_reason|downs|   author|                body|distinguished|score|scoreSentiment|
+---+---------+--------------+-----+---------+--------------------+-------------+-----+--------------+
|  4|soccer_jp|            NA|    0|    rx109|                くそ|         null|    4|           2.0|
|  4|      nba|            NA|    0|WyaOfWade|gg this one's ove...|           NA|    4|           2.0|
+---+---------+--------------+-----+---------+--------------------+-------------+-----+--------------+
only showing top 2 rows



In [53]:
# Flag comments containing bad words
df=df.withColumn('bwFlag',col('body').rlike(strBW))

In [54]:
# Append bodyWordCount
df=df.withColumn("bodyWordCount", size(split(df['body'], ' ')))
#df.show(5)

## Data Splitting & Sampling

In [55]:
seed=314
trainDF,testDF,holdoutDF=df.randomSplit([0.4,0.4,0.2],seed)

## EDA

In [56]:
# How many comments have bad words?
# Confirm the flagging worked by looking at how many comments contain bad words vs good
# NOTE: This has a rather long runtime!!!
df.groupby('bwFlag').agg({"bwFlag":"count"}).show()
#df.filter(df['bwFlag']==True).show(5,False)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-56-198fda75c5b9>", line 4, in <module>
    df.groupby('bwFlag').agg({"bwFlag":"count"}).show()
  File "/usr/local/spark/python/pyspark/sql/dataframe.py", line 440, in show
    print(self._jdf.showString(n, 20, vertical))
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1303, in __call__
    answer = self.gateway_client.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1200, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.7/socket.py", line 589, in readinto
    return self._sock.r

TypeError: object of type 'NoneType' has no len()

In [None]:
# How many authors are there?
df.select(countDistinct('author')).show()
# There are 1,234,824 authors

In [None]:
# Show the top 10 authors with sum of ups and downs
df.groupby('author').agg({"author":"count","ups":"sum","downs":"sum","score":"sum"}).sort(col('count(author)').desc()).show(10)

Odd that the preceding authors have no down but this is correct

In [None]:
# Show authors with the lowest scores
df.groupby('author').agg({"score":"sum","ups":"sum","downs":"sum"}).sort(col('sum(score)').asc()).show(10)

In [None]:
# Get a summary of score sentiment by label
#tmpDF.groupby('scoreSentiment').agg({"scoreSentiment":"count"}).show()
df.groupby('scoreSentiment').agg({"scoreSentiment":"count"}).show()

In [None]:
# Show graphical Distribution of sentiment (TBD)

## Model: Predict Sentiment from body

In [40]:
# Though not the cleanest thing to do from a data sci perspective, we
# are going to drop the neutral sentiment rows
df=df.filter(df['scoreSentiment']!=1)
# Shift positive from 2 to 1
df=df.withColumn("scoreSentiment", \
                 when(df['scoreSentiment']==2,1) \
                 .when(df['scoreSentiment']==0,0) \
                 .otherwise(-1)
                ) 
# we should never have the otherwise case!!!

AttributeError: 'int' object has no attribute 'filter'

In [22]:
# Create TF (Term Frequency) feature
tok = Tokenizer(inputCol="body", outputCol="words")
htf = HashingTF(inputCol="words", outputCol="tf", numFeatures=200)  

#testing
tmpDF=tok.transform(df)
tmpDF=htf.transform(tmpDF)
tmpDF.select('words','tf').show(2)

+--------------------+--------------------+
|               words|                  tf|
+--------------------+--------------------+
|              [くそ]|   (200,[147],[1.0])|
|[gg, this, one's,...|(200,[2,17,24,35,...|
+--------------------+--------------------+
only showing top 2 rows



In [23]:
# Create w2v (word to vec) feature

# the comment string needs to be turned into a vector for w2v to work
# unfortunately, VectorAssember does not work on string so we need a UDF

# Create UDF (note: split(anything,0) simply means don't split)
str_to_vec=spark.udf.register("str_to_vec",
                             lambda row:row.split("#",0),
                             ArrayType(StringType()))

# set up the tranformation
rva=SQLTransformer(statement="SELECT *, str_to_vec(body) bodyVec FROM __THIS__")

w2v = Word2Vec(inputCol='bodyVec', outputCol='w2v')  # not setting minCount 

# testing
tmpDF=rva.transform(df)
model=w2v.fit(tmpDF)
tmpDF=model.transform(tmpDF)
tmpDF.show(2)

+---+---------+--------------+-----+---------+--------------------+-------------+-----+--------------+------+-------------+--------------------+--------------------+
|ups|subreddit|removal_reason|downs|   author|                body|distinguished|score|scoreSentiment|bwFlag|bodyWordCount|             bodyVec|                 w2v|
+---+---------+--------------+-----+---------+--------------------+-------------+-----+--------------+------+-------------+--------------------+--------------------+
|  4|soccer_jp|            NA|    0|    rx109|                くそ|         null|    4|             1| false|            1|              [くそ]|[0.0,0.0,0.0,0.0,...|
|  4|      nba|            NA|    0|WyaOfWade|gg this one's ove...|           NA|    4|             1| false|           12|[gg this one's ov...|[0.0,0.0,0.0,0.0,...|
+---+---------+--------------+-----+---------+--------------------+-------------+-----+--------------+------+-------------+--------------------+--------------------+
only sho

In [29]:
# Assemble predictors
va=VectorAssembler(inputCols=['tf','w2v','bwFlag','bodyWordCount'],outputCol='features')

In [30]:
# Set up the regression model
lr = LogisticRegression(labelCol='scoreSentiment',maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [31]:
# Build the pipeline
#pipeline=Pipeline(stages=[bkt,tok,htf,rva,w2v,va,lr])  # took out bkt since this is pre-EDA
pipeline=Pipeline(stages=[tok,htf,rva,w2v,va,lr])

In [32]:
# Fit the multinomial logistic regression model
mlrModel=pipeline.fit(trainDF)

Py4JJavaError: An error occurred while calling o474.fit.
: org.apache.spark.SparkException: Classification labels should be in [0 to 1]. Found 157931 invalid labels.
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:556)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:487)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:482)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:281)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:150)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [24]:
# Training Summary
# source: https://spark.apache.org/docs/latest/ml-classification-regression.html

# Fix source: https://stackoverflow.com/questions/37278999/logistic-regression-with-spark-ml-data-frames
lrm=mlrModel.stages[-1]

# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrm.coefficientMatrix))
print("Intercept: " + str(lrm.interceptVector))

trainingSummary = lrm.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))


Coefficients: 
3 X 300 CSRMatrix

Intercept: [-1.0264586714522934,-1.0107704204551655,2.037229091907459]
objectiveHistory:
0.35298803317465105
False positive rate by label:
label 0: 0.0
label 1: 0.0
label 2: 1.0
True positive rate by label:
label 0: 0.0
label 1: 0.0
label 2: 1.0
Precision by label:
label 0: 0.0
label 1: 0.0
label 2: 0.9139359489937812
Recall by label:
label 0: 0.0
label 1: 0.0
label 2: 1.0
F-measure by label:
label 0: 0.0
label 1: 0.0
label 2: 0.9550329513109017
Accuracy: 0.9139359489937812
FPR: 0.9139359489937812
TPR: 0.9139359489937812
F-measure: 0.8728389466766606
Precision: 0.8352789188631633
Recall: 0.9139359489937812


In [25]:
# Make preductions on the test data
mlrPrediction=mlrModel.transform(testDF)

In [26]:
mlrPrediction.select('scoreSentiment','prediction').show(3)

+--------------+----------+
|scoreSentiment|prediction|
+--------------+----------+
|           0.0|       2.0|
|           0.0|       2.0|
|           0.0|       2.0|
+--------------+----------+
only showing top 3 rows



### TBD: Evaluate the predictions. Judging from the training though, it seems to over-predict category 2 "positive" --- which is the most prevalent

In [27]:
# Stuff with ngrams not currently used

#May need to drop col when rerunning
#df=df.drop('body2grams')
#df=df.drop('body3grams')

# Create 2grams
#ngram = NGram(n=2, inputCol="words", outputCol="body2grams")
#df = ngram.transform(df)

# Create 3grams
#ngram = NGram(n=3, inputCol="words", outputCol="body3grams")
#df = ngram.transform(df)

In [28]:
# NOT USED since scoreSentiment is multinomial response not predictor
# OneHotEncoding of Score_sentiment
# since it is already numeric, no need for StringIndexer
#encoder = OneHotEncoder(inputCol="score_sentiment", outputCol="scoreSentimentVec")
#model = encoder.fit(df)
#df = model.transform(df)

## Save notebook as PDF document

In [76]:
# Save notebook as PDF document
!jupyter nbconvert --to pdf `pwd`/*.ipynb

[NbConvertApp] Converting notebook /sfs/qumulo/qhome/jdd5dw/ds5110-project/Jeremey_code.ipynb to pdf
[NbConvertApp] Writing 57104 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', 'notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 68569 bytes to /sfs/qumulo/qhome/jdd5dw/ds5110-project/Jeremey_code.pdf
[NbConvertApp] Converting notebook /sfs/qumulo/qhome/jdd5dw/ds5110-project/test_file.ipynb to pdf
[NbConvertApp] Writing 26544 bytes to notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', 'notebook.tex', '-quiet']
[NbConvertApp] CRITICAL | xelatex failed: ['xelatex', 'notebook.tex', '-quiet']
This is XeTeX, Version 3.14159265-2.6-0.99999 (TeX Live 2019/dev/Debian) (preloaded format=xelatex)
 restricted \write18 enabled.
entering extended mode
(./notebook.tex
LaTeX2e <2018-12-01>
(/usr/share/texlive/t