## Set Up

In [36]:
import pandas as pd

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import *

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, udf
from pyspark.sql.types import ArrayType, IntegerType,  StringType

spark = SparkSession.builder.getOrCreate()

## Pull in Data

In [37]:
full_path = '/project/ds5559/r-slash-group8/sample.csv'

df = spark.read.csv(full_path,  inferSchema=True, header = True)

In [38]:
# convert integer cols (ups, downs, and gilded) to integers
# Note: we could have done this by defining a schema before the csv read
df=df.withColumn("ups",df.ups.cast(IntegerType()))
df=df.withColumn("downs",df.downs.cast(IntegerType()))
df=df.withColumn("gilded",df.gilded.cast(IntegerType()))

# Confirm new schema
df.printSchema()
df.show(5)

root
 |-- _c0: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- ups: integer (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- score_hidden: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- id: string (nullable = true)
 |-- removal_reason: string (nullable = true)
 |-- gilded: integer (nullable = true)
 |-- downs: integer (nullable = true)
 |-- archived: string (nullable = true)
 |-- author: string (nullable = true)
 |-- score: string (nullable = true)
 |-- retrieved_on: string (nullable = true)
 |-- body: string (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- controversiality: string (nullable = true)
 |-- parent_id: string (nullable = true)

+--------------------------------+-----------+---

In [39]:
df.count()

15317725

In [40]:
# Remove null in important columns
print(df.filter(df['body'].isNull()).count())
print(df.filter(df['subreddit_id'].isNull()).count())

df=df.filter(df['body'].isNotNull())
df=df.filter(df['subreddit_id'].isNotNull())

df.count()

5315303
2969558


10002410

In [59]:
# some rows have the body in the subreddit column, I'll remove these
print(df.filter(df['subreddit'].rlike('\s')).count())

df = df.filter(df['subreddit'].rlike('^[A-Za-z1-9_]+$'))
df.count() #filter out spaces and special characters except for underscores

5491


9950519

In [60]:
# how many subredits are there?
df.select('subreddit_id').distinct().count()

26882

In [61]:
# get highest volume subreddits
top_sr = df.groupby('subreddit_id').agg({'subreddit_id':'count'})
top_sr.show(5)

+------------+-------------------+
|subreddit_id|count(subreddit_id)|
+------------+-------------------+
|    t5_2skgl|                463|
|    t5_326m9|               3527|
|    t5_32ob1|               1138|
|    t5_2scx7|                 47|
|    t5_380yz|                 11|
+------------+-------------------+
only showing top 5 rows



In [62]:
# add names based on frist appearance
# top_sr = top_sr.withColumn('subreddit',df.filter(df['subreddit_id']==top_sr['subreddit_id']))
top_sr.join(df.select('subreddit','subreddit_id').dropDuplicates(['subreddit']),on='subreddit_id', how='inner').sort(col('count(subreddit_id)').desc()).show(50)
# top_sr.show(50)


# NOTE: It looks like some of the data still has issues because I still have duplicates probably meaning there are some rows that have comment text in the subreddit column.  
#They weren't eliminated because the comments happen to follow subreddit naming conventions.  More work would be helpful here.
# My goal is to elminate unpopular subreddits so we have fewer categories to work with.  Not sure how important working with names really is.

+------------+-------------------+--------------------+
|subreddit_id|count(subreddit_id)|           subreddit|
+------------+-------------------+--------------------+
|    t5_2qh1i|             756074|           AskReddit|
|    t5_2rfxx|             194454|     leagueoflegends|
|    t5_2qh0u|             142840|                pics|
|    t5_2qmg3|             138655|                 nfl|
|    t5_2qh33|             137526|               funny|
|    t5_2qo4s|             136164|                 nba|
|    t5_2qh3l|             118304|                news|
|    t5_2qh1e|             101389|              videos|
|    t5_2qqjc|             100897|       todayilearned|
|    t5_2s580|              97940|               DotA2|
|    t5_2sgp1|              87835|             cquyjeo|
|    t5_2sgp1|              87835|             cquymdf|
|    t5_2sgp1|              87835|        pcmasterrace|
|    t5_2qiel|              86515|              hockey|
|    t5_2s7tt|              86010|       AdviceA