In [1]:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import col, concat, lit, lower, round, split, udf

In [2]:
# Start (or reuse) the local Spark context for this notebook.
# getOrCreate() hands back the already-running context when this cell is
# re-run; calling the SparkContext constructor a second time in the same
# kernel raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("First App")
)

In [3]:
# Wrap the Spark context in a SQLContext, which the following cells use to
# read the CSV into a DataFrame.
sqlCtx = SQLContext(sparkContext=sc)

In [4]:
# Load the Hacker News export. The first row is the header, column types are
# inferred, and the double quote acts as both the quote and the escape
# character. Missing point/comment counts are then replaced with 0.
df = (
    sqlCtx.read
    .csv(
        "/home/jovyan/work/data-science/Exploring Hacker News Posts/hacker_news.csv",
        header=True,
        inferSchema=True,
        quote='"',
        escape='"',
    )
    .na.fill(0, subset=['num_points', 'num_comments'])
)

In [5]:
# Show the (column name, type) pairs Spark inferred while reading the CSV.
df.dtypes

[('id', 'int'),
 ('title', 'string'),
 ('url', 'string'),
 ('num_points', 'int'),
 ('num_comments', 'int'),
 ('author', 'string'),
 ('created_at', 'string')]

In [6]:
# Add a lower-cased copy of the title so the prefix checks below are
# case-insensitive, then split out the "Show HN" and "Ask HN" posts.
df = df.withColumn('title_lower', lower(df['title']))
show_posts = df.where(col('title_lower').startswith('show hn'))
ask_posts = df.where(col('title_lower').startswith('ask hn'))

In [7]:
def _comment_stats(posts):
    """Return ``(total_comments, post_count)`` for the DataFrame ``posts``.

    Both values come from a single ``selectExpr`` aggregation, and the comment
    total is selected by column name rather than by its position in a
    ``groupBy().sum()`` row, so it stays correct if the set or order of
    numeric columns changes.
    """
    stats = posts.selectExpr(
        "sum(num_comments) AS total_comments",
        "count(*) AS post_count",
    ).first()
    # sum() over zero rows is NULL (None in Python); treat that as a total of 0.
    return stats["total_comments"] or 0, stats["post_count"]


def _mean_comments(total, count):
    """Return ``total / count``, or NaN when there are no posts to average."""
    return total / count if count else float("nan")


total_show_comments, show_post_count = _comment_stats(show_posts)
total_ask_comments, ask_post_count = _comment_stats(ask_posts)
avg_show_comments = _mean_comments(total_show_comments, show_post_count)
avg_ask_comments = _mean_comments(total_ask_comments, ask_post_count)

print("Average of comments in show posts: %.2f" % avg_show_comments)
print("Average of comments in ask posts: %.2f" % avg_ask_comments)

Average of comments in show posts: 10.32
Average of comments in ask posts: 14.04


In [8]:
def get_hour(created_at):
    """Build a Column with the hour of ``created_at`` formatted as ``"<hour>:00"``.

    ``created_at`` is a column name or Column of strings. The hour is the text
    before the first ':' in the second space-separated token, the same
    extraction the previous Python lambda performed, but expressed with Spark
    built-in functions so rows are not shipped through a Python UDF. A null
    ``created_at`` produces a null hour instead of raising inside a UDF.
    """
    time_part = split(created_at, " ").getItem(1)
    hour = split(time_part, ":").getItem(0)
    return concat(hour, lit(":00"))


ask_posts = ask_posts.withColumn('hour', get_hour('created_at'))

In [9]:
# Top five posting hours for Ask HN posts, ranked by average comment count.
# `round` here is pyspark.sql.functions.round (imported at the top), not the
# Python builtin.
(
    ask_posts.groupBy("hour")
    .avg('num_comments')
    # Rank on the unrounded average; rounding is applied afterwards.
    .sort('avg(num_comments)', ascending=False)
    # Take the top five inside Spark so only those rows reach the driver,
    # instead of collecting every group and slicing the pandas frame.
    .limit(5)
    .withColumn('avg_num_comments', round('avg(num_comments)', 2))
    .drop('avg(num_comments)')
    .toPandas()
)

Unnamed: 0,hour,avg_num_comments
0,15:00,38.59
1,2:00,23.81
2,20:00,21.53
3,16:00,16.8
4,21:00,16.01
