In [1]:
import pandas as pd
import numpy as np
import sys
import datetime
import scipy.stats as stats
import matplotlib as plt

In [2]:
counts = pd.read_json('reddit-counts.json.gz', lines=True)

In [3]:
counts = counts[counts['date'] >= datetime.date(2012, 1, 1)]
counts = counts[counts['date'] <= datetime.date(2013, 12, 31)]
counts = counts[counts['subreddit'] == 'canada']

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  """Entry point for launching an IPython kernel.
'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  


In [4]:
counts

Unnamed: 0,comment_count,date,subreddit
66,1657,2013-03-14,canada
69,1369,2013-07-08,canada
97,1343,2012-07-04,canada
115,1619,2013-03-11,canada
165,1909,2013-09-11,canada
179,1196,2012-02-04,canada
181,2112,2012-08-08,canada
182,1929,2012-08-09,canada
193,2086,2012-01-17,canada
251,1570,2012-11-17,canada


In [5]:
#weekdays = counts[(counts['date'].map(lambda date: datetime.date.weekday(date)) != 5) & (counts['date'].map(lambda date: datetime.date.weekday(date)) != 6)]

In [6]:
#weekdays

In [7]:
weekdays = counts[(counts['date'].apply(datetime.date.weekday) != 5) & (counts['date'].apply(datetime.date.weekday) != 6)]

In [50]:
weekdays['comment_count'].mean()

1823.5785440613026

In [9]:
weekends = counts[(counts['date'].apply(datetime.date.weekday) == 5) | (counts['date'].apply(datetime.date.weekday) == 6)]

In [49]:
weekends['comment_count'].mean()

1269.5071770334928

In [11]:
stats.ttest_ind(weekends['comment_count'], weekdays['comment_count'])

Ttest_indResult(statistic=-17.70431974179247, pvalue=1.3005502847207912e-58)

In [12]:
stats.normaltest(weekends['comment_count'])

NormaltestResult(statistic=12.976880141083813, pvalue=0.0015209196859635404)

In [13]:
stats.normaltest(weekdays['comment_count'])

NormaltestResult(statistic=32.21804641032879, pvalue=1.0091137251707994e-07)

In [14]:
stats.levene(weekends['comment_count'], weekdays['comment_count'])

LeveneResult(statistic=4.07889686696194, pvalue=0.04378740989202803)

In [15]:
weekends_log = weekends['comment_count'].apply(np.log)
weekdays_log = weekdays['comment_count'].apply(np.log)

In [16]:
#weekends_log

In [17]:
stats.normaltest(weekends_log)

NormaltestResult(statistic=2.310753456351079, pvalue=0.31493886820667)

In [18]:
stats.normaltest(weekdays_log)

NormaltestResult(statistic=15.640150708373614, pvalue=0.00040159142006827235)

In [19]:
stats.levene(weekends_log, weekdays_log)

LeveneResult(statistic=12.560671586444855, pvalue=0.0004190759393372205)

In [20]:
weekends_exp = weekends['comment_count'].apply(np.exp)
weekdays_exp = weekdays['comment_count'].apply(np.exp)

In [21]:
stats.normaltest(weekends_exp)

  a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis)


NormaltestResult(statistic=nan, pvalue=nan)

In [22]:
stats.normaltest(weekends_exp)

NormaltestResult(statistic=nan, pvalue=nan)

In [23]:
stats.levene(weekends_exp, weekdays_exp)

  Zij[i] = abs(asarray(args[i]) - Yci[i])


LeveneResult(statistic=nan, pvalue=nan)

In [24]:
weekends_sqrt = weekends['comment_count'].apply(np.sqrt)
weekdays_sqrt = weekdays['comment_count'].apply(np.sqrt)

In [25]:
stats.normaltest(weekends_sqrt)

NormaltestResult(statistic=4.458564637984015, pvalue=0.10760562894666933)

In [26]:
stats.normaltest(weekdays_sqrt)

NormaltestResult(statistic=6.600593923705198, pvalue=0.03687221613365365)

In [27]:
stats.levene(weekends_sqrt, weekdays_sqrt)

LeveneResult(statistic=0.34690221366480645, pvalue=0.5560544297516696)

In [28]:
weekends_sq2 = weekends['comment_count']**2
weekdays_sq2 = weekdays['comment_count']**2

In [29]:
stats.normaltest(weekends_sq2)

NormaltestResult(statistic=48.46552571941154, pvalue=2.991195568683962e-11)

In [30]:
stats.normaltest(weekdays_sq2)

NormaltestResult(statistic=131.50515994471635, pvalue=2.7798243005635955e-29)

In [31]:
stats.levene(weekends_sq2, weekdays_sq2)

LeveneResult(statistic=29.56281874435813, pvalue=7.391434727467261e-08)

In [32]:
weekdays_isodate = weekdays['date'].apply(datetime.date.isocalendar).apply(pd.Series)

In [33]:
del weekdays_isodate[2]

In [34]:
weekdays_isodate.columns = ['year','week']

In [35]:
weekdays_group = pd.concat([weekdays, weekdays_isodate], axis = 1)

In [36]:
weekdays_group = weekdays_group.groupby(['year','week']).agg('mean')

In [37]:
weekends_isodate = weekends['date'].apply(datetime.date.isocalendar).apply(pd.Series)

In [38]:
del weekends_isodate[2]

In [39]:
weekends_isodate.columns = ['year','week']

In [40]:
weekends_group = pd.concat([weekends, weekends_isodate], axis = 1)

In [41]:
weekends_group = weekends_group.groupby(['year','week']).agg('mean')

In [42]:
weekends_isodate

Unnamed: 0,year,week
179,2012,5
251,2012,46
401,2013,28
479,2013,25
495,2012,30
536,2013,38
636,2013,1
880,2013,2
894,2013,36
937,2012,19


In [43]:
weekends_group

Unnamed: 0_level_0,Unnamed: 1_level_0,comment_count
year,week,Unnamed: 2_level_1
2011,52,995.0
2012,1,1163.0
2012,2,1372.0
2012,3,915.5
2012,4,1285.0
2012,5,1228.0
2012,6,980.5
2012,7,1273.5
2012,8,1336.5
2012,9,1021.0


In [44]:
stats.normaltest(weekdays_group['comment_count'])

NormaltestResult(statistic=2.35359913339607, pvalue=0.3082637390825463)

In [45]:
stats.normaltest(weekends_group['comment_count'])

NormaltestResult(statistic=3.7552982609164878, pvalue=0.15294924717078442)

In [46]:
stats.levene(weekdays_group['comment_count'], weekends_group['comment_count'])

LeveneResult(statistic=1.6248579510098702, pvalue=0.20383788083573426)

In [47]:
stats.ttest_ind(weekends_group['comment_count'], weekdays_group['comment_count'])

Ttest_indResult(statistic=-14.885763186916106, pvalue=1.3353656052303144e-34)

In [48]:
stats.mannwhitneyu(weekends['comment_count'],weekdays['comment_count'])

MannwhitneyuResult(statistic=15099.0, pvalue=4.3122266173669665e-53)