In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import sys
import scipy.stats as stats


In [2]:
counts=pd.read_json('reddit-counts.json.gz',lines=True)

In [3]:
counts

Unnamed: 0,date,subreddit,comment_count
0,2012-02-20,newfoundland,7
1,2015-01-26,Manitoba,1
2,2013-09-07,Yukon,2
3,2014-02-15,saskatchewan,5
4,2014-07-06,canada,1652
...,...,...,...
15465,2012-05-21,Quebec,365
15466,2012-05-21,britishcolumbia,4
15467,2013-09-07,britishcolumbia,5
15468,2011-09-10,Quebec,2


In [4]:
counts['year'] = pd.DatetimeIndex(counts['date']).year
counts

Unnamed: 0,date,subreddit,comment_count,year
0,2012-02-20,newfoundland,7,2012
1,2015-01-26,Manitoba,1,2015
2,2013-09-07,Yukon,2,2013
3,2014-02-15,saskatchewan,5,2014
4,2014-07-06,canada,1652,2014
...,...,...,...,...
15465,2012-05-21,Quebec,365,2012
15466,2012-05-21,britishcolumbia,4,2012
15467,2013-09-07,britishcolumbia,5,2013
15468,2011-09-10,Quebec,2,2011


In [5]:
counts =(counts[counts['subreddit']=='canada'])

In [6]:
counts = counts[counts['year'] >= 2012]
counts = counts[counts['year'] <= 2013]
counts

Unnamed: 0,date,subreddit,comment_count,year
66,2013-03-14,canada,1657,2013
69,2013-07-08,canada,1369,2013
97,2012-07-04,canada,1343,2012
115,2013-03-11,canada,1619,2013
165,2013-09-11,canada,1909,2013
...,...,...,...,...
15389,2013-01-01,canada,2113,2013
15413,2013-07-27,canada,1070,2013
15430,2012-10-19,canada,1486,2012
15456,2012-01-15,canada,1256,2012


In [7]:
counts['dayofweek'] = counts['date'].dt.weekday
counts

Unnamed: 0,date,subreddit,comment_count,year,dayofweek
66,2013-03-14,canada,1657,2013,3
69,2013-07-08,canada,1369,2013,0
97,2012-07-04,canada,1343,2012,2
115,2013-03-11,canada,1619,2013,0
165,2013-09-11,canada,1909,2013,2
...,...,...,...,...,...
15389,2013-01-01,canada,2113,2013,1
15413,2013-07-27,canada,1070,2013,5
15430,2012-10-19,canada,1486,2012,4
15456,2012-01-15,canada,1256,2012,6


In [8]:
weekends=counts[(counts['dayofweek']).isin([5,6])]


In [9]:
weekends = weekends.reset_index(drop=True)

In [10]:
weekends

Unnamed: 0,date,subreddit,comment_count,year,dayofweek
0,2012-02-04,canada,1196,2012,5
1,2012-11-17,canada,1570,2012,5
2,2013-07-14,canada,908,2013,6
3,2013-06-22,canada,984,2013,5
4,2012-07-29,canada,1199,2012,6
...,...,...,...,...,...
204,2012-11-04,canada,1772,2012,6
205,2012-12-09,canada,1350,2012,6
206,2012-12-02,canada,1725,2012,6
207,2013-07-27,canada,1070,2013,5


In [11]:
weekdays=counts[~((counts['dayofweek']).isin([5,6]))]

In [12]:
weekdays = weekdays.reset_index(drop=True)

In [13]:
weekdays

Unnamed: 0,date,subreddit,comment_count,year,dayofweek
0,2013-03-14,canada,1657,2013,3
1,2013-07-08,canada,1369,2013,0
2,2012-07-04,canada,1343,2012,2
3,2013-03-11,canada,1619,2013,0
4,2013-09-11,canada,1909,2013,2
...,...,...,...,...,...
517,2013-04-10,canada,2021,2013,2
518,2013-03-19,canada,1630,2013,1
519,2013-01-01,canada,2113,2013,1
520,2012-10-19,canada,1486,2012,4


In [14]:
weekdays_mean = weekdays['comment_count'].mean()


In [15]:
weekends_mean = weekends['comment_count'].mean()


In [16]:
#Tests without transformation 
ttest=stats.ttest_ind(weekdays['comment_count'], weekends['comment_count'] ).pvalue
levenetest=stats.levene(weekdays['comment_count'], weekends['comment_count']).pvalue
weekdaysnormal=stats.normaltest(weekdays['comment_count']).pvalue
weekendsnormal=stats.normaltest(weekends['comment_count']).pvalue

In [17]:
#Transformations

In [18]:
#Logarithmic
weekdaysnormal_log = stats.normaltest(np.log(weekdays['comment_count'])).pvalue
weekendsnormal_log= stats.normaltest(np.log(weekends['comment_count'])).pvalue
levenetest_log = stats.levene(np.log(weekdays['comment_count']),np.log(weekends['comment_count'])).pvalue
levenetest_log

0.0004190759393372205

In [19]:
#exponential
weekdaysnormal_exp = stats.normaltest(np.exp(weekdays['comment_count'])).pvalue
weekendsnormal_exp= stats.normaltest(np.exp(weekends['comment_count'])).pvalue
levenetest_exp = stats.levene(np.exp(weekdays['comment_count']),np.exp(weekends['comment_count'])).pvalue
levenetest_exp

  result = getattr(ufunc, method)(*inputs, **kwargs)
  a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis)
  Zij[i] = abs(asarray(args[i]) - Yci[i])


nan

In [20]:
#square root
weekdaysnormal_sqrt = stats.normaltest(np.sqrt(weekdays['comment_count'])).pvalue
weekendsnormal_sqrt= stats.normaltest(np.sqrt(weekends['comment_count'])).pvalue
levenetest_sqrt = stats.levene(np.sqrt(weekdays['comment_count']),np.sqrt(weekends['comment_count'])).pvalue
levenetest_sqrt

0.5560544297516696

In [21]:
#square
levenetest_square=stats.levene(weekdays['comment_count']**2, weekends['comment_count']**2).pvalue
weekdaysnormal_square=stats.normaltest(weekdays['comment_count']**2).pvalue
weekendsnormal_square=stats.normaltest(weekends['comment_count']**2).pvalue
levenetest_square

7.391434727467261e-08

In [22]:
#Square root transformation is closest to the normal distributions so we pick that one

In [23]:
#Applying Central limit theorem

In [25]:
weekdays['week'] = weekdays['date'].apply(lambda x: str(x.isocalendar()[1])).apply(pd.Series)

In [36]:
weekends['week'] = weekends['date'].apply(lambda x: str(x.isocalendar()[1])).apply(pd.Series)

In [35]:
weekdays_mean = weekdays.groupby(['year', 'week']).aggregate('mean').reset_index()
weekends_mean = weekends.groupby(['year', 'week']).aggregate('mean').reset_index()

In [37]:
weekly_weekday_normality_p= stats.normaltest(weekdays_mean['comment_count']).pvalue
weekly_weekend_normality_p= stats.normaltest(weekends_mean['comment_count']).pvalue
weekly_levene_p = stats.levene(weekdays_mean['comment_count'],weekends_mean['comment_count']).pvalue
weekly_ttest_p = stats.ttest_ind(weekdays_mean['comment_count'], weekends_mean['comment_count']).pvalue


In [38]:
utest_p= stats.mannwhitneyu(weekdays['comment_count'],weekends['comment_count'], alternative= 'two-sided').pvalue

In [39]:
    print(OUTPUT_TEMPLATE.format(
        initial_ttest_p=ttest,
        initial_weekday_normality_p=weekdaysnormal,
        initial_weekend_normality_p=weekendsnormal,
        initial_levene_p=levenetest,
        transformed_weekday_normality_p=weekdaysnormal_sqrt ,
        transformed_weekend_normality_p=weekendsnormal_sqrt,
        transformed_levene_p=levenetest_sqrt,
        
        weekly_weekday_normality_p=weekly_weekday_normality_p,
        weekly_weekend_normality_p=weekly_weekend_normality_p,
        weekly_levene_p=weekly_levene_p,
        weekly_ttest_p=weekly_ttest_p,
        utest_p=utest_p,
    ))

NameError: name 'OUTPUT_TEMPLATE' is not defined