In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, time

In [2]:
import ast # string rep. of list to list

In [3]:
import statsmodels.formula.api as smf
import scipy.stats
from scipy.stats import ttest_ind

In [4]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Font-size presets; only BIGGER_SIZE is applied in the rc settings below.
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

# Explicitly register pandas' datetime converters with matplotlib so that
# plotting pd.Timestamp values later in the notebook does not depend on
# (and warn about) the implicit registration done at pandas import time.
pd.plotting.register_matplotlib_converters()

plt.rc('font', size=BIGGER_SIZE)          # default text size
plt.rc('axes', titlesize=BIGGER_SIZE)     # axes title
plt.rc('axes', labelsize=BIGGER_SIZE)     # x and y axis labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # x tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # y tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend text
plt.rc('figure', titlesize=BIGGER_SIZE)   # figure-level title

In [9]:
# Load the message-level table; the first CSV column is used as the index.
df_x = pd.read_csv(filepath_or_buffer='Data/df_x_nb3-reply.csv', index_col=0)
n_rows, n_cols = df_x.shape
print((n_rows, n_cols))

(171634, 30)


In [10]:
# reply_link: nullable 32-bit integer, so missing links stay <NA> instead of forcing floats.
df_x['reply_link'] = df_x['reply_link'].astype('Int32')
# reply_list is stored as the string repr of a list; literal_eval turns it back into a list.
df_x['reply_list'] = df_x['reply_list'].apply(ast.literal_eval)

In [11]:
# Parse the timestamp columns with explicit formats (minute resolution for
# message_dt: the seconds field must be the literal "00").
for dt_col, dt_format in (('message_dt', '%Y-%m-%d %H:%M:00'),
                          ('message_date', '%Y-%m-%d')):
    df_x[dt_col] = pd.to_datetime(df_x[dt_col], format=dt_format)

# General statistics

In [12]:
# Total number of rows (messages) in df_x.
len(df_x)

171634

In [13]:
# Traced replies: rows with a positive reply_link vs. rows that have any reply_to value.
is_traced = df_x['reply_link'] > 0
is_reply = df_x['reply_to'].notnull()
trace = len(df_x[is_traced])
actual = int(is_reply.sum())
for value in (trace, actual, trace/actual):
    print(value)

43913
49212
0.8923230106478095


## Messages that were replied to

In [14]:
# Number of messages that received at least one reply.
int((df_x['replies_n'] > 0).sum())

34444

In [15]:
# Histogram of reply counts for replied-to messages (values outside 1..10 are not drawn).
replied_counts = df_x.loc[df_x['replies_n'] > 0, 'replies_n']
hist_ax = replied_counts.hist(range = (1, 10), bins = 10)
hist_ax.set_xlabel("Number of Replies Received")
hist_ax.set_ylabel("# of Messages")
hist_ax.figure.savefig('images/ch-replycascades/hist_replies_n.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [16]:
# Messages with exactly one reply.
int(df_x['replies_n'].eq(1).sum())

16959

In [17]:
# Messages with exactly two replies.
int(df_x['replies_n'].eq(2).sum())

6944

In [18]:
# Messages whose reply count is strictly between 0 and 5.
replies = df_x['replies_n']
int((replies.gt(0) & replies.lt(5)).sum())

29478

### Content type

In [19]:
# Share of messages with non-null `text` that received at least one reply.
has_text = df_x['text'].notnull()
got_reply = df_x['replies_n'] > 0
withrep = int((got_reply & has_text).sum())
n = int(has_text.sum())

for value in (withrep, n, withrep/n):
    print(value)

25191
101414
0.24839765712820716


In [20]:
# Share of messages with non-null `image` that received at least one reply.
has_image = df_x['image'].notnull()
got_reply = df_x['replies_n'] > 0
withrep = int((got_reply & has_image).sum())
n = int(has_image.sum())

for value in (withrep, n, withrep/n):
    print(value)

6768
38455
0.17599791964633987


In [21]:
# Share of messages with non-null `video_length` that received at least one reply.
has_video = df_x['video_length'].notnull()
got_reply = df_x['replies_n'] > 0
withrep = int((got_reply & has_video).sum())
n = int(has_video.sum())

for value in (withrep, n, withrep/n):
    print(value)

1872
15596
0.12003077712233906


In [22]:
# Share of messages with non-null `emojis` that received at least one reply.
has_emojis = df_x['emojis'].notnull()
got_reply = df_x['replies_n'] > 0
withrep = int((got_reply & has_emojis).sum())
n = int(has_emojis.sum())

for value in (withrep, n, withrep/n):
    print(value)

4305
28886
0.14903413418264905


In [23]:
# Share of messages with non-null `audio_length` that received at least one reply.
has_audio = df_x['audio_length'].notnull()
got_reply = df_x['replies_n'] > 0
withrep = int((got_reply & has_audio).sum())
n = int(has_audio.sum())

for value in (withrep, n, withrep/n):
    print(value)

2053
8918
0.23020856694326083


### Forwarded content

In [24]:
# Reply rate among messages flagged `forwarded` or `forwarded_highly`.
is_forwarded = df_x['forwarded'] | df_x['forwarded_highly']
got_reply = df_x['replies_n'] > 0
withrep = len(df_x[got_reply & is_forwarded])
n = len(df_x[is_forwarded])

for value in (withrep, n, withrep/n):
    print(value)

1354
26168
0.05174258636502599


In [25]:
# Reply rate among image messages that carry neither forwarded flag.
# Original author's note: SLIGHTLY HIGHER
not_forwarded = ~(df_x['forwarded'] | df_x['forwarded_highly'])
unforwarded_image = not_forwarded & df_x['image'].notnull()
withrep = len(df_x[(df_x['replies_n'] > 0) & unforwarded_image])
n = len(df_x[unforwarded_image])

for value in (withrep, n, withrep/n):
    print(value)

6416
33415
0.19200957653748316


## By nationality

In [26]:
# Messages whose sender phone number (`tel`) starts with the +57 prefix.
has_prefix_57 = df_x['tel'].str.startswith('+57')
df_c = df_x.loc[has_prefix_57]

In [27]:
# Messages whose sender phone number (`tel`) starts with the +58 prefix.
has_prefix_58 = df_x['tel'].str.startswith('+58')
df_v = df_x.loc[has_prefix_58]

In [28]:
# For each content column, print the share of non-null rows in df_c, then in df_v.
for col in ['image', 'text', 'video_length', 'audio_length']:
    print(col)
    for subset in (df_c, df_v):
        print(int(subset[col].notnull().sum()) / len(subset))
    print()

image
0.20055363877076146
0.2420066745613091

text
0.6256519297119474
0.6028636021100227

video_length
0.08243066142448313
0.07270965658305523

audio_length
0.05255556447083367
0.04661427494886425



In [29]:
# Fraction of messages with at least one reply, df_c first and df_v second.
for subset in (df_c, df_v):
    print(subset['replies_n'].gt(0).mean())

0.20650458691058868
0.17937345247066422


In [30]:
# Two-sample t-tests on boolean indicators (df_c vs. df_v), printed in this order:
# received a reply, has an image, has text.
indicator_pairs = [
    (df_c['replies_n'] > 0, df_v['replies_n'] > 0),
    (df_c['image'].notnull(), df_v['image'].notnull()),
    (df_c['text'].notnull(), df_v['text'].notnull()),
]
for flags_c, flags_v in indicator_pairs:
    print(ttest_ind(flags_c, flags_v))

Ttest_indResult(statistic=11.572311999436215, pvalue=5.91035158690587e-31)
Ttest_indResult(statistic=-17.058287055883227, pvalue=3.6152432544988e-65)
Ttest_indResult(statistic=7.936455602334143, pvalue=2.0980350618193418e-15)


# By time of day

In [31]:
# Bucket each message into its nearest half hour and keep only the time of day.
# '30min' replaces the deprecated '30T' alias, and the vectorised .dt.time
# accessor replaces a per-element apply(lambda x: x.time()).
# Note: times from 23:45 onwards round up into the 00:00 bucket.
df_x['round_hour'] = df_x['message_dt'].dt.round('30min').dt.time

In [32]:
# x: the 48 half-hour marks of a dummy day (2017-01-01), used only as a time axis.
x = [pd.Timestamp(2017, 1, 1, hour, minute) for hour in range(24) for minute in (0, 30)]

# y / y_text: mean replies_n per half-hour bucket, for all messages and for
# messages with non-null text. Both are (n_buckets, 1) arrays sorted by bucket.
# NOTE(review): plotting against x assumes every one of the 48 buckets occurs in the data.
slot_cols = ['round_hour', 'replies_n']
y = df_x[slot_cols].groupby('round_hour').mean().values
text_rows = df_x.loc[df_x['text'].notnull(), slot_cols]
y_text = text_rows.groupby('round_hour').mean().values

In [33]:
# Average replies received by send time: one line for all messages, one for text-only.
fig, ax = plt.subplots(figsize = (6, 4))
for series, series_label in ((y, 'All Messages'), (y_text, 'Text Messages')):
    ax.plot(x, series, label = series_label)

# One major tick every 8 hours, rendered as clock time.
ax.xaxis.set_major_locator(mdates.HourLocator(interval = 8))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))

fig.autofmt_xdate()
plt.xticks(ha='center', rotation=0)  # override the tick-label styling applied by autofmt_xdate
ax.set_xlabel("Time Message Is Sent\n(30 Minute Buckets)")
ax.set_ylabel("Avg. # of\nReplies Received")
ax.legend()
fig.savefig('images/ch-replycascades/time_of_day.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


In [34]:
def replyToNotNull(df):
    """Return the fraction of rows in `df` whose 'reply_to' value is not missing."""
    is_reply = df['reply_to'].notna()
    return is_reply.mean()

In [35]:
# Share of messages that are replies (non-null reply_to) in each half-hour bucket.
halfhour_groups = df_x[['round_hour', 'reply_to']].groupby('round_hour')
reply_share_by_slot = halfhour_groups.apply(replyToNotNull)
y = reply_share_by_slot.values

In [36]:
# Percentage of messages that are replies, by half-hour send time.
fig, ax = plt.subplots(figsize = (6, 4))
percent_replies = y * 100
ax.plot(x, percent_replies)

# One major tick every 8 hours, rendered as clock time.
ax.xaxis.set_major_locator(mdates.HourLocator(interval = 8))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))

fig.autofmt_xdate()
plt.xticks(ha='center', rotation=0)  # override the tick-label styling applied by autofmt_xdate
ax.set_ylabel("What % of messages\nare replies?")
fig.savefig('images/ch-replycascades/time_of_day_areReplies.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

# Average replies_n within groups

In [37]:
# Load the group-level table; the first CSV column is used as the index.
df_groups = pd.read_csv(filepath_or_buffer='Data/df_groups_nb2b-concentration.csv', index_col=0)

In [38]:
# Mean replies_n per uid, aligned onto df_groups by index.
# NOTE(review): assumes df_groups is indexed by uid — confirm.
mean_replies_by_uid = df_x.groupby('uid')['replies_n'].mean()
df_groups['replies_n'] = mean_replies_by_uid

In [39]:
# Persist the group table (index included) with the new replies_n column.
df_groups.to_csv(path_or_buf='Data/df_groups_nb3a-reply.csv')

In [40]:
# Number of groups whose average reply count is exactly zero.
n_zero_reply_groups = df_groups['replies_n'].eq(0).sum()
print(n_zero_reply_groups)

50


In [41]:
# Distribution of the per-group average reply count.
hist_ax = df_groups['replies_n'].hist(bins = 20)
hist_ax.set_xlabel("Avg. Number of Replies\n(To All Messages)")
hist_ax.set_ylabel("# of Groups")
hist_ax.figure.savefig('images/ch-replycascades/hist_group_replies_n.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [42]:
# OLS of the group-level average reply count on the three concentration measures, all groups.
ols_all_groups = smf.ols(formula = 'replies_n ~ entropy + hhConc + gini', data = df_groups)
reg = ols_all_groups.fit()
print(reg.summary())
# Original author's note: strong in HHCONC

                            OLS Regression Results                            
Dep. Variable:              replies_n   R-squared:                       0.253
Model:                            OLS   Adj. R-squared:                  0.240
Method:                 Least Squares   F-statistic:                     19.21
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           8.90e-11
Time:                        22:49:57   Log-Likelihood:                -51.493
No. Observations:                 174   AIC:                             111.0
Df Residuals:                     170   BIC:                             123.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2548      0.100      2.551      0.0

In [43]:
# Same regression, restricted to groups with a positive average reply count.
groups_with_replies = df_groups[df_groups['replies_n'] > 0]
ols_with_replies = smf.ols(formula = 'replies_n ~ entropy + hhConc + gini', data = groups_with_replies)
reg = ols_with_replies.fit()
print(reg.summary())
# Original author's note: strong in HHCONC

                            OLS Regression Results                            
Dep. Variable:              replies_n   R-squared:                       0.182
Model:                            OLS   Adj. R-squared:                  0.162
Method:                 Least Squares   F-statistic:                     8.906
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           2.25e-05
Time:                        22:49:58   Log-Likelihood:                -48.593
No. Observations:                 124   AIC:                             105.2
Df Residuals:                     120   BIC:                             116.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3855      0.137      2.819      0.0

In [44]:
# One scatter plot per concentration measure, with a degree-1 least-squares line on top.
cols = ['hhConc', 'entropy']
labs = ['H-H Concentration', 'Entropy']

for c, axis_label in zip(cols, labs):
    measure = df_groups[c]
    avg_replies = df_groups['replies_n']

    plt.scatter(measure, avg_replies, alpha = 0.3)
    plt.xlabel(axis_label)
    plt.ylabel("Average # of Replies")
    plt.title("All Groups")

    # polyfit returns coefficients highest degree first: slope, then intercept.
    m, b = np.polyfit(measure, avg_replies, 1)
    plt.plot(measure, m*measure + b, color = 'orange')

    plt.savefig('images/ch-replycascades/scatter_%s_replies_n.png' % c, bbox_inches = 'tight', pad_inches = 0.05)
    plt.close()

## Competing messages

In [None]:
def n_msg_10msurround(row, frame=None, half_window=timedelta(minutes = 5)):
    """Count the other messages in `row`'s group sent around the same time.

    A message is counted when it has the same `uid` as `row` and its
    `message_dt` lies within `half_window` of `row.message_dt`, both
    boundaries inclusive.

    Parameters
    ----------
    row : object with `uid` and `message_dt` fields (e.g. a row of `df_x`).
    frame : DataFrame with `uid` and `message_dt` columns, optional.
        Table to search. Defaults to the notebook-level `df_x`.
    half_window : timedelta, optional
        Half-width of the time window. The default of 5 minutes gives the
        10-minute window the function is named after.

    Returns
    -------
    int
        Number of matching rows minus one. The subtraction removes `row`
        itself, so the result is -1 if `row` is not part of `frame`.
    """
    if frame is None:
        frame = df_x  # original behaviour: search the notebook-level frame

    same_group = frame['uid'] == row['uid']
    not_too_late = frame['message_dt'] <= row['message_dt'] + half_window
    not_too_early = frame['message_dt'] >= row['message_dt'] - half_window

    # Count directly on the boolean mask instead of building a filtered frame.
    return int((same_group & not_too_late & not_too_early).sum()) - 1

In [None]:
# Only messages that received at least one reply get a competing-message count.
# n_msg_10msurround scans the whole frame for every row it is applied to, so
# apply it to the replied-to subset only rather than to every row and then
# discarding most results. Index alignment in the .loc assignment gives the
# same column; rows without replies stay NaN.
has_replies = df_x['replies_n'] > 0
df_x.loc[has_replies, 'n_msg_10msurround'] = df_x[has_replies].apply(n_msg_10msurround, axis = 1)

In [None]:
# Cache the frame (index included) so the slow surround count need not be recomputed.
df_x.to_csv(path_or_buf='Data/intermediate/df_x_nb3a-surround.csv')

### Reimport

In [5]:
# Reload the cached frame. The file was written with its index as the first
# column, so read it back as the index (as the original load of df_x does)
# rather than leaving it as a stray data column.
# NOTE(review): date and list columns are not re-parsed here; the cells below
# only use replies_n and n_msg_10msurround.
df_x = pd.read_csv('Data/intermediate/df_x_nb3a-surround.csv', index_col=0)

In [6]:
# Distribution of the competing-message count (the column is only filled for replied-to messages).
hist_ax = df_x['n_msg_10msurround'].hist(bins = 20)

hist_ax.set_title("Messages with Replies")
hist_ax.set_xlabel('Number of "Competing" Messages\nin $[t-5, t+5]$ Window')
hist_ax.set_ylabel('# of Messages')
hist_ax.figure.savefig('images/ch-replycascades/competing_msg.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [7]:
# Share of replied-to messages with fewer than 10 competing messages in the window.
competing_for_replied = df_x.loc[df_x['replies_n'] > 0, 'n_msg_10msurround']
competing_for_replied.lt(10).mean()

0.4398153524561607

In [8]:
# Share of replied-to messages with fewer than 20 competing messages in the window.
competing_for_replied = df_x.loc[df_x['replies_n'] > 0, 'n_msg_10msurround']
competing_for_replied.lt(20).mean()

0.6489664382766229