In [3]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, time
import os

In [2]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [4]:
import statsmodels.formula.api as smf
import scipy.stats
from scipy.stats import ttest_ind
from scipy.stats import f_oneway

In [5]:
df_x = pd.read_csv("Data/df_x_nb4a-mis.csv", index_col = 0)
print(df_x.shape)

(171634, 34)


In [6]:
df_x['message_dt'] = pd.to_datetime(df_x["message_dt"], format='%Y-%m-%d %H:%M:00')
df_x['message_date'] = pd.to_datetime(df_x["message_date"], format='%Y-%m-%d')

# AM Usage pattern

In [7]:
df_x['prequar'] = df_x['message_date'] <= pd.Timestamp(2020, 3, 10)
df_x['quar'] = df_x['message_date'] >= pd.Timestamp(2020, 3, 24)
df_x['message_satsun'] = df_x['message_date'].dt.weekday >= 5

In [8]:
df_x['round30m'] = df_x['message_dt'].dt.round('30T').apply(lambda x: x.time())
df_x['roundhr'] = df_x['message_dt'].dt.round('H').apply(lambda x: x.time())

### Baseline

### Graphs

In [9]:
x = [(pd.Timestamp(2017, 1, 1, h), pd.Timestamp(2017, 1, 1, h, 30)) for h in range(0, 24)]
x = [j for i in x for j in i]

y_pre = df_x[df_x['prequar'] & ~df_x['message_satsun'] & \
             (df_x['tel'].str.startswith('+57'))].groupby('round30m').size()
y_quar = df_x[df_x['quar'] & ~df_x['message_satsun'] & \
             (df_x['tel'].str.startswith('+57'))].groupby('round30m').size()

y_pre = y_pre / y_pre.sum()
y_quar = y_quar / y_quar.sum()

In [10]:
fig, ax = plt.subplots(figsize = (8, 6))
ax.plot(x, y_pre, label = 'Before 3/10')
ax.plot(x, y_quar, label = 'After 3/24')

ax.xaxis.set_major_locator(mdates.HourLocator(interval = 8))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))

fig.autofmt_xdate()
plt.xticks(ha='center', rotation=0)
plt.xlabel("UTC-5 / Colombia Time\n(30 Minute Buckets)")
plt.ylabel("Density")
plt.title("Weekday Messages (Colombia Users)")
plt.legend()
plt.savefig('images/ch-coronavirus/usage_weekday_co.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


In [11]:
x = [(pd.Timestamp(2017, 1, 1, h), pd.Timestamp(2017, 1, 1, h, 30)) for h in range(0, 24)]
x = [j for i in x for j in i]

y_pre = df_x[df_x['prequar'] & df_x['message_satsun'] & \
             (df_x['tel'].str.startswith('+57'))].groupby('round30m').size()
y_quar = df_x[df_x['quar'] & df_x['message_satsun'] & \
             (df_x['tel'].str.startswith('+57'))].groupby('round30m').size()

y_pre = y_pre / y_pre.sum()
y_quar = y_quar / y_quar.sum()

In [12]:
fig, ax = plt.subplots(figsize = (8, 6))
ax.plot(x, y_pre, label = 'Before 3/10')
ax.plot(x, y_quar, label = 'After 3/24')

ax.xaxis.set_major_locator(mdates.HourLocator(interval = 8))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))

fig.autofmt_xdate()
plt.xticks(ha='center', rotation=0)
plt.xlabel("UTC-5 / Colombia Time\n(30 Minute Buckets)")
plt.ylabel("Density")
plt.title("Weekend Messages (Colombia Users)")
plt.legend()
plt.savefig('images/ch-coronavirus/usage_weekend_co.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [13]:
x = [pd.Timestamp(2017, 1, 1, h) for h in range(0, 24)]

y_pre = df_x[df_x['prequar'] & ~df_x['message_satsun'] & \
             (df_x['tel'].str.startswith('+58'))].groupby('roundhr').size()
y_quar = df_x[df_x['quar'] & ~df_x['message_satsun'] & \
             (df_x['tel'].str.startswith('+58'))].groupby('roundhr').size()

y_pre = y_pre / y_pre.sum()
y_quar = y_quar / y_quar.sum()

In [14]:
fig, ax = plt.subplots(figsize = (8, 6))
ax.plot(x, y_pre, label = 'Before 3/10')
ax.plot(x, y_quar, label = 'After 3/24')

ax.xaxis.set_major_locator(mdates.HourLocator(interval = 8))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))

fig.autofmt_xdate()
plt.xticks(ha='center', rotation=0)
plt.xlabel("UTC-5 / Colombia Time\n(Hour Buckets)")
plt.ylabel("Density")
plt.title("Weekday Messages (Venezuela Users)")
plt.legend()
plt.savefig('images/ch-coronavirus/usage_weekday_vz.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [15]:
x = [pd.Timestamp(2017, 1, 1, h) for h in range(0, 24)]

y_pre = df_x[df_x['prequar'] & df_x['message_satsun'] & \
             (df_x['tel'].str.startswith('+58'))].groupby('roundhr').size()
y_quar = df_x[df_x['quar'] & df_x['message_satsun'] & \
             (df_x['tel'].str.startswith('+58'))].groupby('roundhr').size()

y_pre = y_pre / y_pre.sum()
y_quar = y_quar / y_quar.sum()

In [16]:
fig, ax = plt.subplots(figsize = (8, 6))
ax.plot(x, y_pre, label = 'Before 3/10')
ax.plot(x, y_quar, label = 'After 3/24')

ax.xaxis.set_major_locator(mdates.HourLocator(interval = 8))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))

fig.autofmt_xdate()
plt.xticks(ha='center', rotation=0)
plt.xlabel("UTC-5 / Colombia Time\n(Hour Buckets)")
plt.ylabel("Density")
plt.title("Weekend Messages (Venezuela Users)")
plt.legend()
plt.savefig('images/ch-coronavirus/usage_weekend_vz.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

### Statistical tests

In [17]:
df_x['5am'] = (df_x['message_dt'].dt.time <= time(5))

In [18]:
prequar = df_x[df_x['prequar'] & ~df_x['message_satsun'] & \
                      df_x['tel'].str.startswith('+57')][['message_date', '5am']].groupby('message_date').mean()

quar = df_x[df_x['quar'] & ~df_x['message_satsun'] & \
                      df_x['tel'].str.startswith('+57')][['message_date', '5am']].groupby('message_date').mean()

In [19]:
print(prequar.mean())
print(quar.mean())
print(ttest_ind(prequar, quar))

5am    0.015088
dtype: float64
5am    0.07753
dtype: float64
Ttest_indResult(statistic=array([-2.72739013]), pvalue=array([0.01128373]))


In [20]:
prequar = df_x[df_x['prequar'] & df_x['message_satsun'] & \
                      df_x['tel'].str.startswith('+57')][['message_date', '5am']].groupby('message_date').mean()

quar = df_x[df_x['quar'] & df_x['message_satsun'] & \
                      df_x['tel'].str.startswith('+57')][['message_date', '5am']].groupby('message_date').mean()

In [21]:
print(prequar.mean())
print(quar.mean())
print(ttest_ind(prequar, quar))

5am    0.04785
dtype: float64
5am    0.073323
dtype: float64
Ttest_indResult(statistic=array([-0.77810524]), pvalue=array([0.45452315]))


# Audio/video lengths

In [22]:
df_text = df_x[df_x['text'].notnull()]
df_audio = df_x[df_x['audio_length'].notnull()]
df_video = df_x[df_x['video_length'].notnull()]

df_text['seconds'] = (df_text['message_dt'] - pd.Timestamp(2020, 2, 13)).dt.seconds
df_audio['seconds'] = (df_audio['message_dt'] - pd.Timestamp(2020, 2, 13)).dt.seconds
df_video['seconds'] = (df_video['message_dt'] - pd.Timestamp(2020, 2, 13)).dt.seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
def get_sec(t):
    if t.count(':') == 1:
        m, s = t.split(':')
        h = 0
    else:
        h, m, s = t.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [24]:
df_audio['audio_length'] = df_audio['audio_length'].apply(get_sec)
df_video['video_length'] = df_video['video_length'].apply(get_sec)
df_text['word_length'] = df_text['text'].str.count(' ')
df_text['char_length'] = df_text['text'].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

## All messages

In [25]:
prequar = df_text[df_text['prequar']]['word_length']
quar = df_text[df_text['quar']]['word_length']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(quar, prequar)

20.796626921540177
22.611962887411988


Ttest_indResult(statistic=3.551933730794416, pvalue=0.0003826366433006887)

In [26]:
reg = smf.ols('word_length ~ seconds', data = df_text).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:            word_length   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     359.1
Date:                Mon, 13 Apr 2020   Prob (F-statistic):           6.03e-80
Time:                        11:02:32   Log-Likelihood:            -5.7884e+05
No. Observations:              101414   AIC:                         1.158e+06
Df Residuals:                  101412   BIC:                         1.158e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     35.4756      0.671     52.860      0.0

In [27]:
prequar = df_text[df_text['prequar']]['char_length']
quar = df_text[df_text['quar']]['char_length']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(quar, prequar)

136.40853363615335
146.12015529380798


Ttest_indResult(statistic=2.974750123063625, pvalue=0.002933171229091544)

In [28]:
reg = smf.ols('char_length ~ seconds', data = df_text).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:            char_length   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     415.0
Date:                Mon, 13 Apr 2020   Prob (F-statistic):           4.53e-92
Time:                        11:03:15   Log-Likelihood:            -7.6733e+05
No. Observations:              101414   AIC:                         1.535e+06
Df Residuals:                  101412   BIC:                         1.535e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    236.2888      4.305     54.882      0.0

In [29]:
prequar = df_audio[df_audio['prequar']]['audio_length']
quar = df_audio[df_audio['quar']]['audio_length']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(quar, prequar)

100.00078451882845
123.34257014201593


Ttest_indResult(statistic=5.657908056735413, pvalue=1.5954884774929313e-08)

In [31]:
reg = smf.ols('audio_length ~ seconds', data = df_audio).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:           audio_length   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.2844
Date:                Mon, 13 Apr 2020   Prob (F-statistic):              0.594
Time:                        11:04:35   Log-Likelihood:                -58459.
No. Observations:                8918   AIC:                         1.169e+05
Df Residuals:                    8916   BIC:                         1.169e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    124.0034      5.242     23.657      0.0

In [30]:
prequar = df_video[df_video['prequar']]['video_length']
quar = df_video[df_video['quar']]['video_length']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(quar, prequar)

79.06282566622004
105.5407249466951


Ttest_indResult(statistic=10.982222076943751, pvalue=6.420266023942567e-28)

In [32]:
reg = smf.ols('video_length ~ seconds', data = df_video).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:           video_length   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     25.72
Date:                Mon, 13 Apr 2020   Prob (F-statistic):           3.99e-07
Time:                        11:04:48   Log-Likelihood:                -99159.
No. Observations:               15596   AIC:                         1.983e+05
Df Residuals:                   15594   BIC:                         1.983e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    111.2357      3.112     35.748      0.0

# Orig

In [33]:
df_orig_audio = df_audio[~df_audio['forwarded'] & ~df_audio['forwarded_highly']]
df_orig_video = df_video[~df_video['forwarded'] & ~df_video['forwarded_highly']]
df_orig_text = df_text[~df_text['forwarded'] & ~df_text['forwarded_highly']]


In [34]:
prequar = df_orig_text[df_orig_text['prequar']]['word_length']
quar = df_orig_text[df_orig_text['quar']]['word_length']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(quar, prequar)

15.573980781789347
16.715894538606403


Ttest_indResult(statistic=2.590971814720861, pvalue=0.00957253440311212)

In [35]:
prequar = df_orig_text[df_orig_text['prequar']]['char_length']
quar = df_orig_text[df_orig_text['quar']]['char_length']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(quar, prequar)

99.77355630189383
107.42286252354049


Ttest_indResult(statistic=2.671593364821832, pvalue=0.007550967902962)

In [36]:
prequar = df_orig_audio[df_orig_audio['prequar']]['audio_length']
quar = df_orig_audio[df_orig_audio['quar']]['audio_length']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(quar, prequar)

72.02225088681071
77.88888888888889


Ttest_indResult(statistic=1.6149782604954377, pvalue=0.10637559691597213)

In [37]:
prequar = df_orig_video[df_orig_video['prequar']]['video_length']
quar = df_orig_video[df_orig_video['quar']]['video_length']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(quar, prequar)

72.22903795233893
93.12815533980583


Ttest_indResult(statistic=7.417833749288304, pvalue=1.328695945339175e-13)

In [39]:
reg = smf.ols('word_length ~ seconds', data = df_orig_text).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:            word_length   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     335.4
Date:                Mon, 13 Apr 2020   Prob (F-statistic):           8.85e-75
Time:                        11:05:41   Log-Likelihood:            -4.8897e+05
No. Observations:               89146   AIC:                         9.779e+05
Df Residuals:                   89144   BIC:                         9.780e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     26.7766      0.570     46.949      0.0

In [38]:
reg = smf.ols('char_length ~ seconds', data = df_orig_text).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:            char_length   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     349.3
Date:                Mon, 13 Apr 2020   Prob (F-statistic):           8.47e-78
Time:                        11:05:32   Log-Likelihood:            -6.5605e+05
No. Observations:               89146   AIC:                         1.312e+06
Df Residuals:                   89144   BIC:                         1.312e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    174.3928      3.716     46.927      0.0

In [40]:
reg = smf.ols('audio_length ~ seconds', data = df_orig_audio).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:           audio_length   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     10.85
Date:                Mon, 13 Apr 2020   Prob (F-statistic):           0.000992
Time:                        11:06:03   Log-Likelihood:                -41410.
No. Observations:                6569   AIC:                         8.282e+04
Df Residuals:                    6567   BIC:                         8.284e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     66.1402      4.702     14.067      0.0

In [41]:
reg = smf.ols('video_length ~ seconds', data = df_orig_video).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:           video_length   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     4.388
Date:                Mon, 13 Apr 2020   Prob (F-statistic):             0.0362
Time:                        11:06:13   Log-Likelihood:                -58760.
No. Observations:                9330   AIC:                         1.175e+05
Df Residuals:                    9328   BIC:                         1.175e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     94.8803      3.684     25.752      0.0

# Messages per user

### Weight by each user/message

In [72]:
prequar = df_x[df_x['prequar']][['tel', 'message_date']].groupby(['tel', 'message_date']).size()
quar = df_x[df_x['quar']][['tel', 'message_date']].groupby(['tel', 'message_date']).size()

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(prequar, quar)

5.505997054078698
6.091910499139415


Ttest_indResult(statistic=-3.0222270368769566, pvalue=0.002511986852479515)

In [64]:
df_fals = pd.DataFrame(df_x[['tel', 'message_date']].groupby(['tel', 'message_date']).size())
df_fals.columns = ['n']

In [65]:
df_fals['date'] = [i[1] for i in df_fals.index]
df_fals['days_since'] = (df_fals['date'] - pd.Timestamp(2020, 2, 13)).dt.days

In [67]:
reg = smf.ols('n ~ days_since', data = df_fals).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:                      n   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9462
Date:                Mon, 13 Apr 2020   Prob (F-statistic):              0.331
Time:                        11:21:24   Log-Likelihood:            -1.2569e+05
No. Observations:               31321   AIC:                         2.514e+05
Df Residuals:                   31319   BIC:                         2.514e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.3518      0.152     35.260      0.0

# Replies

In [76]:
prequar = df_x[df_x['prequar']]['replies_n']
quar = df_x[df_x['quar']]['replies_n']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(prequar, quar)

0.5719181136065428
0.6243996157540826


Ttest_indResult(statistic=-4.88088531512768, pvalue=1.0573399111578853e-06)

In [79]:
df_x['seconds'] = (df_x['message_dt'] - pd.Timestamp(2020, 2, 13)).dt.seconds

In [81]:
prequar = df_x[df_x['prequar'] & (df_x['virality'] > 0)]['virality']
quar = df_x[df_x['quar'] & (df_x['virality'] > 0)]['virality']

print(np.mean(prequar))
print(np.mean(quar))
ttest_ind(prequar, quar)

1.530703628750302
1.6241713657480448


Ttest_indResult(statistic=-7.501287608123921, pvalue=6.423342886998428e-14)

In [82]:
reg = smf.ols('virality ~ seconds', data = df_x[df_x['virality'] > 0]).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               virality   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     4.865
Date:                Mon, 13 Apr 2020   Prob (F-statistic):             0.0274
Time:                        11:30:34   Log-Likelihood:            -1.0557e+05
No. Observations:               62421   AIC:                         2.111e+05
Df Residuals:                   62419   BIC:                         2.112e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.5335      0.015    100.532      0.0