In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, time
import os

In [2]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
import statsmodels.formula.api as smf
import scipy.stats
from scipy.stats import ttest_ind
from scipy.stats import f_oneway

In [4]:
df_x = pd.read_csv("Data/df_x_nb4a-mis.csv", index_col = 0)
print(df_x.shape)

(171634, 34)


In [5]:
df_x['text'] = df_x['text'].fillna('')
df_x['textlower'] = df_x['textlower'].fillna('')
df_x['message_dt'] = pd.to_datetime(df_x["message_dt"], format='%Y-%m-%d %H:%M:00')
df_x['message_date'] = pd.to_datetime(df_x["message_date"], format='%Y-%m-%d')

## Feature dummy

In [6]:
df_x['coronavirus'] = df_x['textlower'].str.contains('corona') & df_x['textlower'].str.contains('virus')
df_x['virus'] = df_x['textlower'].str.contains('virus')
df_x['frontera'] = df_x['textlower'].str.contains('frontera')
df_x['troch'] = df_x['textlower'].str.contains('troch')
df_x['cuarentena'] = df_x['textlower'].str.contains('cuarentena')

# Graph over time

## Select messages 5+ words

In [7]:
df_x['5_word'] = df_x['textlower'].apply(lambda x: x.count(' ') >= 4)

In [8]:
df_y = df_x[df_x['5_word']]

In [9]:
coronavirus = df_y[['message_date', 'coronavirus']].groupby('message_date').mean()
virus = df_y[['message_date', 'virus']].groupby('message_date').mean()
frontera = df_y[['message_date', 'frontera']].groupby('message_date').mean()
troch = df_y[['message_date', 'troch']].groupby('message_date').mean()
cuarentena = df_y[['message_date', 'cuarentena']].groupby('message_date').mean()

In [10]:
fig, ax = plt.subplots(figsize = (10, 6))
ax.plot(coronavirus * 100, label = '"Coronavirus"')
ax.plot(virus * 100, label = '"Virus"')
ax.plot(cuarentena * 100, label = '"Cuarentena"')

plt.xlabel("UTC-5 / Colombia Time")
plt.ylabel("% of 5+ Word Messages")
ax.xaxis.set_major_locator(mdates.DayLocator(interval = 14))
ax.legend()
plt.xticks(horizontalalignment = 'center', rotation = '0')
plt.savefig('images/ch-coronavirus/5word_proportion.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


In [11]:
fig, ax = plt.subplots(figsize = (14, 6))
ax.axvline(x = pd.Timestamp(2020, 3, 13), color = 'r', lw = 3)
ax.plot(coronavirus * 100, label = '"Coronavirus"')
ax.plot(frontera * 100, label = '"Frontera"')
ax.plot(troch * 100, label = '"Troch"')

plt.xlabel("UTC-5 / Colombia Time")
plt.ylabel("% of 5+ Word Messages")
ax.xaxis.set_major_locator(mdates.DayLocator(interval = 14))
ax.legend()
plt.xticks(horizontalalignment = 'center', rotation = '0')
plt.savefig('images/ch-coronavirus/5word_proportion_border.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [12]:
print(df_y[df_y['message_date'] < pd.Timestamp(2020, 3, 13)]['troch'].mean())
print(df_y[df_y['message_date'] < pd.Timestamp(2020, 3, 13)]['frontera'].mean())


0.002578471482105408
0.012239144635060336


### t-test

In [19]:
outside = df_y[(df_y['message_date'] < pd.Timestamp(2020, 3, 13)) \
               | (df_y['message_date'] > pd.Timestamp(2020, 3, 15))]
inside = df_y[(df_y['message_date'] >= pd.Timestamp(2020, 3, 13)) \
              & (df_y['message_date'] <= pd.Timestamp(2020, 3, 15))]

print(outside['troch'].mean())
print(inside['troch'].mean())

print(ttest_ind(outside['troch'], inside['troch']))

print(outside['frontera'].mean())
print(inside['frontera'].mean())

print(ttest_ind(outside['frontera'], inside['frontera']))

0.002592307153366496
0.023652694610778444
Ttest_indResult(statistic=-19.398686793340286, pvalue=1.4232135978179732e-83)
0.012593708400476423
0.05778443113772455
Ttest_indResult(statistic=-20.896377194931382, pvalue=1.271070353178447e-96)


# Group %

In [20]:
virus = df_x[['uid', 'message_date', 'virus']].groupby(['uid', 'message_date']).sum() > 0
coronavirus = df_x[['uid', 'message_date', 'coronavirus']].groupby(['uid', 'message_date']).sum() > 0
frontera = df_x[['uid', 'message_date', 'frontera']].groupby(['uid', 'message_date']).sum() > 0
troch = df_x[['uid', 'message_date', 'troch']].groupby(['uid', 'message_date']).sum() > 0
cuarentena = df_x[['uid', 'message_date', 'cuarentena']].groupby(['uid', 'message_date']).sum() > 0

virus = virus.groupby('message_date').mean()
coronavirus = coronavirus.groupby('message_date').mean()
frontera = frontera.groupby('message_date').mean()
troch = troch.groupby('message_date').mean()
cuarentena = cuarentena.groupby('message_date').mean()


In [28]:
fig, ax = plt.subplots(figsize = (14, 6))
ax.axvline(x = pd.Timestamp(2020, 3, 13), color = 'r', lw = 3)
ax.plot(coronavirus * 100, label = '"Coronavirus"')
ax.plot(frontera * 100, label = '"Frontera"')
ax.plot(troch * 100, label = '"Troch"')

plt.xlabel("UTC-5 / Colombia Time")
plt.ylabel("% of Active Groups with Keyword")
ax.xaxis.set_major_locator(mdates.DayLocator(interval = 14))
ax.legend()
plt.xticks(horizontalalignment = 'center', rotation = '0')
plt.savefig('images/ch-coronavirus/group_proportion.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

# % of user

In [30]:
virus = df_x[['tel', 'message_date', 'virus']].groupby(['tel', 'message_date']).sum() > 0
coronavirus = df_x[['tel', 'message_date', 'coronavirus']].groupby(['tel', 'message_date']).sum() > 0
frontera = df_x[['tel', 'message_date', 'frontera']].groupby(['tel', 'message_date']).sum() > 0
troch = df_x[['tel', 'message_date', 'troch']].groupby(['tel', 'message_date']).sum() > 0
cuarentena = df_x[['tel', 'message_date', 'cuarentena']].groupby(['tel', 'message_date']).sum() > 0

virus = virus.groupby('message_date').mean()
coronavirus = coronavirus.groupby('message_date').mean()
frontera = frontera.groupby('message_date').mean()
troch = troch.groupby('message_date').mean()
cuarentena = cuarentena.groupby('message_date').mean()


In [40]:
fig, ax = plt.subplots(figsize = (14, 6))
ax.axvline(x = pd.Timestamp(2020, 3, 13), color = 'r', lw = 3)
ax.plot(coronavirus * 100, label = '"Coronavirus"')
ax.plot(frontera * 100, label = '"Frontera"')
ax.plot(troch * 100, label = '"Troch"')

plt.xlabel("UTC-5 / Colombia time")
plt.ylabel("% of Active Users\nWho Discuss Keyword")
ax.xaxis.set_major_locator(mdates.DayLocator(interval = 14))
ax.legend()
plt.xticks(horizontalalignment = 'center', rotation = '0')
plt.savefig('images/ch-coronavirus/user_proportion.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

### Users with meaningful text messages

In [43]:
virus = df_y[['tel', 'message_date', 'virus']].groupby(['tel', 'message_date']).sum() > 0
coronavirus = df_y[['tel', 'message_date', 'coronavirus']].groupby(['tel', 'message_date']).sum() > 0
frontera = df_y[['tel', 'message_date', 'frontera']].groupby(['tel', 'message_date']).sum() > 0
troch = df_y[['tel', 'message_date', 'troch']].groupby(['tel', 'message_date']).sum() > 0
cuarentena = df_y[['tel', 'message_date', 'cuarentena']].groupby(['tel', 'message_date']).sum() > 0

virus = virus.groupby('message_date').mean()
coronavirus = coronavirus.groupby('message_date').mean()
frontera = frontera.groupby('message_date').mean()
troch = troch.groupby('message_date').mean()
cuarentena = cuarentena.groupby('message_date').mean()


In [44]:
fig, ax = plt.subplots(figsize = (14, 6))
ax.axvline(x = pd.Timestamp(2020, 3, 13), color = 'r', lw = 3)
ax.plot(coronavirus * 100, label = '"Coronavirus"')
ax.plot(frontera * 100, label = '"Frontera"')
ax.plot(troch * 100, label = '"Troch"')

plt.xlabel("UTC-5 / Colombia time")
plt.ylabel("% of Active Users\nWho Discuss Keyword")
plt.title("Users Who Send 5+ Word Text Messages")
ax.xaxis.set_major_locator(mdates.DayLocator(interval = 14))
ax.legend()
plt.xticks(horizontalalignment = 'center', rotation = '0')
plt.savefig('images/ch-coronavirus/user_proportion_text.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

# Group regressions

In [57]:
df_groups = pd.read_csv('Data/df_groups_nb4c-scam.csv', index_col = 0)

In [58]:
group_troch_pre = df_x[(df_x['message_date'] <= pd.Timestamp(2020, 3, 12)) & df_x['troch']]['uid'].unique()
group_troch_after = df_x[(df_x['message_date'] >= pd.Timestamp(2020, 3, 13)) & df_x['troch']]['uid'].unique()

In [59]:
df_groups['troch_pre'] = False
df_groups.loc[group_troch_pre, 'troch_pre'] = True

df_groups['troch_after'] = 0 # SMF requires numeric
df_groups.loc[group_troch_after, 'troch_after'] = 1

In [60]:
print(df_groups['troch_pre'].sum())
print(df_groups['troch_after'].sum())

16
23


In [62]:
cols = ['Size', 'pVZ', 'pCO', 'pUS',
       'pPE', 'pCL', 'pEC', 'p3rdCountry', 'entropy', 'activity',
       'degree', 'hhConc', 'gini',
       'replies_n', 'virality', 'fakeNews', 'fakeNews_users', 'scam',
       'scam_users', 'troch_pre']

In [63]:
for col in cols:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[col], df_groups['troch_after']))

(0.40918779889670226, 2.0699540250112056e-08)
(0.36960474046093744, 5.189790706303542e-07)
(-0.2050623829257252, 0.0066402129818484935)
(-0.07808289272946727, 0.30577636173002937)
(0.02311428909307101, 0.7620870724780106)
(0.11585469802079935, 0.1279228445579995)
(0.015213566845745763, 0.8420704740303421)
(-0.05883352763086867, 0.44062270726230224)
(0.11759738690393114, 0.12224894858901159)
(0.09769788429728969, 0.19966550029934713)
(0.22236252328880474, 0.0031876955908943473)
(-0.21791991206988387, 0.0038691061777596995)
(0.3272387033781887, 1.046062016735444e-05)
(0.011562075545629635, 0.8796443949325436)
(0.10670979615131151, 0.16107713509011537)
(-0.015495649086381354, 0.8391814956282548)
(0.1508199701860354, 0.04697889863297336)
(-0.14648406053096896, 0.05376224275580397)
(-0.09210918404702881, 0.2267329094751771)
(0.5217573349000768, 1.5527424018677899e-13)


In [66]:
reg = smf.probit('troch_after ~ troch_pre + Size + pVZ + pCO + degree + hhConc + gini', data = df_groups).fit()
print(reg.summary())

Optimization terminated successfully.
         Current function value: 0.198750
         Iterations 9
                          Probit Regression Results                           
Dep. Variable:            troch_after   No. Observations:                  174
Model:                         Probit   Df Residuals:                      166
Method:                           MLE   Df Model:                            7
Date:                Sun, 12 Apr 2020   Pseudo R-squ.:                  0.4911
Time:                        22:46:24   Log-Likelihood:                -34.582
converged:                       True   LL-Null:                       -67.950
Covariance Type:            nonrobust   LLR p-value:                 6.732e-12
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -4.2230      1.119     -3.775      0.000      -6.416      -2.030
troch_pr

In [70]:
print(reg.get_margeff().summary())

       Probit Marginal Effects       
Dep. Variable:            troch_after
Method:                          dydx
At:                           overall
                       dy/dx    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
troch_pre[T.True]     0.1701      0.047      3.611      0.000       0.078       0.262
Size                  0.0004      0.000      1.479      0.139      -0.000       0.001
pVZ                   0.3668      0.099      3.717      0.000       0.173       0.560
pCO                   0.0984      0.078      1.255      0.209      -0.055       0.252
degree                0.0001      0.001      0.200      0.841      -0.001       0.001
hhConc               -0.1135      0.129     -0.877      0.380      -0.367       0.140
gini                  0.2628      0.143      1.836      0.066      -0.018       0.543
