In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
import ast # string rep. of list to list

In [3]:
import statsmodels.formula.api as smf
import scipy.stats
from scipy.stats import ttest_ind

In [4]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [5]:
df_x = pd.read_csv('Data/df_x_nb3-reply.csv', index_col=0)
print(df_x.shape)

(171634, 30)


In [6]:
df_x['reply_link'] = df_x['reply_link'].astype(pd.Int32Dtype())
df_x['reply_list'] = df_x['reply_list'].apply(lambda x: ast.literal_eval(x))

In [7]:
df_x['message_dt'] = pd.to_datetime(df_x["message_dt"], format='%Y-%m-%d %H:%M:00')
df_x['message_date'] = pd.to_datetime(df_x["message_date"], format='%Y-%m-%d')

# Graph construction

In [8]:
adjacency_matrix = {}
root_connected = {}
root_diameter = {}
root_virality = {}

In [9]:
df_r = df_x[df_x['reply_link'].notnull()]

In [10]:
for index, row in df_r.iterrows():
    # construct graph
    adjacency_matrix[index] = [row['reply_link']]
    adjacency_matrix[row['reply_link']] = adjacency_matrix.get(row['reply_link'], []) + [index]
    
    # attribute to root
    root = row['reply_list'][0]
    root_connected[root] = root_connected.get(root, []) + [index]

In [11]:
for root in root_connected:
    root_connected[root] += [root]

In [12]:
for root in root_connected:
    components = root_connected[root]
    n = len(components)
    dist_matrix = np.zeros((n,n))
    for i in range(n-1):
        # BFS
        discovered = [False] * n
        distances = [0] * n
        
        # Label start
        queue = [i]
        discovered[i] = True
                
        while len(queue) > 0:
            vi = queue.pop(0)
                        
            neighbors_i = [components.index(x) for x in adjacency_matrix[components[vi]]]
            for ni in neighbors_i:
                if not discovered[ni]:
                    discovered[ni] = True
                    queue.append(ni)
                    distances[ni] = distances[vi] + 1
        dist_matrix[i] = distances
        dist_matrix[:, i] = distances
        
    root_diameter[root] = np.max(dist_matrix)
    root_virality[root] = np.mean(dist_matrix)

In [13]:
df_r['virality'] = df_r['reply_list'].apply(lambda x: root_virality[x[0]])
df_r['diameter'] = df_r['reply_list'].apply(lambda x: root_diameter[x[0]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [14]:
df_x['virality'] = df_r['virality']
df_x['virality'] = df_x['virality'].fillna(0)

In [15]:
root_i = (df_x['virality'] == 0) & (df_x['replies_n'] > 0)
df_x.loc[root_i, 'virality'] =\
    df_x.loc[root_i].index.map(lambda x: root_virality[x])

In [16]:
df_x.to_csv('Data/df_x_nb3b-virality.csv')

In [17]:
df_x['diameter'] = df_r['diameter']
df_x['diameter'] = df_x['diameter'].fillna(0)

df_x.loc[root_i, 'diameter'] =\
    df_x.loc[root_i].index.map(lambda x: root_diameter[x])

# Overview - Virality

In [18]:
df_x[(df_x['replies_n'] > 0) & (df_x['reply_link'].isnull())]['virality'].hist(bins = 30)
plt.xlabel("Virality")
plt.ylabel("# of Cascades")
plt.title("Unique Reply Cascades")
plt.savefig('images/ch-replycascades/hist_virality.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [19]:
plt.scatter(df_x[(df_x['replies_n'] > 0) & (df_x['reply_link'].isnull())]['replies_n'],
             df_x[(df_x['replies_n'] > 0) & (df_x['reply_link'].isnull())]['virality'], alpha = 0.3)
plt.xlabel("# of Replies")
plt.ylabel("Virality")
plt.title("Root Nodes")
plt.savefig('images/ch-replycascades/scatter_root_repliesn_virality.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

#### Images actually more viral

In [20]:
ttest_ind(df_x[df_x['image'].notnull() & (df_x['virality'] > 0)]['virality'],
            df_x[(df_x['text'] != '') & (df_x['virality'] > 0)]['virality'])

Ttest_indResult(statistic=15.34552101877468, pvalue=4.5857007455330665e-53)

In [21]:
# Images go more viral!
print(np.mean(df_x[df_x['image'].notnull() & (df_x['virality'] > 0)]['virality']))
print(np.mean(df_x[(df_x['text'] != '') & (df_x['virality'] > 0)]['virality']))


1.7142182236946955
1.5019615678654146


#### VZ actually more viral

In [22]:
ttest_ind(df_x[(df_x['virality'] > 0) & (df_x['tel'].str.startswith('+57'))]['virality'],
            df_x[(df_x['virality'] > 0) & (df_x['tel'].str.startswith('+58'))]['virality'])

Ttest_indResult(statistic=-6.592070354761768, pvalue=4.386280849302184e-11)

In [23]:
# Closer, which makes sense!
print(np.mean(df_x[(df_x['virality'] > 0) & (df_x['tel'].str.startswith('+57'))]['virality']))
print(np.mean(df_x[(df_x['virality'] > 0) & (df_x['tel'].str.startswith('+58'))]['virality']))


1.44997091549334
1.5357922108782354


# Diameter

In [24]:
plt.scatter(df_x['diameter'], df_x['virality'])
plt.xlabel("Diameter")
plt.ylabel("Virality")
plt.title("All Messages")
plt.savefig('images/ch-replycascades/scatter_diameter_virality.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

# Groups

In [25]:
df_groups = pd.read_csv('Data/df_groups_nb3a-reply.csv', index_col = 0)

In [26]:
df_groups['virality'] = df_x[df_x['virality'] > 0][['uid', 'virality']].groupby('uid').mean()
df_groups['virality'] = df_groups['virality'].fillna(0)

In [27]:
df_groups.to_csv('Data/df_groups_nb3b-virality.csv')

In [28]:
df_groups['virality'].hist(bins = 20)
plt.title("All Groups")
plt.xlabel("Avg. Virality\n(Within Cascades)")
plt.ylabel("# of Groups")
plt.savefig('images/ch-replycascades/hist_group_virality.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [29]:
plt.scatter(df_groups['replies_n'], df_groups['virality'], alpha = 0.3)
plt.xlabel("Avg. Number of Replies")
plt.ylabel("Avg. Virality\n(Within Cascades)")
plt.title("All Groups")
plt.savefig('images/ch-replycascades/scatter_group_repliesn_virality.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(scipy.stats.pearsonr(df_groups['replies_n'], df_groups['virality']))

(0.9038850128124559, 2.4889221316363124e-65)


In [30]:
reg = smf.ols('virality ~ entropy + hhConc + gini',
              data = df_groups).fit()
print(reg.summary())
# strong in HHCONC

                            OLS Regression Results                            
Dep. Variable:               virality   R-squared:                       0.474
Model:                            OLS   Adj. R-squared:                  0.465
Method:                 Least Squares   F-statistic:                     51.14
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           1.31e-23
Time:                        23:52:29   Log-Likelihood:                -119.30
No. Observations:                 174   AIC:                             246.6
Df Residuals:                     170   BIC:                             259.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5648      0.147      3.830      0.0

In [31]:
reg = smf.ols('virality ~ entropy + hhConc + gini',
              data = df_groups[df_groups['replies_n'] > 0]).fit()
print(reg.summary())
# strong in HHCONC

                            OLS Regression Results                            
Dep. Variable:               virality   R-squared:                       0.246
Model:                            OLS   Adj. R-squared:                  0.227
Method:                 Least Squares   F-statistic:                     13.05
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           1.97e-07
Time:                        23:52:31   Log-Likelihood:                -84.366
No. Observations:                 124   AIC:                             176.7
Df Residuals:                     120   BIC:                             188.0
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.7623      0.183      4.177      0.0

# Temporal properties of reply cascade

In [32]:
df_r['root'] = df_r['reply_list'].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [33]:
root_df = pd.DataFrame(df_x.loc[df_r['root'].values]['message_dt'])
root_df.columns = ['first_dt']

In [34]:
root_df['last_dt'] = df_r[['message_dt', 'root']].groupby('root').last()
root_df['timespan'] = (root_df['last_dt'] - root_df['first_dt']).dt.total_seconds()
root_df['timespan'] = root_df['timespan'] / 3600

In [35]:
root_df['virality'] = root_df.index.map(lambda x: root_virality[x])
root_df['diameter'] = root_df.index.map(lambda x: root_diameter[x])
root_df['size'] = root_df.index.map(lambda x: len(root_connected[x]))

### Size

In [36]:
x = root_df['timespan']
y = root_df['size']
plt.figure(figsize = (12, 4))
plt.scatter(x, y, alpha = 0.1)
plt.xlabel("Cascade Duration (hours)")
plt.ylabel("Size of Cascade")
plt.title("All Reply Cascades")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, color = 'orange')
plt.savefig('images/ch-replycascades/temporal_size.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(m)
print(scipy.stats.pearsonr(x, y))

0.017917869017981484
(0.042380158229362426, 6.410351086646129e-19)


In [37]:
x = root_df[root_df['timespan'] < 12]['timespan']
y = root_df[root_df['timespan'] < 12]['size']
plt.figure(figsize = (12, 4))
plt.scatter(x, y, alpha = 0.1)
plt.xlabel("Cascade Duration (hours)")
plt.ylabel("Size of Cascade")
plt.title("Cascade with Duration $< 12$ Hours")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, color = 'orange')
plt.savefig('images/ch-replycascades/temporal_size_12h.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(m)
print(scipy.stats.pearsonr(x, y))

0.5791698020060221
(0.15102853858327142, 1.718602979510054e-215)


In [38]:
x = root_df['timespan']
y = root_df['virality']
plt.figure(figsize = (12, 4))
plt.scatter(x, y, alpha = 0.1)
plt.xlabel("Cascade Duration (hours)")
plt.ylabel("Virality")
plt.title("All Reply Cascades")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, color = 'orange')
plt.savefig('images/ch-replycascades/temporal_virality.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(m)
print(scipy.stats.pearsonr(x, y))

0.0031555859136958853
(0.03413200616754636, 8.404636191298922e-13)


In [39]:
x = root_df[root_df['timespan'] < 12]['timespan']
y = root_df[root_df['timespan'] < 12]['virality']
plt.figure(figsize = (12, 4))
plt.scatter(x, y, alpha = 0.1)
plt.xlabel("Cascade Duration (hours)")
plt.ylabel("Virality")
plt.title("Cascade with Duration $< 12$ Hours")
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b, color = 'orange')
plt.savefig('images/ch-replycascades/temporal_virality_12h.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(m)
print(scipy.stats.pearsonr(x, y))

0.10223997515601098
(0.12065778090466422, 1.0379210282492673e-137)
