In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
import matplotlib.pyplot as plt
# Font-size presets; only BIGGER_SIZE is applied below.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 8, 10, 20

# Use the same size for every text element of a figure.
plt.rc('font', size=BIGGER_SIZE)                               # default text
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=BIGGER_SIZE)   # axes title and x/y labels
for tick_group in ('xtick', 'ytick'):                          # tick labels on both axes
    plt.rc(tick_group, labelsize=BIGGER_SIZE)
plt.rc('legend', fontsize=BIGGER_SIZE)                         # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)                        # figure-level title

In [3]:
import statsmodels.formula.api as smf
from scipy.stats import ttest_ind
from collections import Counter

In [4]:
# Load the message-level table; the first CSV column becomes the row index.
df_x = pd.read_csv("Data/df_x_nb0a-groupMerge.csv", index_col = 0)
# Sanity check on (rows, columns).
print(df_x.shape)

(171634, 26)


In [5]:
# Replace missing text/media fields with '' so the string operations used
# later (split, concatenation, substring search) never encounter NaN.
for media_col in ('textlower', 'image', 'video_thumb', 'video_length', 'audio_length'):
    df_x[media_col] = df_x[media_col].fillna('')

In [6]:
# Parse timestamps into datetimes. The format string contains a literal "00"
# in the seconds position, so every value must end in ":00" to parse.
df_x['message_dt'] = pd.to_datetime(df_x["message_dt"], format='%Y-%m-%d %H:%M:00')

In [7]:
# Load the group-level table; the first CSV column becomes the row index.
# NOTE(review): later cells look groups up by `uid`, so the index is presumably the group uid - confirm.
df_groups = pd.read_csv('Data/df_groups_nb2b-concentration.csv', index_col = 0)

# Repeated images

In [8]:
# Split the pipe-delimited `image` field into a per-message list of image ids,
# dropping the empty strings produced by blank fields.
df_x['image_list'] = df_x['image'].apply(lambda x: [i for i in x.split('|') if i != ''])

# Flatten all per-message lists into one list, preserving row order.
# A single-pass comprehension replaces `Series.sum()`, which concatenates the
# lists pairwise (quadratic in the number of images) and yields 0 rather than
# [] when the frame is empty.
all_images = [image_id for image_ids in df_x['image_list'] for image_id in image_ids]

In [9]:
# Count how many times each image id occurs across all messages.
image_counter = Counter(all_images)

In [10]:
# One row per distinct image id (index) with its occurrence count in `n_shares`.
image_shares = pd.DataFrame.from_dict(image_counter, orient='index', columns=['n_shares'])

In [11]:
# Number of distinct image ids.
print(len(image_shares))

23131


### n_shares

In [12]:
# Histogram of share counts, restricted to images that occur at least twice.
image_shares.loc[image_shares['n_shares'] >= 2, 'n_shares'].hist(bins=40)
plt.xlabel("Number of Shares")
plt.ylabel("# of Images")
plt.title("Images Shared Twice or More")
# Write the figure to disk, then close it so it is not rendered inline.
plt.savefig(
    'images/ch-messages/hist_image_nshare.png',
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close()

In [13]:
# Fraction of images that occur exactly once.
print(np.mean(image_shares['n_shares'] == 1))
# Number of images that occur exactly once / more than once.
print(np.sum(image_shares['n_shares'] == 1))
print(np.sum(image_shares['n_shares'] > 1))
print()
# Number of images that occur exactly twice.
print(np.sum(image_shares['n_shares'] == 2))
# Fraction of images that occur at most five times.
print(np.mean(image_shares['n_shares'] <= 5))

0.7534045220699495
17427
5704

3149
0.9632527776576888


### Images shared over 5 times

In [14]:
# Groups (by `uid`) that posted any image shared more than 5 times, and
# groups that were the first to post such an image.
groups_anyshare = set()
groups_firstshare = set()

for image in image_shares[image_shares['n_shares'] > 5].index:
    # Scan df_x once per image (the original scanned twice) and match the id
    # as a literal substring rather than as a regular expression.
    sharing_uids = df_x.loc[df_x['image'].str.contains(image, regex=False), 'uid']
    groups_anyshare.update(sharing_uids.unique())
    # "First" means first in df_x row order.
    # NOTE(review): this is chronological only if df_x is sorted by message_dt - confirm.
    groups_firstshare.add(sharing_uids.iloc[0])

In [15]:
# Number of images shared more than 5 times.
print(image_shares[image_shares['n_shares'] > 5].shape[0])
# Number of groups that posted any such image / that were first to post one.
print(len(groups_anyshare))
print(len(groups_firstshare))
# Total number of groups in the group-level table.
print(df_groups.shape[0])

850
96
66
174


In [16]:
# Inspect the columns available in the group-level table.
df_groups.columns

Index(['+52', '+55', '+57', '+58', '+34', '+1', '+263', '+27', '+381', '+505',
       '+506', '+51', '+54', '+591', '+593', '+595', '+91', '+92', '+98',
       '+56', '+39', '+53', '+229', '+502', '+592', '+599', '+504', '+32',
       '+33', '+44', '+351', '+49', '+20', '+212', '+213', '+241', '+966',
       '+967', '+971', '+503', '+40', '+297', '+41', '+507', '+597', '+598',
       '+46', '+254', '+258', '+240', '+244', 'Size', 'pVZ', 'pCO', 'pUS',
       'pPE', 'pCL', 'pEC', 'p3rdCountry', 'entropy', 'degree', 'activity',
       'hhConc', 'gini'],
      dtype='object')

In [17]:
# Group-level statistics compared between sharing and non-sharing groups.
stat_cols = [
    'Size',
    'pVZ', 'pCO', 'pUS', 'pPE', 'pCL', 'pEC', 'p3rdCountry',
    'entropy', 'degree', 'activity', 'hhConc', 'gini',
]

In [18]:
# Statistics for groups that posted / first-posted a widely shared image ...
df_groups_anyshare = df_groups.loc[df_groups.index.isin(groups_anyshare), stat_cols]
df_groups_firstshare = df_groups.loc[df_groups.index.isin(groups_firstshare), stat_cols]

# ... and for the complementary sets of groups.
df_groups_notany = df_groups.loc[~df_groups.index.isin(groups_anyshare), stat_cols]
df_groups_notfirst = df_groups.loc[~df_groups.index.isin(groups_firstshare), stat_cols]

In [19]:
# For each statistic: print the two group means, then a two-sample t-test of
# first-sharing groups against all remaining groups.
for col in stat_cols:
    first_vals, rest_vals = df_groups_firstshare[col], df_groups_notfirst[col]
    print(col)
    print(np.mean(first_vals), np.mean(rest_vals))
    print(ttest_ind(first_vals, rest_vals))
    print()

Size
101.31818181818181 15.925925925925926
Ttest_indResult(statistic=9.145797204451101, pvalue=1.6584456919364905e-16)

pVZ
0.21487750230628364 0.1672975121794505
Ttest_indResult(statistic=1.1986664989533455, pvalue=0.23230654199320824)

pCO
0.48509819320461356 0.5371752508768671
Ttest_indResult(statistic=-0.9093799711806654, pvalue=0.36442210894987737)

pUS
0.011506983260118024 0.020107651923990093
Ttest_indResult(statistic=-0.8357691291009817, pvalue=0.4044443073444731)

pPE
0.10871153293093667 0.10849132314832276
Ttest_indResult(statistic=0.006815595052237837, pvalue=0.9945698825880428)

pCL
0.020349459688619766 0.018398305651694997
Ttest_indResult(statistic=0.1733456372208999, pvalue=0.8625835854003021)

pEC
0.033426735079130875 0.046569246545402405
Ttest_indResult(statistic=-0.7274685433636063, pvalue=0.46792803751199274)

p3rdCountry
0.3000243044891029 0.29552723694368244
Ttest_indResult(statistic=0.08936052093843692, pvalue=0.9288993651821302)

entropy
1.04205005447749 0.6097164

In [28]:
# For each statistic: print the two group means, then a two-sample t-test of
# groups that shared any popular image against groups that shared none.
for col in stat_cols:
    sharer_vals, nonsharer_vals = df_groups_anyshare[col], df_groups_notany[col]
    print(col)
    print(np.mean(sharer_vals), np.mean(nonsharer_vals))
    print(ttest_ind(sharer_vals, nonsharer_vals))
    print()

Size
81.6875 7.243589743589744
Ttest_indResult(statistic=7.798695905538668, pvalue=5.754314591774663e-13)

pVZ
0.20977890594875484 0.15527271149378083
Ttest_indResult(statistic=1.409635459144458, pvalue=0.16045292400452468)

pCO
0.5140096618126656 0.5216215424639775
Ttest_indResult(statistic=-0.13591637171199888, pvalue=0.8920463351467443)

pUS
0.012662706695469724 0.021993172566584958
Ttest_indResult(statistic=-0.9297475315308802, pvalue=0.35380484681428337)

pPE
0.101142532829869 0.11772231951016991
Ttest_indResult(statistic=-0.5263742526364362, pvalue=0.5993063012483106)

pCL
0.01751443600321755 0.021137121711834357
Ttest_indResult(statistic=-0.3299518420851185, pvalue=0.7418378599442195)

pEC
0.035657604477248174 0.048878373234747086
Ttest_indResult(statistic=-0.7501252187326267, pvalue=0.45420415397938674)

p3rdCountry
0.27621143223857964 0.32310574604224157
Ttest_indResult(statistic=-0.9575910006556948, pvalue=0.3396130909236412)

entropy
0.943611929192918 0.5645888445850323
Ttes

### Regressing number of shares on group characteristics

In [None]:
def firstGroupAppearImage(image):
    """Return the `uid` of the first row of `df_x` whose `image` field contains `image`.

    The id is matched as a literal substring (`regex=False`); the default
    regex matching would misread any regex metacharacter in an id and is
    slower. "First" means first in `df_x` row order.
    NOTE(review): that is chronological only if df_x is sorted by message_dt - confirm.

    Raises IndexError if no row contains `image`.
    """
    return df_x.loc[df_x.image.str.contains(image, regex=False), 'uid'].iloc[0]

In [None]:
# Look up the first-posting group for every image id.
# Each call to firstGroupAppearImage scans the whole `image` column of df_x.
image_shares['first_group'] = image_shares.index.map(firstGroupAppearImage).values

In [None]:
# Per-statistic lookup tables: df_groups index value -> statistic value.
(
    df_groups_Size,
    df_groups_entropy,
    df_groups_activity,
    df_groups_hhConc,
    df_groups_gini,
    df_groups_degree,
) = (
    df_groups[stat_name].to_dict()
    for stat_name in ('Size', 'entropy', 'activity', 'hhConc', 'gini', 'degree')
)

In [None]:
# Attach the first-posting group's statistics to every image.
# A group missing from a lookup table raises KeyError.
for stat_name, stat_lookup in (
    ('Size', df_groups_Size),
    ('entropy', df_groups_entropy),
    ('activity', df_groups_activity),
    ('hhConc', df_groups_hhConc),
    ('gini', df_groups_gini),
    ('degree', df_groups_degree),
):
    image_shares[stat_name] = image_shares['first_group'].apply(
        lambda x, table=stat_lookup: table[x]
    )

In [24]:
# Persist the per-image table; the re-import cell reads this same path back.
image_shares.to_csv('Data/intermediate/df_image_shares_nb2a-repeated.csv')

#### Re-import data from last run

In [29]:
# Replace the in-memory table with the saved copy; the first CSV column
# (the image id) becomes the index again.
image_shares = pd.read_csv('Data/intermediate/df_image_shares_nb2a-repeated.csv', index_col = 0)

In [30]:
# Display the reloaded per-image table.
image_shares

Unnamed: 0,n_shares,first_group,Size,entropy,activity,hhConc,gini,degree
fbb87cb59de678c12360a168bb8dee34,1,573006401779-1563676942,147.0,0.631187,41.830189,0.053503,0.692626,61.0
5fd916eaa31e2aed75226758d8a12f24,1,573006401779-1563676942,147.0,0.631187,41.830189,0.053503,0.692626,61.0
744598bc2086ce9ec0b9571ef7ec0dcd,1,573137678218-1572620120,12.0,1.234268,1.981132,0.206349,0.592857,4.0
0aacdbf24a5bace94c1d9d54ead90452,2,584247622382-1574092989,1.0,0.000000,1.142857,1.000000,0.000000,3.0
5342d9cdb8197a3013fa61153591e839,1,584143426635-1560226365,285.0,1.344692,163.339623,0.071055,0.792048,123.0
...,...,...,...,...,...,...,...,...
ac6a8726183779f75305d06e4cfab8a2,1,573117575342-1576375028,4.0,1.039721,0.282051,0.388430,0.386364,0.0
fc52b125e10d95f167afa073c30f11b7,1,573227727724-1572791992,23.0,1.398144,22.981132,0.286956,0.737060,6.0
ba445a09ac3ed0524e1c3bb388d626c6,1,584143426635-1560226365,285.0,1.344692,163.339623,0.071055,0.792048,123.0
f1e1cd651c9e39ad073d0ca7994a79f1,2,51955257649-1554235320,73.0,0.980372,129.452830,0.424649,0.907560,2.0


In [31]:
# OLS of share count on first-group characteristics, over all images.
reg = smf.ols(
    formula='n_shares ~ Size + entropy + degree + activity + hhConc + gini',
    data=image_shares,
).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               n_shares   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     81.95
Date:                Tue, 14 Apr 2020   Prob (F-statistic):          6.47e-102
Time:                        21:06:19   Log-Likelihood:                -58006.
No. Observations:               23131   AIC:                         1.160e+05
Df Residuals:                   23124   BIC:                         1.161e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.4880      0.183      8.129      0.0

In [33]:
# Same model, restricted to images that occur more than once.
reg = smf.ols(
    formula='n_shares ~ Size + entropy + degree + activity + hhConc + gini',
    data=image_shares.loc[image_shares['n_shares'] > 1],
).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               n_shares   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     31.73
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           9.44e-38
Time:                        21:09:31   Log-Likelihood:                -17632.
No. Observations:                5704   AIC:                         3.528e+04
Df Residuals:                    5697   BIC:                         3.532e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.2665      0.668      4.889      0.0

### Image share time

In [34]:
def image_shareTimerange(image):
    """Return the seconds between the earliest and latest message containing `image`.

    Uses max - min of `message_dt` instead of last-row minus first-row, so the
    result does not depend on the row order of `df_x` and is never negative.
    The id is matched as a literal substring (`regex=False`).
    """
    times = df_x.loc[df_x['image'].str.contains(image, regex=False), 'message_dt']
    return (times.max() - times.min()).total_seconds()

In [35]:
# Compute the time range (seconds) only for images that occur more than once.
repeated_image = image_shares['n_shares'] > 1
image_shares.loc[repeated_image, 'timerange'] = (
    image_shares.loc[repeated_image].index.map(image_shareTimerange)
)
# Convert seconds to hours; images without a computed range get 0.
image_shares['timerange'] = (image_shares['timerange'] / 3600).fillna(0)

In [36]:
# Histogram of strictly positive time ranges (in hours).
plt.figure()
image_shares.loc[image_shares['timerange'] > 0, 'timerange'].hist(bins=50)
plt.xlabel("Share Time Range (Hours)")
plt.ylabel("# of Images")
plt.title("Images Shared Twice or More")
# Write the figure to disk, then close it so it is not rendered inline.
plt.savefig(
    'images/ch-messages/hist_image_timerange.png',
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close()

In [37]:
# OLS of time range (hours) on first-group characteristics, over all images.
# Images without a computed range enter with timerange == 0.
reg = smf.ols(
    formula='timerange ~ Size + entropy + degree + activity + hhConc + gini',
    data=image_shares,
).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:              timerange   R-squared:                       0.036
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     142.9
Date:                Tue, 14 Apr 2020   Prob (F-statistic):          1.40e-178
Time:                        21:23:02   Log-Likelihood:            -1.5509e+05
No. Observations:               23131   AIC:                         3.102e+05
Df Residuals:                   23124   BIC:                         3.102e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     83.3603     12.170      6.850      0.0

In [38]:
# Same model, restricted to images with a strictly positive time range.
reg = smf.ols(
    formula='timerange ~ Size + entropy + degree + activity + hhConc + gini',
    data=image_shares.loc[image_shares['timerange'] > 0],
).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:              timerange   R-squared:                       0.108
Model:                            OLS   Adj. R-squared:                  0.107
Method:                 Least Squares   F-statistic:                     89.76
Date:                Tue, 14 Apr 2020   Prob (F-statistic):          1.14e-106
Time:                        21:23:03   Log-Likelihood:                -32580.
No. Observations:                4462   AIC:                         6.517e+04
Df Residuals:                    4455   BIC:                         6.522e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    655.5457     54.962     11.927      0.0

# Repeated text

In [39]:
# Count identical lower-cased message texts.
text_counter = Counter(df_x['textlower'])
# One row per distinct text, without the '' bucket used for missing text.
text_shares = pd.DataFrame.from_dict(
    text_counter, orient='index', columns=['n_shares']
).drop('')
# Keep only texts of at least 20 characters.
text_shares = text_shares[text_shares.index.str.len() >= 20]

In [40]:
# Fraction of texts that occur exactly once, at most twice, at most five times.
print(np.mean(text_shares['n_shares'] == 1))
print(np.mean(text_shares['n_shares'] <= 2))
print(np.mean(text_shares['n_shares'] <= 5))

0.9362971925636456
0.9819323402933338
0.997661832273255


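Not interesting: about 94% of texts with at least 20 characters appear only once and 99.8% appear at most five times, so repeated text is too rare to analyze further.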

# Video

In [41]:
# A video is identified by its thumbnail value concatenated with its length.
video_counter = Counter(df_x['video_thumb'] + df_x['video_length'])
# One row per distinct video key, without the '' bucket from rows whose
# thumbnail and length are both blank.
video_shares = pd.DataFrame.from_dict(
    video_counter, orient='index', columns=['n_shares']
).drop('')

In [42]:
# Fraction of videos that occur exactly once, exactly twice, at most five times.
print(np.mean(video_shares['n_shares'] == 1))
print(np.mean(video_shares['n_shares'] == 2))
print(np.mean(video_shares['n_shares'] <= 5))

0.895507172504187
0.08439525231194932
0.9989077404791379


In [43]:
# Number of distinct video keys.
print(len(video_shares))

13733


### Identify first group and characteristics

In [None]:
def firstGroupAppearVideo(videoLength):
    """Return the `uid` of the first row of `df_x` whose video key equals `videoLength`.

    The video key is `video_thumb` concatenated with `video_length`; "first"
    means first in `df_x` row order. Raises IndexError if nothing matches.
    """
    video_key = df_x['video_thumb'] + df_x['video_length']
    matching_uids = df_x.loc[video_key == videoLength, 'uid']
    return matching_uids.iloc[0]

In [None]:
# Look up the first-posting group for every video key.
# Each call to firstGroupAppearVideo rebuilds the key column and scans df_x.
video_shares['first_group'] = video_shares.index.map(firstGroupAppearVideo).values

In [None]:
# Attach the first-posting group's statistics to every video.
# A group missing from a lookup table raises KeyError.
video_shares['Size'] = video_shares['first_group'].apply(lambda x: df_groups_Size[x])
video_shares['entropy'] = video_shares['first_group'].apply(lambda x: df_groups_entropy[x])
video_shares['activity'] = video_shares['first_group'].apply(lambda x: df_groups_activity[x])
video_shares['hhConc'] = video_shares['first_group'].apply(lambda x: df_groups_hhConc[x])
video_shares['gini'] = video_shares['first_group'].apply(lambda x: df_groups_gini[x])
# `degree` was missing here although the video regressions use it as a
# regressor; without it they fail when run from a fresh kernel.
video_shares['degree'] = video_shares['first_group'].apply(lambda x: df_groups_degree[x])

In [63]:
# Persist the per-video table; the import cell reads this same path back.
video_shares.to_csv('Data/intermediate/df_video_shares_nb2a-repeated.csv')

#### Import from previous

In [64]:
# Replace the in-memory table with the saved copy; the first CSV column
# (the video key) becomes the index again.
video_shares = pd.read_csv('Data/intermediate/df_video_shares_nb2a-repeated.csv', index_col = 0)

In [65]:
# Display the reloaded per-video table.
video_shares

Unnamed: 0,n_shares,first_group,Size,entropy,activity,hhConc,gini,degree
2cc65b0ac2b7854d4b65b12d16958c7f0:43,4,573106499958-1529031886,26.0,0.935182,3.283019,0.236821,0.683024,19.0
fe1416b2ef8f01ff6073dca335dc134b0:25,1,584164805353-1497744666,30.0,0.146145,26.471698,0.467487,0.886315,0.0
e25deed8a674d970b1e51eb79daf695b1:46,1,573145114610-1573649691,262.0,0.871355,97.339623,0.026352,0.736434,4.0
e178f7dda6796426baf3315427f0a7be0:30,1,e50fd0133fda35e4709d62c81059381e,26.0,1.864746,405.000000,0.101003,0.616619,26.0
dd874905ce1b61610c0900221e5f578d0:29,1,e50fd0133fda35e4709d62c81059381e,26.0,1.864746,405.000000,0.101003,0.616619,26.0
...,...,...,...,...,...,...,...,...
eee59856d15f93091c83400f7a42609e0:30,1,573006632902-1573570307,164.0,0.392869,44.396226,0.030326,0.687900,0.0
55e298941ab7b28ed5c1e62a790162411:28,1,573006632902-1573570307,164.0,0.392869,44.396226,0.030326,0.687900,0.0
ca8748ca862d722739c7ff9f648d77d40:28,1,584143426635-1560226365,285.0,1.344692,163.339623,0.071055,0.792048,123.0
1eb97c98c42cb6b4485ce854f8b913812:20,2,51955257649-1554235320,73.0,0.980372,129.452830,0.424649,0.907560,2.0


## Regression

In [66]:
# OLS of share count on first-group characteristics, for videos that occur
# more than once.
reg = smf.ols(
    formula='n_shares ~ Size + entropy + degree + activity + hhConc + gini',
    data=video_shares.loc[video_shares['n_shares'] > 1],
).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               n_shares   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                  0.012
Method:                 Least Squares   F-statistic:                     3.962
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           0.000617
Time:                        21:33:01   Log-Likelihood:                -1653.9
No. Observations:                1435   AIC:                             3322.
Df Residuals:                    1428   BIC:                             3359.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.5046      0.216     11.591      0.0

### Time range

In [67]:
def video_shareTimerange(videoLength):
    """Return the seconds between the earliest and latest message carrying the video.

    `videoLength` is the video key: `video_thumb` concatenated with
    `video_length`. Uses max - min of `message_dt` instead of last-row minus
    first-row, so the result does not depend on the row order of `df_x` and
    is never negative.
    """
    times = df_x.loc[(df_x['video_thumb'] + df_x['video_length'] == videoLength), 'message_dt']
    return (times.max() - times.min()).total_seconds()

In [68]:
# Compute the time range (seconds) only for videos that occur more than once.
repeated_video = video_shares['n_shares'] > 1
video_shares.loc[repeated_video, 'timerange'] = (
    video_shares.loc[repeated_video].index.map(video_shareTimerange)
)
# Convert seconds to hours; videos without a computed range get 0.
video_shares['timerange'] = (video_shares['timerange'] / 3600).fillna(0)

In [72]:
# OLS of time range (hours) on first-group characteristics, for videos that
# occur more than once.
reg = smf.ols(
    formula='timerange ~ Size + entropy + degree + activity + hhConc + gini',
    data=video_shares.loc[video_shares['n_shares'] > 1],
).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:              timerange   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     11.27
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           2.49e-12
Time:                        21:36:57   Log-Likelihood:                -9227.9
No. Observations:                1435   AIC:                         1.847e+04
Df Residuals:                    1428   BIC:                         1.851e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     73.5493     42.349      1.737      0.0