In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
from collections import Counter

In [3]:
import matplotlib.pyplot as plt
# Font-size presets; only BIGGER_SIZE is applied to the rc settings below.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 8, 10, 20

# (rc group, rc key) pairs that all receive the same font size.
for rc_group, rc_key in [
    ('font', 'size'),         # default text size
    ('axes', 'titlesize'),    # axes title
    ('axes', 'labelsize'),    # x and y labels
    ('xtick', 'labelsize'),   # x tick labels
    ('ytick', 'labelsize'),   # y tick labels
    ('legend', 'fontsize'),   # legend text
    ('figure', 'titlesize'),  # figure title
]:
    plt.rc(rc_group, **{rc_key: BIGGER_SIZE})

In [4]:
# Create the directories the figures are saved into. exist_ok=True replaces the
# separate exists() check, so re-running the cell is a no-op.
for out_dir in ('images/ch-members/active/', 'images/ch-appendicies/members/'):
    os.makedirs(out_dir, exist_ok=True)

In [5]:
import statsmodels.formula.api as smf
import scipy.stats
from scipy.stats import ttest_ind
from scipy.stats import f_oneway

In [6]:
import networkx as nx

# Read in

In [7]:
# Load the merged input table; the first CSV column is used as the row index.
df_x = pd.read_csv("Data/df_x_nb0a-groupMerge.csv", index_col = 0)
print(df_x.shape)

(171634, 26)


### Skip this processing

In [None]:
# Series indexed by uid whose values are arrays of the distinct tel values seen for that uid.
# (uid appears to identify a group and tel a member's phone number -- confirm.)
group_members = df_x[['uid', 'tel']].groupby('uid')['tel'].unique()

In [None]:
# Accumulate two tables over all groups:
#   df_interact - member x member matrix counting how many groups each pair shares,
#                 plus a 'Group' column counting the groups each member belongs to;
#   df_groups   - one row per group with the number of members per country code.
df_interact = pd.DataFrame()
country_counts = []

for member_list in group_members:
    member_list = list(member_list)
    # All-ones block for this group. A scalar-filled frame is numeric from the start,
    # instead of relying on fillna() to downcast an empty object-dtype frame.
    df_new = pd.DataFrame(1, index = member_list, columns = member_list + ['Group'])

    df_interact = df_interact.add(df_new, fill_value = 0)

    # The country code is the text before the first space of each number.
    country_codes = [m.split(' ')[0] for m in member_list]
    country_counts.append(Counter(country_codes))

# DataFrame.append was removed in pandas 2.0 (and was quadratic inside a loop);
# build the frame once from the collected per-group counters instead.
df_groups = pd.DataFrame(country_counts)

In [None]:
# Label the per-group rows with the group ids (same order as the loop over group_members).
df_groups.index = group_members.index
# Country codes absent from a group, and member pairs that never share a group, count as 0.
df_groups = df_groups.fillna(0)
df_interact = df_interact.fillna(0)

In [None]:
# Persist both tables so later runs can skip the slow accumulation loop.
# Ensure the target directory exists; to_csv does not create it.
os.makedirs('Data/intermediate', exist_ok=True)
df_groups.to_csv('Data/intermediate/df_groups_nb1-members-byMessaging.csv')
df_interact.to_csv('Data/intermediate/df_interact_nb1-members-byMessaging.csv')

### Reload from saved output

In [8]:
# Re-import from first run
# Both files are the ones written by the to_csv cell above; column 0 restores the row index.
df_groups = pd.read_csv('Data/intermediate/df_groups_nb1-members-byMessaging.csv', index_col = 0)
df_interact = pd.read_csv('Data/intermediate/df_interact_nb1-members-byMessaging.csv', index_col = 0)

# Group size

In [9]:
print("Group/interact shapes:")
# Groups table first, then the member-by-member interaction table.
for frame in (df_groups, df_interact):
    print(frame.shape)

Group/interact shapes:
(174, 51)
(7860, 7861)


In [10]:
# Per-country count columns are the ones whose name begins with '+'.
count_cols = [name for name in df_groups.columns if name[0] == '+']
# Group size = total members summed over all country-code columns.
df_groups["Size"] = df_groups.loc[:, count_cols].sum(axis=1)

In [11]:
print("Group sizes <=10, <=20, >=150")
group_sizes = df_groups['Size']
# The mean of a boolean mask is the share of groups satisfying the condition.
for size_mask in (group_sizes <= 10, group_sizes <= 20, group_sizes >= 150):
    print(np.mean(size_mask))

Group sizes <=10, <=20, >=150
0.4425287356321839
0.5862068965517241
0.09770114942528736


In [12]:
# Histogram of members per group (20 bins over 0-256)

fig, ax = plt.subplots()
ax.hist(df_groups["Size"], range=(0, 256), bins=20)
ax.set_xlabel("# of Members in Group")
ax.set_ylabel("# of Groups")
fig.savefig('images/ch-members/active/hist_size.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [13]:
# distribution of groups/user
# The 'Group' column of df_interact holds, per member, the number of groups they belong to.
print("Groups/user value counts:")
print(df_interact["Group"].value_counts())

Groups/user value counts:
1.0    7377
2.0     434
3.0      36
4.0      11
5.0       2
Name: Group, dtype: int64


# Country breakdown

In [14]:
# Share of each group's members per country code (proportion column -> count column).
for prop_col, code_col in {
    "pVZ": "+58",
    "pCO": "+57",
    "pUS": "+1",
    "pPE": "+51",
    "pCL": "+56",
    "pEC": "+593",
}.items():
    df_groups[prop_col] = df_groups[code_col] / df_groups["Size"]

# Everything that is neither +58 nor +57.
df_groups['p3rdCountry'] = 1 - (df_groups['pVZ'] + df_groups['pCO'])

In [15]:
# distribution of pCountry in various groups
# APPENDIX

# Ensure the appendix output directory exists before saving into it.
os.makedirs('images/ch-appendicies/members/', exist_ok=True)

for col in ["pVZ", "pCO", "pUS", "pPE", "pCL", "pEC", "p3rdCountry"]:
    # col[1:] strips the leading 'p' to get the label used in the axis text and file name.
    plt.hist(100 * df_groups[col], range = (0,100), bins = 16)
    plt.xlabel("%% of Group Members from %s" % col[1:])
    plt.ylabel("# of Groups")
    plt.savefig('images/ch-appendicies/members/prop_%s.png' % col[1:], bbox_inches = 'tight', pad_inches = 0.05)
    plt.close()

In [16]:
# Per-group share of +58 (VZ) members against share of +57 (CO) members, in percent

fig, ax = plt.subplots()
ax.scatter(100 * df_groups["pVZ"], 100 * df_groups["pCO"], alpha=0.5)
ax.set_xlabel("% VZ in Group")
ax.set_ylabel("% CO in Group")

fig.savefig('images/ch-members/active/pVZ_vs_pCO.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [17]:
# Size distribution restricted to groups whose VZ share exceeds their CO share.
more_vz = df_groups["pVZ"] > df_groups["pCO"]
fig, ax = plt.subplots()
ax.hist(df_groups.loc[more_vz, "Size"], range=(0, 256), bins=20)
ax.set_xlabel("Size of Group")
ax.set_ylabel("Number of Groups")
ax.set_title("Groups where %VZ > %CO")
fig.savefig('images/ch-members/active/hist_size_moreVZ.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [18]:
# Size distribution restricted to groups whose CO share exceeds their VZ share.
more_co = df_groups["pVZ"] < df_groups["pCO"]
fig, ax = plt.subplots()
ax.hist(df_groups.loc[more_co, "Size"], range=(0, 256), bins=20)
ax.set_xlabel("Size of Group")
ax.set_ylabel("Number of Groups")
ax.set_title("Groups where %VZ < %CO")
fig.savefig('images/ch-members/active/hist_size_moreCO.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

## Who's connected to whom

In [19]:
# For every member (row), sum the interaction columns belonging to each country prefix.
# The trailing space keeps e.g. '+5' from matching '+57 ...'.
tel_prefix_by_label = {
    "CO": "+57 ",
    "VZ": "+58 ",
    "PE": "+51 ",
    "US": "+1 ",
    "BR": "+55 ",
    "CL": "+56 ",
    "EC": "+593 ",
}
for label, prefix in tel_prefix_by_label.items():
    matching_cols = [col for col in df_interact if col[:len(prefix)] == prefix]
    df_interact[label + " Connections"] = df_interact[matching_cols].sum(axis=1)

# Total over every column whose name starts with '+' (the summary columns added above do not).
all_tel_cols = [col for col in df_interact if col[0] == "+"]
df_interact["nConnections"] = df_interact[all_tel_cols].sum(axis=1)

In [20]:
def _rows_for_prefix(prefix):
    """Rows of df_interact whose index label begins with `prefix`."""
    return df_interact.loc[[num for num in df_interact.index if num.startswith(prefix)]]

# One sub-frame per country prefix (the trailing space is part of the match).
df_interact_vz = _rows_for_prefix("+58 ")
df_interact_co = _rows_for_prefix("+57 ")
df_interact_pe = _rows_for_prefix("+51 ")
df_interact_cl = _rows_for_prefix("+56 ")
df_interact_ec = _rows_for_prefix("+593 ")
df_interact_us = _rows_for_prefix("+1 ")

### Comparing connections to VZ and CO users


In [21]:
# One figure per user nationality: overlaid histograms of each user's number of
# connections to CO users and to VZ users (30 bins over 0-300).
for country_rows, figure_title, out_path in [
    (df_interact_vz, "Venezuelan WhatsApp Users (+58)", "images/ch-members/active/hist_connToCOVZ_VZ.png"),
    (df_interact_co, "Colombian WhatsApp Users (+57)", "images/ch-members/active/hist_connToCOVZ_CO.png"),
    (df_interact_pe, "Peruvian WhatsApp Users (+51)", "images/ch-members/active/hist_connToCOVZ_PE.png"),
    (df_interact_ec, "Ecuadorian WhatsApp Users (+593)", "images/ch-members/active/hist_connToCOVZ_EC.png"),
    (df_interact_cl, "Chilean WhatsApp Users (+56)", "images/ch-members/active/hist_connToCOVZ_CL.png"),
]:
    for conn_col, legend_label in (("CO Connections", "To CO Users"),
                                   ("VZ Connections", "To VZ Users")):
        plt.hist(country_rows[conn_col], label=legend_label, range=(0, 300), alpha=0.5, bins=30)
    plt.xlabel("Number of Connections")
    plt.ylabel("Number of Users")
    plt.title(figure_title)
    plt.legend()
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0.05)
    plt.close()

### Bar graphs of users connected to users from each country

In [22]:
def selectCountryCode(df, cc):
    """Return the rows of `df` whose index label starts with `cc` followed by a space."""
    prefix = cc + ' '
    has_prefix = df.index.str.startswith(prefix)
    return df.loc[has_prefix]

def allCountryCode(df):
    """Return, for every index label of `df` in order, the text before its first space."""
    return [label.partition(" ")[0] for label in df.index]

# Calling-code prefix -> three-letter country label shown on the bar charts.
cc = {
    '+51': 'PER',
    '+55': 'BRA',
    '+56': 'CHL',
    '+57': 'COL',
    '+58': 'VEN',
    '+593': 'ECU',
    '+1': 'USA',
}

def countryCodeToCountry(code):
    """Look up the country label for a calling code; raises KeyError if it is not in `cc`."""
    label = cc[code]
    return label

# Live view of the selected codes, in the insertion order above.
selectCountries = cc.keys()

In [23]:
# bar graph by country

df_interest = df_interact
countCountryCode = Counter(allCountryCode(df_interest))

# `countCountryCode and selectCountries` never intersected the two collections:
# `and` evaluates to `selectCountries` whenever the Counter is non-empty.
# Spell that out: one bar per selected country (a Counter returns 0 for absent codes),
# and no bars when there are no users, which also avoids dividing by zero.
n_users = len(df_interest)
codes = list(selectCountries) if n_users else []

x = [countryCodeToCountry(c) for c in codes]
y = [100 * countCountryCode[c] / n_users for c in codes]

plt.figure(figsize = (9, 5))
plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("% of Users")
plt.title("All Users (%s)" % n_users)
plt.savefig("images/ch-members/active/bar_byCountry.png", bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [24]:
# bar graph by country, for those with Peru connections

df_interest = df_interact[df_interact["PE Connections"] > 0]
countCountryCode = Counter(allCountryCode(df_interest))

# `countCountryCode and selectCountries` never intersected the two collections:
# `and` evaluates to `selectCountries` whenever the Counter is non-empty.
# Spell that out: one bar per selected country (a Counter returns 0 for absent codes),
# and no bars when there are no users, which also avoids dividing by zero.
n_users = len(df_interest)
codes = list(selectCountries) if n_users else []

x = [countryCodeToCountry(c) for c in codes]
y = [100 * countCountryCode[c] / n_users for c in codes]

plt.figure(figsize = (9, 5))
plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("% of Users")
plt.title("Users Connected to Peru Users (%s)" % n_users)
plt.savefig("images/ch-members/active/bar_withConnTo_PE.png", bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [25]:
# bar graph by country, for those with Chile connections

df_interest = df_interact[df_interact["CL Connections"] > 0]
countCountryCode = Counter(allCountryCode(df_interest))

# `countCountryCode and selectCountries` never intersected the two collections:
# `and` evaluates to `selectCountries` whenever the Counter is non-empty.
# Spell that out: one bar per selected country (a Counter returns 0 for absent codes),
# and no bars when there are no users, which also avoids dividing by zero.
n_users = len(df_interest)
codes = list(selectCountries) if n_users else []

x = [countryCodeToCountry(c) for c in codes]
y = [100 * countCountryCode[c] / n_users for c in codes]

plt.figure(figsize = (9, 5))
plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("% of Users")
plt.title("Users Connected to Chile Users (%s)" % n_users)
plt.savefig("images/ch-members/active/bar_withConnTo_CL.png", bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [26]:
""# bar graph by country, for those with Ecuador connections

# Users with at least one connection to an Ecuador (+593) user.
df_interest = df_interact[df_interact["EC Connections"] > 0]
countCountryCode = Counter(allCountryCode(df_interest))

# `countCountryCode and selectCountries` never intersected the two collections:
# `and` evaluates to `selectCountries` whenever the Counter is non-empty.
# Spell that out: one bar per selected country (a Counter returns 0 for absent codes),
# and no bars when there are no users, which also avoids dividing by zero.
n_users = len(df_interest)
codes = list(selectCountries) if n_users else []

x = [countryCodeToCountry(c) for c in codes]
y = [100 * countCountryCode[c] / n_users for c in codes]

plt.figure(figsize = (9, 5))
plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("% of Users")
plt.title("Users Connected to Ecuador Users (%s)" % n_users)
plt.savefig("images/ch-members/active/bar_withConnTo_EC.png", bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

# Diversity

### Entropy

In [27]:
def entr(row):
    """Entropy of the row's per-country counts (the `count_cols` entries), via scipy.stats.entropy."""
    country_counts = row[count_cols]
    return scipy.stats.entropy(country_counts)

In [28]:
# Row-wise: one entropy value per group.
df_groups['entropy'] = df_groups.apply(entr, axis = 1)

In [29]:
# Histogram of the per-group entropy values (15 bins)

fig, ax = plt.subplots()
ax.hist(df_groups["entropy"], bins=15)
ax.set_xlabel("Entropy")
ax.set_ylabel("# of Groups")
ax.set_title("Diversity within Groups")
fig.savefig('images/ch-members/active/hist_entropy.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

### Simpson

In [30]:
def simpson(row):
    """Sum of squared per-country counts divided by the squared group size (row['Size'])."""
    sum_of_squares = np.square(row[count_cols]).sum()
    squared_size = row['Size'] ** 2
    return sum_of_squares / squared_size

In [31]:
# Row-wise: one Simpson value per group (1.0 when all members share a country code).
df_groups['simpson'] = df_groups.apply(simpson, axis = 1)

### Entropy vs. Simpson

In [32]:
# The two diversity measures against each other, one point per group.
fig, ax = plt.subplots()
ax.scatter(df_groups['simpson'], df_groups['entropy'])
ax.set_xlabel("Simpson Index (similarity)")
ax.set_ylabel("Entropy (uncertainty)")
ax.set_title("National Diversity within Groups")
fig.savefig('images/ch-members/active/scatter_simpson_entropy.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [33]:
print("Entropy ~ simpson")

# Pearson (r, p-value) is printed first, then the single-predictor OLS summary.
corr = scipy.stats.pearsonr(df_groups['entropy'], df_groups['simpson'])
reg = smf.ols('entropy ~ simpson', data=df_groups).fit()
print(corr)
print(reg.summary())

Entropy ~ simpson
(-0.9620096952633582, 6.673013759688798e-99)
                            OLS Regression Results                            
Dep. Variable:                entropy   R-squared:                       0.925
Model:                            OLS   Adj. R-squared:                  0.925
Method:                 Least Squares   F-statistic:                     2136.
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           6.67e-99
Time:                        17:40:16   Log-Likelihood:                 57.166
No. Observations:                 174   AIC:                            -110.3
Df Residuals:                     172   BIC:                            -104.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------

## Diversity correlates

In [34]:
# Entropy against the VZ share of each group.
fig, ax = plt.subplots()
ax.scatter(df_groups['pVZ'], df_groups['entropy'])
ax.set_xlabel("Proportion VZ in Group")
ax.set_ylabel("Entropy")
fig.savefig('images/ch-members/active/scatter_pVZ_entropy.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [35]:
# Entropy against the CO share of each group.
fig, ax = plt.subplots()
ax.scatter(df_groups['pCO'], df_groups['entropy'])
ax.set_xlabel("Proportion CO in Group")
ax.set_ylabel("Entropy")
fig.savefig('images/ch-members/active/scatter_pCO_entropy.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [36]:
print("Entropy ~ pVZ")

# Pearson (r, p-value) is printed first, then the single-predictor OLS summary.
corr = scipy.stats.pearsonr(df_groups['pVZ'], df_groups['entropy'])
reg = smf.ols('entropy ~ pVZ', data=df_groups).fit()
print(corr)
print(reg.summary())

Entropy ~ pVZ
(0.07785176680860256, 0.30721461019271123)
                            OLS Regression Results                            
Dep. Variable:                entropy   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.049
Date:                Tue, 14 Apr 2020   Prob (F-statistic):              0.307
Time:                        17:40:26   Log-Likelihood:                -168.20
No. Observations:                 174   AIC:                             340.4
Df Residuals:                     172   BIC:                             346.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [37]:
print("Entropy ~ pCO")
# Pearson (r, p-value) is printed first, then the single-predictor OLS summary.
corr = scipy.stats.pearsonr(df_groups['pCO'], df_groups['entropy'])
reg = smf.ols('entropy ~ pCO', data=df_groups).fit()
print(corr)
print(reg.summary())

Entropy ~ pCO
(-0.5047564149426101, 1.2301502872321198e-12)
                            OLS Regression Results                            
Dep. Variable:                entropy   R-squared:                       0.255
Model:                            OLS   Adj. R-squared:                  0.250
Method:                 Least Squares   F-statistic:                     58.80
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           1.23e-12
Time:                        17:40:26   Log-Likelihood:                -143.14
No. Observations:                 174   AIC:                             290.3
Df Residuals:                     172   BIC:                             296.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [38]:
# Entropy against the share of members that are neither CO nor VZ.
fig, ax = plt.subplots()
ax.scatter(df_groups['p3rdCountry'], df_groups['entropy'])
ax.set_xlabel("Prop. Non-CO/Non-VZ in Group")
ax.set_ylabel("Entropy")
fig.savefig('images/ch-members/active/scatter_p3rd_entropy.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [39]:
print("Entropy ~ p3rd")

# Pearson (r, p-value) is printed first, then the single-predictor OLS summary.
corr = scipy.stats.pearsonr(df_groups['p3rdCountry'], df_groups['entropy'])
reg = smf.ols('entropy ~ p3rdCountry', data=df_groups).fit()
print(corr)
print(reg.summary())

Entropy ~ p3rd
(0.5140851118532208, 4.0079190930919647e-13)
                            OLS Regression Results                            
Dep. Variable:                entropy   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.260
Method:                 Least Squares   F-statistic:                     61.79
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           4.01e-13
Time:                        17:40:29   Log-Likelihood:                -142.02
No. Observations:                 174   AIC:                             288.0
Df Residuals:                     172   BIC:                             294.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

In [40]:
# Entropy against group size.
fig, ax = plt.subplots()
ax.scatter(df_groups['Size'], df_groups['entropy'])
ax.set_xlabel("Group Size")
ax.set_ylabel("Entropy")
fig.savefig('images/ch-members/active/scatter_size_entropy.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [41]:
print("Entropy ~ Size")

# Pearson (r, p-value) is printed first, then the single-predictor OLS summary.
corr = scipy.stats.pearsonr(df_groups['Size'], df_groups['entropy'])
reg = smf.ols('entropy ~ Size', data=df_groups).fit()
print(corr)
print(reg.summary())

Entropy ~ Size
(0.3217276643275484, 1.4982159358062118e-05)
                            OLS Regression Results                            
Dep. Variable:                entropy   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.098
Method:                 Least Squares   F-statistic:                     19.86
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           1.50e-05
Time:                        17:40:32   Log-Likelihood:                -159.22
No. Observations:                 174   AIC:                             322.4
Df Residuals:                     172   BIC:                             328.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

### Simpson, to check

In [42]:
print("Simpson ~ pVZ, pCO, p3rd, Size (to check)")
# First of four checks that repeat the entropy regressions with the Simpson index.
corr = scipy.stats.pearsonr(df_groups['pVZ'], df_groups['simpson'])
reg = smf.ols('simpson ~ pVZ', data=df_groups).fit()
print(corr)
print(reg.summary())

Simpson ~ pVZ, pCO, p3rd, Size (to check)
(-0.11795490916390278, 0.12110939301296374)
                            OLS Regression Results                            
Dep. Variable:                simpson   R-squared:                       0.014
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     2.427
Date:                Tue, 14 Apr 2020   Prob (F-statistic):              0.121
Time:                        17:40:45   Log-Likelihood:                -30.348
No. Observations:                 174   AIC:                             64.70
Df Residuals:                     172   BIC:                             71.01
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------

In [43]:
# Simpson index against the CO share: Pearson (r, p-value), then the OLS summary.
corr = scipy.stats.pearsonr(df_groups['pCO'], df_groups['simpson'])
reg = smf.ols('simpson ~ pCO', data=df_groups).fit()
print(corr)
print(reg.summary())

(0.5334682161290204, 3.484884090432477e-14)
                            OLS Regression Results                            
Dep. Variable:                simpson   R-squared:                       0.285
Model:                            OLS   Adj. R-squared:                  0.280
Method:                 Least Squares   F-statistic:                     68.42
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           3.48e-14
Time:                        17:40:46   Log-Likelihood:                -2.4310
No. Observations:                 174   AIC:                             8.862
Df Residuals:                     172   BIC:                             15.18
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercep

In [44]:
# Simpson index against the non-CO/non-VZ share: Pearson (r, p-value), then the OLS summary.
corr = scipy.stats.pearsonr(df_groups['p3rdCountry'], df_groups['simpson'])
reg = smf.ols('simpson ~ p3rdCountry', data=df_groups).fit()
print(corr)
print(reg.summary())

(-0.5150722864106063, 3.552284240929059e-13)
                            OLS Regression Results                            
Dep. Variable:                simpson   R-squared:                       0.265
Model:                            OLS   Adj. R-squared:                  0.261
Method:                 Least Squares   F-statistic:                     62.11
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           3.55e-13
Time:                        17:40:48   Log-Likelihood:                -4.7456
No. Observations:                 174   AIC:                             13.49
Df Residuals:                     172   BIC:                             19.81
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Inter

In [45]:
# Simpson index against group size: Pearson (r, p-value), then the OLS summary.
corr = scipy.stats.pearsonr(df_groups['Size'], df_groups['simpson'])
reg = smf.ols('simpson ~ Size', data=df_groups).fit()
print(corr)
print(reg.summary())

(-0.19807507966452123, 0.008793798880589837)
                            OLS Regression Results                            
Dep. Variable:                simpson   R-squared:                       0.039
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     7.024
Date:                Tue, 14 Apr 2020   Prob (F-statistic):            0.00879
Time:                        17:40:49   Log-Likelihood:                -28.085
No. Observations:                 174   AIC:                             60.17
Df Residuals:                     172   BIC:                             66.49
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Interce

# Network: Groups as nodes

In [46]:
# Index labels of the members whose 'Group' count exceeds 1, i.e. who belong to several groups.
members_many_groups = df_interact.index[df_interact['Group'] > 1]

In [47]:
# Group x group matrix: entry (a, b) counts the multi-group members that groups a and b share.
# The diagonal is filled too, so (a, a) counts the multi-group members of group a.
df_interact_groups_nodes = pd.DataFrame()

for member in members_many_groups:
    # Distinct groups this member appears in.
    group_list = list(df_x.loc[df_x['tel'] == member, 'uid'].unique())
    # A scalar-filled frame is numeric from the start; DataFrame(index, columns).fillna(1)
    # yields an object-dtype frame and relies on deprecated downcasting to become numeric.
    df_new = pd.DataFrame(1, index = group_list, columns = group_list)

    df_interact_groups_nodes = df_interact_groups_nodes.add(df_new, fill_value = 0)

# Pairs of groups that never share a member.
df_interact_groups_nodes = df_interact_groups_nodes.fillna(0)

In [48]:
# Plain ndarray view of the group x group matrix; row/column i corresponds to
# df_interact_groups_nodes.index[i].
matr_interact = df_interact_groups_nodes.to_numpy()

In [49]:
# Weighted degree per group = row sum of the group x group matrix; groups absent from it get 0.
# NOTE(review): the row sum includes the diagonal (a group's overlap with itself) -- confirm intended.
# NOTE(review): a later cell adds an 'avg_dist' column to this frame, so re-running this cell
# afterwards would fold that column into the sum.
group_degree = df_interact_groups_nodes.sum(axis=1)
df_groups['degree'] = group_degree
df_groups['degree'] = df_groups['degree'].fillna(0)

In [50]:
# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array builds the same
# graph from a 2-D ndarray (nodes 0..n-1, non-zero entries become weighted edges).
G = nx.from_numpy_array(matr_interact)

In [51]:
# Connected components as node sets, largest first; G0 is the largest one as a subgraph view.
Gcc = list(nx.connected_components(G))
Gcc.sort(key=len, reverse=True)
G0 = G.subgraph(Gcc[0])

In [52]:
# average_dists[i]: mean shortest-path length (in hops) from node i to every other node
# it can reach, i.e. within its own connected component.
average_dists = []

for node in G:
    lengths = nx.shortest_path_length(G, source = node)
    # `lengths` includes the source itself at distance 0, hence the -1.
    n_others = len(lengths) - 1
    # A node with no reachable neighbour has no defined average; use NaN, not ZeroDivisionError.
    average_dists.append(sum(lengths.values()) / n_others if n_others else float('nan'))

In [53]:
print("Group graph")
# Node counts of the largest connected component and of the whole graph.
n_nodes_largest = G0.number_of_nodes()
n_nodes_total = G.number_of_nodes()
print("Largest CC n_nodes: %s" % n_nodes_largest)
print("Graph n_nodes: %s" % n_nodes_total)

Group graph
Largest CC n_nodes: 86
Graph n_nodes: 107


### Groups networkx graph

In [54]:
# Graph node i corresponds to the group whose id is node_uids[i].
node_uids = df_interact_groups_nodes.index
# A group with exactly half CO and half VZ members lands in both of the first two lists.
nodes_co = [i for i in G if df_groups.loc[node_uids[i], 'pCO'] >= 0.5]
nodes_vz = [i for i in G if df_groups.loc[node_uids[i], 'pVZ'] >= 0.5]
nodes_3rd = [i for i in G if not (i in nodes_co or i in nodes_vz)]

In [55]:
pos = nx.spring_layout(G, k = 1.2, iterations = 200, seed = 35)
plt.figure(figsize = (14, 8))

# Draw each class once and attach the legend label to that same collection.
# Re-drawing element [0] of each list for the legend raised IndexError on an empty class
# and painted the first node of every class twice.
# NOTE(review): the labels say '>50%' while the node lists use >= 0.5 -- confirm wording.
for class_nodes, class_color, legend_label in [
    (nodes_co, 'blue', '>50% CO'),
    (nodes_vz, 'red', '>50% VZ'),
    (nodes_3rd, 'orange', 'Neither'),
]:
    if not class_nodes:
        continue  # nothing to draw, and no legend entry for an empty class
    nx.draw_networkx_nodes(G, pos=pos, nodelist = class_nodes, node_size = 50,
                           node_color = class_color, alpha = 0.7, label = legend_label)

nx.draw_networkx_edges(G, pos=pos, width = 1)
# One marker per legend entry, even though each collection holds many nodes.
plt.legend(scatterpoints = 1)
plt.title("Groups as Nodes (connected if they share members)")
plt.savefig('images/ch-members/active/networkx_groups_co_vz_3rd.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [56]:
# Average distances of each node class, restricted to the largest connected component.
largest_cc = Gcc[0]
dists_co, dists_vz, dists_3rd = (
    [average_dists[i] for i in class_nodes if i in largest_cc]
    for class_nodes in (nodes_co, nodes_vz, nodes_3rd)
)

print('CO Centrality: %.3f' % np.mean(dists_co))
print('VZ Centrality: %.3f' % np.mean(dists_vz))
print('3rd Centrality: %.3f' % np.mean(dists_3rd))

# One-way ANOVA across the three classes, then a two-sample t-test of CO against VZ.
anova_result = f_oneway(dists_co, dists_vz, dists_3rd)
ttest_result = ttest_ind(dists_co, dists_vz)
print(anova_result)
print(ttest_result)

CO Centrality: 3.487
VZ Centrality: 3.684
3rd Centrality: 3.590
F_onewayResult(statistic=0.466308657713113, pvalue=0.6289469826725671)
Ttest_indResult(statistic=-0.8426719644136987, pvalue=0.40359138120413085)


In [57]:
print("Group graph, average dist <= 3: number and number VZ")
co = 0
vz = 0
# Nodes of the largest component with average distance <= 3, then the VZ-majority ones among them.
near_center = [i for i in Gcc[0] if average_dists[i] <= 3]
near_center_vz = [i for i in nodes_vz if i in Gcc[0] and average_dists[i] <= 3]
print(len(near_center))
print(len(near_center_vz))


Group graph, average dist <= 3: number and number VZ
16
1


## Correlates of centrality

In [58]:
# average_dists is ordered by graph node id, which matches this frame's row order.
df_interact_groups_nodes['avg_dist'] = average_dists

In [59]:
# Average distance for the groups in the largest component only; the assignment aligns on
# group id, so every other group gets NaN and is dropped from df_groups_G0.
g0_positions = list(G0)
df_groups['G0_avg_dist'] = df_interact_groups_nodes['avg_dist'].iloc[g0_positions]
df_groups_G0 = df_groups.loc[df_groups['G0_avg_dist'].notna()]

In [60]:
print("With G0_avg_dist")
# Pearson (r, p-value) of each group attribute with the average distance, largest component only.
g0_correlates = ['Size', 'pVZ', 'pCO', 'pUS', 'pPE',
                 'pCL', 'pEC', 'p3rdCountry', 'entropy', 'simpson']
for col in g0_correlates:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups_G0[col], df_groups_G0['G0_avg_dist']))

With G0_avg_dist
(-0.27182086536756184, 0.011349852354647947)
(0.04225742555547344, 0.6992615411753947)
(-0.1271925679198834, 0.24320143643718442)
(-0.04418591729836163, 0.6862387100174795)
(-0.010607229883454472, 0.9227814300685595)
(0.12707729505844576, 0.24363231830327206)
(-0.016935378155973707, 0.8770064578003953)
(0.12077502007717883, 0.2679945769253617)
(0.021454026594639134, 0.8445572040709681)
(-0.02235149777871722, 0.8381412332484495)


In [61]:
print("G0_avg_dist ~ Size")
# Fitted on df_groups; the recorded summary reports 86 observations (the largest component).
avg_dist_model = smf.ols('G0_avg_dist ~ Size', data=df_groups)
reg = avg_dist_model.fit()
print(reg.summary())

G0_avg_dist ~ Size
                            OLS Regression Results                            
Dep. Variable:            G0_avg_dist   R-squared:                       0.074
Model:                            OLS   Adj. R-squared:                  0.063
Method:                 Least Squares   F-statistic:                     6.702
Date:                Tue, 14 Apr 2020   Prob (F-statistic):             0.0113
Time:                        17:42:31   Log-Likelihood:                -77.335
No. Observations:                  86   AIC:                             158.7
Df Residuals:                      84   BIC:                             163.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.7008      0.088 

## Correlates of group degree

In [62]:
# Histogram of the per-group weighted degree (30 bins)

fig, ax = plt.subplots()
ax.hist(df_groups["degree"], bins=30)
ax.set_xlabel("Degree")
ax.set_ylabel("# of Groups")
ax.set_title("Group Network")
fig.savefig('images/ch-members/active/hist_degree.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [63]:
print("Group degree: mean, median, <5 prop, >20 prop")
degrees = df_groups["degree"]
# Mean, median, then the share of groups below 5 and above 20.
for stat in (np.mean(degrees), np.median(degrees), np.mean(degrees < 5), np.mean(degrees > 20)):
    print(stat)

Group degree: mean, median, <5 prop, >20 prop
13.137931034482758
3.0
0.5919540229885057
0.20689655172413793


In [64]:
print("With degree")

# Pearson (r, p-value) of each group attribute with the weighted degree, over all groups.
degree_correlates = ['Size', 'pVZ', 'pCO', 'pUS', 'pPE',
                     'pCL', 'pEC', 'p3rdCountry', 'entropy', 'simpson']
for col in degree_correlates:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[col], df_groups['degree']))

# Degree against average distance, which is defined for the largest component only.
print("============= G0_avg_dist =============")
print(scipy.stats.pearsonr(df_groups_G0['degree'], df_groups_G0['G0_avg_dist']))

With degree
(0.48336865909383336, 1.4203527681663546e-11)
(0.0229013644263432, 0.7642139984639535)
(-0.04298145909497579, 0.5733407963987452)
(-0.07306825743003638, 0.33797658223277727)
(0.08115494749447351, 0.28708037108299356)
(-0.030011719395183382, 0.6942331260013966)
(0.0246334702318857, 0.7469643488557437)
(0.030887952989118356, 0.6857716187223513)
(0.31686284686195454, 2.0454453381775717e-05)
(-0.2484712583922792, 0.0009465896880191944)
(-0.30639430164783776, 0.004115715195865375)


In [65]:
print("Degree ~ size + entropy")

# Two-predictor OLS of the weighted degree on group size and entropy, over all groups.
degree_model = smf.ols('degree ~ Size + entropy', data=df_groups)
reg = degree_model.fit()
print(reg.summary())

Degree ~ size + entropy
                            OLS Regression Results                            
Dep. Variable:                 degree   R-squared:                       0.263
Model:                            OLS   Adj. R-squared:                  0.254
Method:                 Least Squares   F-statistic:                     30.46
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           4.83e-12
Time:                        17:42:37   Log-Likelihood:                -765.05
No. Observations:                 174   AIC:                             1536.
Df Residuals:                     171   BIC:                             1546.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.6517      2

In [66]:
print("Degree ~ size + entropy, only degree <= 70")

# Same model, refitted without the groups whose degree exceeds 70.
degree_at_most_70 = df_groups.loc[df_groups['degree'] <= 70]
reg = smf.ols('degree ~ Size + entropy', data=degree_at_most_70).fit()
print(reg.summary())

Degree ~ size + entropy, only degree <= 70
                            OLS Regression Results                            
Dep. Variable:                 degree   R-squared:                       0.110
Model:                            OLS   Adj. R-squared:                  0.099
Method:                 Least Squares   F-statistic:                     10.19
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           6.69e-05
Time:                        17:42:37   Log-Likelihood:                -677.38
No. Observations:                 168   AIC:                             1361.
Df Residuals:                     165   BIC:                             1370.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept

## Clustering coefficient

In [67]:
# Clustering coefficient per graph node, relabelled with the group ids.
# nx.clustering returns {node: coefficient}; nodes are in matrix row order.
clustering_by_node = nx.clustering(G)
groups_clustering = pd.DataFrame(
    data=clustering_by_node.values(),
    index=df_interact_groups_nodes.index.values,
    columns=['clustering'],
)

In [68]:
# Aligned on group id; groups that are not nodes of the graph get NaN.
df_groups['clustering'] = groups_clustering['clustering']

In [69]:
# Keep only the groups that have a clustering coefficient, i.e. those present in the graph.
df_c = df_groups[df_groups['clustering'].notna()]

In [70]:
# Row count should equal the number of graph nodes.
print("df_c (clustering) shape")
print(df_c.shape)

df_c (clustering) shape
(107, 64)


In [71]:
# Histogram of clustering coefficients over the groups in the graph (20 bins)

fig, ax = plt.subplots()
ax.hist(df_c["clustering"], bins=20)
ax.set_xlabel("Clustering Coefficient")
ax.set_ylabel("# of Groups")
ax.set_title("Group Network")
fig.savefig('images/ch-members/active/hist_clustering.png', bbox_inches='tight', pad_inches=0.05)
plt.close(fig)

In [72]:
print("With clustering")

# Pearson (r, p-value) of each group attribute with the clustering coefficient, graph groups only.
clustering_correlates = ['Size', 'pVZ', 'pCO', 'pUS', 'pPE',
                         'pCL', 'pEC', 'p3rdCountry', 'entropy', 'simpson', 'degree']
for col in clustering_correlates:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_c[col], df_c['clustering']))

With clustering
(-0.29467228825574776, 0.0020628825201293157)
(0.06135727682049273, 0.5301206519554046)
(0.1134853873531027, 0.244473333888821)
(0.020269469129507965, 0.8358320915814725)
(-0.19719569083534397, 0.04176176348618285)
(-0.20941831365486735, 0.030398280078141677)
(0.11174856082441106, 0.25181481943674505)
(-0.17760130406248176, 0.0672316681600139)
(-0.07111335920770363, 0.4666842701179089)
(0.06831240082307934, 0.4844604283633672)
(0.057584993160039656, 0.5557562071928962)


## 3 Class graphs (groups)

In [73]:
def threeClassGraphG0(colname, label, appendix = False):
    """Draw G0 with its nodes split into three classes of a group attribute.

    Nodes of the largest component (``Gcc[0]``) are bucketed by their value of
    ``df_groups[colname]``: low (<= 30th percentile), mid (> 30th and <= 70th)
    and high (> 70th). The percentiles are computed over every row of
    ``df_groups``, not only over the nodes that are drawn. The figure is saved
    as ``networkx_3cls_<colname>.png`` and closed. Afterwards the mean of
    ``average_dists`` per class, a one-way ANOVA across the three classes and
    a t-test between the low and high classes are printed.

    Reads the notebook globals ``df_groups``, ``df_interact_groups_nodes``,
    ``Gcc``, ``G0`` and ``average_dists``. Node ``i`` is looked up as the
    ``i``-th label of ``df_interact_groups_nodes.index``.

    Parameters
    ----------
    colname : str
        Column of ``df_groups`` used to classify the nodes; also used in the
        output file name.
    label : str
        Human-readable name of the attribute for the legend and printed lines.
    appendix : bool, default False
        If True, save under ``images/ch-appendicies/members/``; otherwise
        under ``images/ch-members/active/``.

    Returns
    -------
    None
    """
    # Class boundaries: 30th and 70th percentiles over all rows of df_groups.
    l = np.quantile(df_groups[colname], .3)
    h = np.quantile(df_groups[colname], .7)

    # vals is positional: vals[i] belongs to the i-th index label, and node
    # ids are used as positions below.
    vals = [df_groups.loc[u, colname] for u in df_interact_groups_nodes.index.values]

    nodes_low = [i for i in Gcc[0] if vals[i] <= l]
    nodes_mid = [i for i in Gcc[0] if l < vals[i] <= h]
    nodes_high = [i for i in Gcc[0] if vals[i] > h]

    plt.figure(figsize = (14, 14))
    pos = nx.spring_layout(G0, iterations = 200, seed = 0)
    # Raw strings so the LaTeX "\leq" is not read as a (invalid) string escape.
    node_classes = (
        (nodes_low, '#ffb1b1', r'%s $\leq$ %.3f' % (label, l)),
        (nodes_mid, '#d80000', r'%s $>$ %.3f, $\leq$ %.3f' % (label, l, h)),
        (nodes_high, '#4e0000', r'%s $>$ %.3f' % (label, h)),
    )
    for class_nodes, class_color, class_label in node_classes:
        nx.draw_networkx_nodes(G0, pos=pos, nodelist=class_nodes, node_color=class_color,
                               label=class_label, node_size = 120)
    nx.draw_networkx_edges(G0, pos=pos, width = 1)
    plt.legend(scatterpoints = 1)
    plt.title("Groups as Nodes")

    if appendix:
        out_dir = 'images/ch-appendicies/members/'
    else:
        out_dir = 'images/ch-members/active/'
    # savefig does not create missing directories, so make sure the target exists.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    plt.savefig(out_dir + 'networkx_3cls_%s.png' % colname, bbox_inches = 'tight', pad_inches = 0.05)
    plt.close()

    dists_low = [average_dists[i] for i in nodes_low]
    dists_mid = [average_dists[i] for i in nodes_mid]
    dists_high = [average_dists[i] for i in nodes_high]
    print("Three class graph")
    # The low class includes the boundary value, so it is reported with "<=".
    print('%s <= %s Centrality: %.3f' %
          (label, l, np.mean(dists_low)))
    print('%s > %s, <= %s Centrality: %.3f' %
          (label, l, h, np.mean(dists_mid)))
    print('%s > %s Centrality: %.3f' %
          (label, h, np.mean(dists_high)))
    print(f_oneway(dists_low, dists_mid, dists_high))
    print(ttest_ind(dists_low, dists_high))
    

In [74]:
# Three-class split by group size; figure goes to the main chapter folder.
threeClassGraphG0(colname = 'Size', label = 'Size')

Three class graph
Size < 5.0 Centrality: 3.929
Size > 5.0, <= 46.099999999999994 Centrality: 3.689
Size > 46.099999999999994 Centrality: 3.351
F_onewayResult(statistic=4.492003079061608, pvalue=0.014050103968561442)
Ttest_indResult(statistic=2.020439643113485, pvalue=0.04945073331728726)


In [75]:
# Three-class split by the pCO column; figure goes to the main chapter folder.
threeClassGraphG0(colname = 'pCO', label = 'Prop. CO')

Three class graph
Prop. CO < 0.23076923076923078 Centrality: 3.734
Prop. CO > 0.23076923076923078, <= 0.8065217391304347 Centrality: 3.492
Prop. CO > 0.8065217391304347 Centrality: 3.459
F_onewayResult(statistic=1.4304707292530303, pvalue=0.24503127359905394)
Ttest_indResult(statistic=1.6482245190285338, pvalue=0.10676847456880452)


In [76]:
# Three-class split by the pVZ column; figure goes to the appendix folder.
threeClassGraphG0(colname = 'pVZ', label = 'Prop. VZ', appendix = True)

Three class graph
Prop. VZ < 0.0 Centrality: 3.738
Prop. VZ > 0.0, <= 0.24471830985915488 Centrality: 3.436
Prop. VZ > 0.24471830985915488 Centrality: 3.625
F_onewayResult(statistic=1.5052703790141295, pvalue=0.22795346132671993)
Ttest_indResult(statistic=0.6109110317693726, pvalue=0.5445507066407507)


In [77]:
# Three-class split by the p3rdCountry column; figure goes to the appendix folder.
threeClassGraphG0(colname = 'p3rdCountry', label = 'Prop. Non-CO, Non-VZ', appendix = True)

Three class graph
Prop. Non-CO, Non-VZ < 0.012294543063773877 Centrality: 3.739
Prop. Non-CO, Non-VZ > 0.012294543063773877, <= 0.43977154724818274 Centrality: 3.483
Prop. Non-CO, Non-VZ > 0.43977154724818274 Centrality: 3.543
F_onewayResult(statistic=1.0653012702597509, pvalue=0.349288254557079)
Ttest_indResult(statistic=1.0061648530476544, pvalue=0.3210516280586712)


In [78]:
# Three-class split by the entropy column; figure goes to the appendix folder.
threeClassGraphG0(colname = 'entropy', label = 'Entropy', appendix = True)

Three class graph
Entropy < 0.3237385189613794 Centrality: 3.580
Entropy > 0.3237385189613794, <= 1.164033446128445 Centrality: 3.544
Entropy > 1.164033446128445 Centrality: 3.541
F_onewayResult(statistic=0.020816675737600158, pvalue=0.9794036072568366)
Ttest_indResult(statistic=0.19060401502645746, pvalue=0.8496567751837993)


# Graph: Users as nodes

In [79]:
# Columns whose label starts with '+' (the per-user columns of df_interact).
users_cols = [column for column in df_interact.columns if column[0] == '+']

In [80]:
# User-by-user matrix as a plain array, with missing entries treated as 0.
# NOTE(review): presumably square with rows and columns in the same order - confirm.
matr_interact = df_interact.loc[:, users_cols].fillna(value = 0).values

In [81]:
# Build the user graph from the adjacency matrix. from_numpy_array is used
# because from_numpy_matrix was deprecated and removed in NetworkX 3.0.
# NOTE: this rebinds G, the same name the clustering-coefficient cell passes
# to nx.clustering(G); re-running that cell after this one would use the
# user graph instead.
G = nx.from_numpy_array(matr_interact)

In [82]:
# Connected components of the user graph, largest first; G0 is the largest one.
# NOTE: threeClassGraphG0 reads the globals Gcc and G0, so calling it after
# this cell would operate on the user graph.
Gcc = list(nx.connected_components(G))
Gcc.sort(key = len, reverse = True)
G0 = G.subgraph(Gcc[0])

In [83]:
# Node counts of the largest component and of the whole user graph.
print("Users as nodes graph")
print("Largest CC n_nodes: {}".format(G0.number_of_nodes()))
print("Graph n_nodes: {}".format(G.number_of_nodes()))

Users as nodes graph
Largest CC n_nodes: 5693
Graph n_nodes: 7860


In [84]:
def colorMap(tel):
    """Return a hex colour chosen by the first four characters of ``tel``.

    '+57 ' maps to '#8b0000', '+58 ' maps to '#0000ff', and anything else
    maps to '#ffff00'.
    """
    prefix = tel[:4]
    for dial_code, colour in (('+57 ', '#8b0000'), ('+58 ', '#0000ff')):
        if prefix == dial_code:
            return colour
    return '#ffff00'

# One colour per row of df_interact, then the subset for the largest component.
# Note: these hex colours differ from the named colours ('blue', 'red',
# 'yellow') that the drawing cells in this section pass directly.
cmap = list(map(colorMap, df_interact.index.values))
cmap0 = [cmap[node_id] for node_id in Gcc[0]]

In [85]:
# Spring layout of the largest component; the fixed seed keeps the node
# positions the same between runs so both figures below share one layout.
pos = nx.spring_layout(
    G0,
    iterations = 50,
    seed = 0
)

In [86]:
# Split the nodes of the largest component by the first four characters of
# their df_interact index label: '+57 ', '+58 ', or anything else.
nodes_co, nodes_vz, nodes_3rd = [], [], []
for user_node in Gcc[0]:
    dial_prefix = df_interact.index[user_node][:4]
    if dial_prefix == '+57 ':
        nodes_co.append(user_node)
    elif dial_prefix == '+58 ':
        nodes_vz.append(user_node)
    else:
        nodes_3rd.append(user_node)

In [87]:
plt.figure(figsize = (14, 14))

# (nodes, colour, legend label) per layer, in draw order. nodes_3rd is not
# drawn in this figure.
country_layers = [
    (nodes_co, 'blue', 'CO Users'),
    (nodes_vz, 'red', 'VZ Users'),
]

# Pass 1: all nodes of each layer, nearly transparent.
for layer_nodes, layer_color, layer_label in country_layers:
    nx.draw_networkx_nodes(G0, pos=pos, nodelist = layer_nodes, node_size = 50,
                           node_color = layer_color, alpha = 0.05)

# Pass 2: redraw a single node per layer at higher opacity and attach the
# label, so the legend entry is drawn with alpha 0.5 rather than 0.05.
for layer_nodes, layer_color, layer_label in country_layers:
    nx.draw_networkx_nodes(G0, pos=pos, nodelist = [layer_nodes[0]], node_size = 50,
                           node_color = layer_color, alpha = 0.5, label = layer_label)

nx.draw_networkx_edges(G0, pos=pos, width = 0.005)
plt.legend()
plt.title("Users as Nodes (connected if in same group)")
plt.savefig('images/ch-members/active/networkx_co_vz.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [88]:
plt.figure(figsize = (14, 14))

# (nodes, colour, legend label) per layer, in draw order.
country_layers = [
    (nodes_co, 'blue', 'CO Users'),
    (nodes_vz, 'red', 'VZ Users'),
    (nodes_3rd, 'yellow', 'Other Users'),
]

# Pass 1: all nodes of each layer, nearly transparent.
for layer_nodes, layer_color, layer_label in country_layers:
    nx.draw_networkx_nodes(G0, pos=pos, nodelist = layer_nodes, node_size = 50,
                           node_color = layer_color, alpha = 0.05)

# Pass 2: redraw a single node per layer at higher opacity and attach the
# label, so the legend entry is drawn with alpha 0.5 rather than 0.05.
for layer_nodes, layer_color, layer_label in country_layers:
    nx.draw_networkx_nodes(G0, pos=pos, nodelist = [layer_nodes[0]], node_size = 50,
                           node_color = layer_color, alpha = 0.5, label = layer_label)

nx.draw_networkx_edges(G0, pos=pos, width = 0.005)
plt.legend()
plt.title("Users as Nodes (connected if in same group)")
plt.savefig('images/ch-members/active/networkx_co_vz_other.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

## Degree distributions of users

In [89]:
# Histogram of the nConnections column over all users.
plt.figure(figsize = (12, 6))
degree_ax = df_interact['nConnections'].hist(bins = 100)
degree_ax.set_xlabel("Number of Connections")
degree_ax.set_ylabel('# of Users')
degree_ax.set_title("User Degree Distribution (All Users)")
plt.savefig("images/ch-members/active/degree_dist.png", bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [90]:
# Overlaid histograms of nConnections for the CO and VZ user frames.
# The fixed range means values above 500 are left out of both histograms.
plt.figure(figsize = (12, 6))
for country_frame, opacity, legend_label in (
        (df_interact_co, 0.3, 'CO Users'),
        (df_interact_vz, 0.4, 'VZ Users')):
    country_frame['nConnections'].hist(bins = 100, alpha = opacity, range = (0, 500),
                                       label = legend_label)
plt.legend()
plt.xlabel("Number of Connections")
plt.ylabel('# of Users')
plt.title("User Degree Distribution (CO and VZ Users)")
plt.savefig("images/ch-members/active/degree_dist_VZ_vs_CO.png", bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [91]:
print("nConnections means: VZ, CO")

# Mean nConnections per frame, printed in the order named in the header line.
for country_frame in (df_interact_vz, df_interact_co):
    print(country_frame['nConnections'].mean())

nConnections means: VZ, CO
165.938713592233
161.9832481544577


In [92]:
# Two-sample t-test on nConnections, VZ frame vs. CO frame (scipy defaults).
vz_connections = df_interact_vz['nConnections']
co_connections = df_interact_co['nConnections']
print(ttest_ind(vz_connections, co_connections))

Ttest_indResult(statistic=1.3184905615625262, pvalue=0.18739791040818618)


In [93]:
print("Users with most connections")

# NOTE(review): the index of df_interact appears to hold phone numbers, so this
# prints them into the saved notebook - consider redacting before sharing.
top_connected = df_interact['nConnections'].sort_values(ascending = False).head(20)
print(top_connected)

Users with most connections
+52 1 55 8520 8763    701.0
+51 923 793 356       589.0
+58 414-2014369       583.0
+58 414-7912202       545.0
+58 414-2685679       534.0
+57 311 6302043       509.0
+57 301 4350295       506.0
+58 424-9474681       506.0
+57 313 7614232       498.0
+58 416-8410879       498.0
+57 322 2611650       498.0
+58 416-8748808       498.0
+57 313 2882015       479.0
+593 97 897 4451      440.0
+55 16 99347-9730     438.0
+58 416-8741626       437.0
+51 985 159 137       436.0
+51 943 344 989       432.0
+51 955 087 225       427.0
+57 311 4922543       420.0
Name: nConnections, dtype: float64


In [94]:
print("nConnections mean, median, prop > 100, prop > 200")

# Mean of a boolean comparison gives the share of users above the threshold.
connections = df_interact['nConnections']
for summary_stat in (
        np.mean(connections),
        np.median(connections),
        np.mean(connections > 100),
        np.mean(connections > 200)):
    print(summary_stat)

nConnections mean, median, prop > 100, prop > 200
167.83091603053435
155.0
0.7057251908396946
0.3567430025445293


In [95]:
# Write df_groups, including the columns added in this notebook, under Data/.
df_groups.to_csv(path_or_buf = "Data/df_groups_nb1-members-byMessaging.csv")