In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
import matplotlib.pyplot as plt
# Font-size presets. Only BIGGER_SIZE is applied below; the two smaller
# presets are left defined in case anything else refers to them.
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

# Use the same size for all of the text elements listed here.
plt.rcParams.update({
    'font.size': BIGGER_SIZE,         # default text size
    'axes.titlesize': BIGGER_SIZE,    # axes title
    'axes.labelsize': BIGGER_SIZE,    # x and y axis labels
    'xtick.labelsize': BIGGER_SIZE,   # x tick labels
    'ytick.labelsize': BIGGER_SIZE,   # y tick labels
    'legend.fontsize': BIGGER_SIZE,   # legend entries
    'figure.titlesize': BIGGER_SIZE,  # figure-level title
})

In [3]:
import statsmodels.formula.api as smf
import scipy.stats

In [4]:
# Load the merged table; the first CSV column becomes the index.
# The `uid` and `tel` columns of this frame are used by the cells below.
df_x = pd.read_csv(
    "Data/df_x_nb0a-groupMerge.csv",
    index_col=0,
)
print(df_x.shape)

(171634, 26)


# Concentration

In [5]:
def hhConc(df):
    """Return the sum of squared shares of the values in ``df['tel']``.

    This is the Herfindahl-Hirschman form of concentration: each distinct
    ``tel`` value's share of the non-null rows is squared and the squares are
    added up. The result is 1.0 when a single value accounts for every row
    and approaches 0 as the rows spread evenly over many values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``tel`` column.

    Returns
    -------
    float
        Sum of squared shares.
    """
    # normalize=True yields shares directly; null values are excluded by
    # value_counts' default dropna=True.
    shares = df['tel'].value_counts(normalize=True)

    # Vectorised reduction rather than iterating the Series with builtin sum().
    return (shares ** 2).sum()

def top5Conc(df):
    """Return the combined share of the five most frequent ``df['tel']`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``tel`` column.

    Returns
    -------
    float
        Sum of the shares of the (at most) five most frequent values; 1.0 when
        there are five or fewer distinct values.
    """
    # value_counts sorts by descending frequency, so the first five rows are
    # the five largest shares.
    shares = df['tel'].value_counts(normalize=True)

    # .iloc makes the positional intent explicit: the index holds the `tel`
    # values themselves (possibly integers), so a bare [:5] is easy to misread
    # as a label-based slice.
    return shares.iloc[:5].sum()

def gini(df):
    """Return the Gini coefficient of the per-value counts of ``df['tel']``.

    With the counts sorted ascending as x_1 <= ... <= x_n, the value is
    sum_i (2*i - n - 1) * x_i / (n * sum_i x_i).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``tel`` column.

    Returns
    -------
    float
        Gini coefficient of the counts (0 when every value occurs equally
        often, including the single-value case). NaN when ``tel`` has no
        non-null values, instead of raising ZeroDivisionError, so one
        degenerate group cannot abort a whole ``groupby().apply``.
    """
    # The formula requires ascending order; value_counts returns descending.
    counts = np.sort(df['tel'].value_counts().to_numpy())
    n = counts.size
    if n == 0:
        return float('nan')

    ranks = np.arange(1, n + 1)

    return ((2 * ranks - n - 1) * counts).sum() / (n * counts.sum())

In [6]:
# Load the group-level table; the first CSV column becomes the index.
df_groups = pd.read_csv(
    'Data/df_groups_nb2-messages.csv',
    index_col=0,
)

In [7]:
# Concentration measures computed per `uid` from the `tel` column of df_x.
# Selecting [['tel']] after the groupby hands each function a one-column frame,
# so apply never operates on the grouping column (which emits a deprecation
# warning on pandas >= 2.2).
# The results are indexed by uid and are aligned to df_groups on assignment;
# this presumes df_groups is indexed by the same uid values -- TODO confirm.
df_groups['hhConc'] = df_x.groupby('uid')[['tel']].apply(hhConc)
df_groups['gini'] = df_x.groupby('uid')[['tel']].apply(gini)

In [8]:
# Drop the columns that are not used in the rest of this notebook.
# Note: re-running this cell without reloading df_groups raises KeyError,
# because the labels are already gone.
df_groups = df_groups.drop(
    columns=['simpson', 'G0_avg_dist', 'clustering', 'n_days'],
)

In [9]:
# Persist the group table as it stands at this point.
# NOTE(review): `top5Conc` is only added in the next cell, so it is not part
# of this file -- confirm that is intended.
df_groups.to_csv(path_or_buf='Data/df_groups_nb2b-concentration.csv')

In [10]:
# Share of the five most frequent `tel` values, per `uid`.
# Selecting [['tel']] after the groupby keeps the grouping column out of the
# frames passed to apply (avoids the pandas >= 2.2 deprecation warning).
# Aligned to df_groups by index on assignment -- presumes df_groups is indexed
# by uid, TODO confirm.
df_groups['top5Conc'] = df_x.groupby('uid')[['tel']].apply(top5Conc)

In [11]:
# Scatter of the two concentration measures over all groups, written to disk.
fig, ax = plt.subplots()
ax.scatter(df_groups['hhConc'], df_groups['top5Conc'], alpha=0.3)
ax.set_xlabel("H-H Concentration")
ax.set_ylabel("Top 5 Concentration")
ax.set_title("All Groups")
fig.savefig(
    'images/ch-messages/scatter_hh_top5.png',
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close(fig)  # release the figure once it has been saved

# Pearson r and p-value for the same pair of columns.
print(scipy.stats.pearsonr(df_groups['hhConc'], df_groups['top5Conc']))

(0.7307356779240799, 2.51683268140486e-30)


In [12]:
# Scatter of hhConc against gini over all groups, with a least-squares line.
fig, ax = plt.subplots()
ax.scatter(df_groups['hhConc'], df_groups['gini'], alpha=0.3)
ax.set_xlabel("H-H Concentration")
ax.set_ylabel("Gini Coefficient")
ax.set_title("All Groups")

# Degree-1 polyfit returns (slope, intercept).
m, b = np.polyfit(df_groups['hhConc'], df_groups['gini'], 1)
ax.plot(df_groups['hhConc'], m * df_groups['hhConc'] + b, color='orange')

fig.savefig(
    'images/ch-messages/scatter_hh_gini.png',
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close(fig)  # release the figure once it has been saved

# Pearson r and p-value for the same pair of columns.
print(scipy.stats.pearsonr(df_groups['hhConc'], df_groups['gini']))

(-0.7085117192117811, 7.821940749975295e-28)


In [13]:
# Same comparison as the all-groups plot, restricted to rows with Size > 1.
# The filter is evaluated once and reused for the plot, the fit and the test.
multi_member = df_groups[df_groups['Size'] > 1]

fig, ax = plt.subplots()
ax.scatter(multi_member['hhConc'], multi_member['gini'], alpha=0.3)
ax.set_xlabel("H-H Concentration")
ax.set_ylabel("Gini Coefficient")
ax.set_title("Groups With $> 1$ Member")

# Degree-1 polyfit returns (slope, intercept).
m, b = np.polyfit(multi_member['hhConc'], multi_member['gini'], 1)
ax.plot(multi_member['hhConc'], m * multi_member['hhConc'] + b, color='orange')

fig.savefig(
    'images/ch-messages/scatter_hh_gini_drop.png',
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close(fig)  # release the figure once it has been saved

# Pearson r and p-value on the filtered rows.
print(scipy.stats.pearsonr(multi_member['hhConc'], multi_member['gini']))

(-0.37363115787430357, 2.123858098384153e-06)


In [14]:
# Number of rows whose Size is exactly 1.
print(df_groups['Size'].eq(1).sum())

22


## hhConc correlations

In [15]:
# List the column labels of df_groups (shown as this cell's output).
df_groups.columns

Index(['+52', '+55', '+57', '+58', '+34', '+1', '+263', '+27', '+381', '+505',
       '+506', '+51', '+54', '+591', '+593', '+595', '+91', '+92', '+98',
       '+56', '+39', '+53', '+229', '+502', '+592', '+599', '+504', '+32',
       '+33', '+44', '+351', '+49', '+20', '+212', '+213', '+241', '+966',
       '+967', '+971', '+503', '+40', '+297', '+41', '+507', '+597', '+598',
       '+46', '+254', '+258', '+240', '+244', 'Size', 'pVZ', 'pCO', 'pUS',
       'pPE', 'pCL', 'pEC', 'p3rdCountry', 'entropy', 'degree', 'activity',
       'hhConc', 'gini', 'top5Conc'],
      dtype='object')

In [16]:
# Pearson correlation (r, p-value) of each listed column with hhConc.
for col in [
    'Size', 'pVZ', 'pCO', 'pUS', 'pPE', 'pCL', 'pEC', 'p3rdCountry',
    'entropy', 'degree', 'activity',
]:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[col], df_groups['hhConc']))

(-0.5075681355609349, 8.804748052670846e-13)
(-0.05753039269311824, 0.4508294138931243)
(0.08844087309651463, 0.2458525826168)
(-0.015706936515108044, 0.8370189745557833)
(-8.261152904454061e-05, 0.9991367956444462)
(-0.0290912326836545, 0.7031628233942069)
(0.05744906176507564, 0.4514708006448012)
(-0.055313765310498184, 0.46849199086135934)
(-0.4814544956034361, 1.7536414164221756e-11)
(-0.36216841861569893, 9.078047756914363e-07)
(-0.2947384307161544, 7.883786879078602e-05)


In [17]:
# Ordinary least squares of hhConc on Size, entropy, degree and activity.
reg = smf.ols(
    formula='hhConc ~ Size + entropy + degree + activity',
    data=df_groups,
).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:                 hhConc   R-squared:                       0.375
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     25.39
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           1.76e-16
Time:                        20:43:00   Log-Likelihood:                -1.5659
No. Observations:                 174   AIC:                             13.13
Df Residuals:                     169   BIC:                             28.93
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5460      0.030     18.028      0.0

In [18]:
# Scatter of Size against hhConc with a least-squares line.
fig, ax = plt.subplots()
ax.scatter(df_groups['Size'], df_groups['hhConc'])
ax.set_xlabel("Size")
ax.set_ylabel("H-H Concentration")

# Degree-1 polyfit returns (slope, intercept).
m, b = np.polyfit(df_groups['Size'], df_groups['hhConc'], 1)

# Draw the line from 0 to the largest observed Size instead of a hard-coded
# upper bound, so it always spans the data and matches the other Size plot.
# An explicit array also avoids relying on numpy-scalar * range coercion.
size_span = np.array([0, df_groups['Size'].max()])
ax.plot(size_span, m * size_span + b, color='orange')

fig.savefig(
    "images/ch-messages/scatter_size_hh.png",
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close(fig)  # release the figure once it has been saved

In [19]:
# Scatter of entropy against hhConc with a least-squares line.
fig, ax = plt.subplots()
ax.scatter(df_groups['entropy'], df_groups['hhConc'])
ax.set_xlabel("Entropy")
ax.set_ylabel("H-H Concentration")

# Degree-1 polyfit returns (slope, intercept).
m, b = np.polyfit(df_groups['entropy'], df_groups['hhConc'], 1)
ax.plot(df_groups['entropy'], m * df_groups['entropy'] + b, color='orange')

fig.savefig(
    "images/ch-messages/scatter_entropy_hh.png",
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close(fig)  # release the figure once it has been saved

## Gini correlations

In [20]:
# Pearson correlation (r, p-value) of each listed column with gini.
for col in [
    'Size', 'pVZ', 'pCO', 'pUS', 'pPE', 'pCL', 'pEC', 'p3rdCountry',
    'entropy', 'activity', 'degree',
]:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[col], df_groups['gini']))

(0.5438867640149615, 8.779324318855178e-15)
(0.13040825109232373, 0.08632025657235302)
(-0.09378610022609103, 0.21835213511357515)
(-0.03028656919773385, 0.6915748606510072)
(-0.0316274881334363, 0.678660381354542)
(-0.013677611357560814, 0.8578371674416525)
(-0.03785997941534598, 0.6199045009697741)
(0.003689702744077858, 0.9614613114620281)
(0.49980350487117475, 2.2008762306473083e-12)
(0.3486409500132344, 2.422864215788752e-06)
(0.4335386141525756, 2.2918205263423386e-09)


In [21]:
# Ordinary least squares of gini on Size, entropy, activity and degree.
reg = smf.ols(
    formula='gini ~ Size + entropy + activity + degree',
    data=df_groups,
).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:                   gini   R-squared:                       0.431
Model:                            OLS   Adj. R-squared:                  0.418
Method:                 Least Squares   F-statistic:                     32.03
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           7.38e-20
Time:                        20:44:53   Log-Likelihood:                 30.061
No. Observations:                 174   AIC:                            -50.12
Df Residuals:                     169   BIC:                            -34.33
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2771      0.025     10.972      0.0

In [22]:
# Scatter of Size against gini with a least-squares line.
fig, ax = plt.subplots()
ax.scatter(df_groups['Size'], df_groups['gini'])
ax.set_xlabel("Size")
ax.set_ylabel("Gini")

# Degree-1 polyfit returns (slope, intercept).
m, b = np.polyfit(df_groups['Size'], df_groups['gini'], 1)

# Draw the line from 0 to the largest observed Size instead of a hard-coded
# upper bound, so it always spans the data and matches the other Size plot.
# An explicit array also avoids relying on numpy-scalar * range coercion.
size_span = np.array([0, df_groups['Size'].max()])
ax.plot(size_span, m * size_span + b, color='orange')

fig.savefig(
    "images/ch-messages/scatter_size_gini.png",
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close(fig)  # release the figure once it has been saved

In [23]:
# Scatter of entropy against gini with a least-squares line.
fig, ax = plt.subplots()
ax.scatter(df_groups['entropy'], df_groups['gini'])
ax.set_xlabel("Entropy")
ax.set_ylabel("Gini")

# Degree-1 polyfit returns (slope, intercept).
m, b = np.polyfit(df_groups['entropy'], df_groups['gini'], 1)
ax.plot(df_groups['entropy'], m * df_groups['entropy'] + b, color='orange')

fig.savefig(
    "images/ch-messages/scatter_entropy_gini.png",
    bbox_inches='tight',
    pad_inches=0.05,
)
plt.close(fig)  # release the figure once it has been saved