In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
import statsmodels.formula.api as smf
import scipy.stats
from scipy.stats import ttest_ind
from scipy.stats import f_oneway

In [4]:
df_x = pd.read_csv("Data/df_x_nb4a-mis.csv", index_col = 0)
print(df_x.shape)

(171634, 34)


In [5]:
df_x['text'] = df_x['text'].fillna('')
df_x['textlower'] = df_x['textlower'].fillna('')
df_x['virality'] = df_x['virality'].fillna(0)

### Text similarity imports/functions

In [6]:
import string, re, unidecode
from sklearn.decomposition import LatentDirichletAllocation as LDA

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
stemmer = SnowballStemmer('spanish')
remove_punc = str.maketrans(string.punctuation, len(string.punctuation) * " ")
stopwords_ascii = [unidecode.unidecode(w) for w in stopwords.words('spanish')]

def tokenize(s):
    s = unidecode.unidecode(s)
    s = s.translate(remove_punc)
    tokens = nltk.word_tokenize(s)
    filtered = [w for w in tokens if w not in stopwords_ascii]
    return [stemmer.stem(w) for w in filtered]

In [8]:
def dummy(x):
    return x

vectorizer = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy,
                            token_pattern=None)

count_vectorizer = CountVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy,
                            token_pattern=None)

In [9]:
df_groups = pd.read_csv('Data/df_groups_nb3b-virality.csv', index_col = 0)
print(df_groups.head())

                                  +52  +55   +57  +58  +34   +1  +263  +27  \
uid                                                                          
0526efbcbfd2a4c352206eef2a4dd6da  1.0  1.0   3.0  3.0  0.0  0.0   0.0  0.0   
07d5068cc56fa32bb22935edee3cf10a  0.0  0.0   8.0  3.0  0.0  0.0   0.0  0.0   
1eea1e85e89157f35943c0c1e8de0535  0.0  0.0  10.0  0.0  0.0  0.0   0.0  0.0   
2bcb099b4f7c3ddf444f15d69fce0ed8  0.0  0.0   3.0  0.0  0.0  0.0   0.0  0.0   
35c0a8c5a334567e7087db5c6c8d38c5  0.0  0.0  18.0  0.0  1.0  0.0   0.0  0.0   

                                  +381  +505  ...  pCL  pEC  p3rdCountry  \
uid                                           ...                          
0526efbcbfd2a4c352206eef2a4dd6da   0.0   0.0  ...  0.0  0.0     0.250000   
07d5068cc56fa32bb22935edee3cf10a   0.0   0.0  ...  0.0  0.0     0.000000   
1eea1e85e89157f35943c0c1e8de0535   0.0   0.0  ...  0.0  0.0     0.000000   
2bcb099b4f7c3ddf444f15d69fce0ed8   0.0   0.0  ...  0.0  0.0     0.000000 

# Initial statistics

# Message dynamics

In [10]:
df_x['posMisinfo'] = (df_x['textlower'].apply(lambda x: len(tokenize(x)) >= 5))
df_p = df_x[df_x['posMisinfo']]

In [11]:
print(df_p[df_p['fake_news']].shape)
print(df_p.shape)

(472, 35)
(44025, 35)


### Replies

In [12]:
reg = smf.ols('replies_n ~ fake_news', data = df_p).fit()
print(reg.summary())
reg = smf.ols('replies_n ~ fake_news', data = df_x).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:              replies_n   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     37.55
Date:                Wed, 15 Apr 2020   Prob (F-statistic):           8.99e-10
Time:                        10:30:08   Log-Likelihood:                -86133.
No. Observations:               44025   AIC:                         1.723e+05
Df Residuals:                   44023   BIC:                         1.723e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             0.5659      0.00

In [13]:
print(np.quantile(df_p[df_p['fake_news']]['replies_n'], .95))
print(np.quantile(df_p[~df_p['fake_news']]['replies_n'], .95))

0.0
3.0


In [14]:
print(np.mean(df_p[df_p['fake_news']]['replies_n'] > 0))

0.048728813559322036


### Virality

In [16]:
reg = smf.ols('virality ~ fake_news', data = df_p[df_p['virality'] > 0]).fit()
print(reg.summary())
reg = smf.ols('virality ~ fake_news', data = df_x[df_x['virality'] > 0]).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               virality   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     6.129
Date:                Wed, 15 Apr 2020   Prob (F-statistic):             0.0133
Time:                        10:31:04   Log-Likelihood:                -22069.
No. Observations:               14227   AIC:                         4.414e+04
Df Residuals:                   14225   BIC:                         4.416e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             1.3565      0.01

In [17]:
print(np.quantile(df_p[df_p['fake_news'] & (df_p['virality'] > 0)]['virality'], .95))

1.4084999999999994


In [18]:
print(np.quantile(df_p[~df_p['fake_news'] & (df_p['virality'] > 0)]['virality'], .95))

3.5546875


### Size

In [19]:
df_p['charlength'] = df_p['text'].apply(len)
df_p['wordlength'] = df_p['text'].apply(lambda x: x.count(" "))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [20]:
reg = smf.ols('charlength ~ fake_news', data = df_p).fit()
print(reg.summary())
reg = smf.ols('wordlength ~ fake_news', data = df_p).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:             charlength   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.027
Method:                 Least Squares   F-statistic:                     1210.
Date:                Wed, 15 Apr 2020   Prob (F-statistic):          1.37e-261
Time:                        10:34:59   Log-Likelihood:            -3.4845e+05
No. Observations:               44025   AIC:                         6.969e+05
Df Residuals:                   44023   BIC:                         6.969e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept           318.1145      3.17

### LDA

In [21]:
df_t = df_p[df_p['fake_news']]
df_t.loc[:,'token'] = df_t.loc[:,'textlower'].apply(tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [22]:
X = count_vectorizer.fit_transform(df_t['token'])

# Tweak the two parameters below
number_topics = 10
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(X)

# Helper function (sourced from somewhere online)
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic %d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Print the topics found by the LDA model
print_topics(lda, count_vectorizer, number_words)


Topic 0:
virus chin mund salud pais cas egipt limon pas merc

Topic 1:
virus dias pulmon tom vias agu chin evit pais sol

Topic 2:
alert hij inform nin pas compart ser pais segur escuel

Topic 3:
limon tom agu pued calient cuerp celul cuid alcalin sustanci

Topic 4:
agu tom inclu ibuprofen sintom sal favor salv ajo virus

Topic 5:
contact virus pasal urgent celular mensaj llam dil vide murcielag

Topic 6:
chin accion telon mund coronavirus mundial virus empres compr tod

Topic 7:
virus pued calient sol agu coronavirus man hor beb hac

Topic 8:
40 dios dias person jesus mand mensaj pued despu famili

Topic 9:
chin virus caf wuh mund coron quimic pacient km beijing


# User dynamics

In [23]:
df_x[df_x['fake_news']]['tel'].value_counts().hist(bins = 20)
plt.xlabel("Number of Fake News Shared")
plt.ylabel("# of Users")
plt.savefig('images/ch-misinformation/hist_fakenews_user_frequency.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(df_x[df_x['fake_news']]['tel'].unique().shape)

(309,)


In [24]:
print(np.mean(df_x[df_x['fake_news']]['tel'].value_counts() == 1))
print(np.mean(df_x[df_x['fake_news']]['tel'].value_counts() == 2))

0.7443365695792881
0.1488673139158576


In [25]:
tel_fake_news = df_p[['tel', 'fake_news']].groupby('tel').mean()
tel_fake_news_n = df_p[['tel', 'fake_news']].groupby('tel').sum()

In [26]:
print(np.mean(tel_fake_news == 0))
print(tel_fake_news.shape)

fake_news    0.933477
dtype: float64
(4645, 1)


In [27]:
tel_fake_news[tel_fake_news > 0].hist(bins = 20)
plt.xlabel("Prevalence of Fake News")
plt.ylabel("# of Users")
plt.title(None)
plt.savefig('images/ch-misinformation/hist_fakenews_user_prevalence.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [28]:
tel_fake_news_n[tel_fake_news_n['fake_news'] >= 5]

Unnamed: 0_level_0,fake_news
tel,Unnamed: 1_level_1
+51 949 409 535,5.0
+51 993 052 933,5.0
+57 300 5290738,6.0
+57 302 2355873,11.0
+57 318 6079933,5.0
+58 412-6397289,5.0
+58 414-6959685,7.0
+58 416-1952254,9.0
+58 424-9163789,8.0
+58 426-2841379,7.0


## Comparing by country

### User shares fake news

In [29]:
x = ['PER', 'CHL', 'COL', 'VEN', 'ECU']

In [39]:
pe = tel_fake_news[tel_fake_news.index.str.startswith('+51')] > 0
cl = tel_fake_news[tel_fake_news.index.str.startswith('+56')] > 0
co = tel_fake_news[tel_fake_news.index.str.startswith('+57')] > 0
vz = tel_fake_news[tel_fake_news.index.str.startswith('+58')] > 0
ec = tel_fake_news[tel_fake_news.index.str.startswith('+593')] > 0

y = [100 * np.mean(country)[0] for country in [pe, cl, co, vz, ec]]

plt.figure()
plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("% of Users Who've\nShared Fake News")
plt.savefig('images/ch-misinformation/bar_fakenews_country_whoshared.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=array([7.73945238]), pvalue=array([3.28530764e-06]))

VZ t-tests
Ttest_indResult(statistic=array([2.82214302]), pvalue=array([0.00482543]))
Ttest_indResult(statistic=array([2.00589309]), pvalue=array([0.04507311]))
Ttest_indResult(statistic=array([5.34657351]), pvalue=array([9.57544473e-08]))
Ttest_indResult(statistic=array([0.8631068]), pvalue=array([0.38823578]))

CO t-tests
Ttest_indResult(statistic=array([-0.73764127]), pvalue=array([0.46079739]))
Ttest_indResult(statistic=array([0.15758127]), pvalue=array([0.87480083]))
Ttest_indResult(statistic=array([-5.34657351]), pvalue=array([9.57544473e-08]))
Ttest_indResult(statistic=array([-1.42379329]), pvalue=array([0.15464376]))


In [40]:
print(np.mean(vz))
print(np.mean(co))

fake_news    0.101796
dtype: float64
fake_news    0.052334
dtype: float64


### Prop of user content is fake news

In [41]:
pe = tel_fake_news[tel_fake_news.index.str.startswith('+51')]
cl = tel_fake_news[tel_fake_news.index.str.startswith('+56')]
co = tel_fake_news[tel_fake_news.index.str.startswith('+57')]
vz = tel_fake_news[tel_fake_news.index.str.startswith('+58')]
ec = tel_fake_news[tel_fake_news.index.str.startswith('+593')]

y = [np.mean(country)[0] for country in [pe, cl, co, vz, ec]]

plt.figure()
plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("Avg. Fake News Prevalence")
plt.title("All Users from Country")
plt.savefig('images/ch-misinformation/bar_fakenews_country_prevalence.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=array([5.31009887]), pvalue=array([0.00029021]))

VZ t-tests
Ttest_indResult(statistic=array([1.41746528]), pvalue=array([0.15652869]))
Ttest_indResult(statistic=array([0.5828995]), pvalue=array([0.56006135]))
Ttest_indResult(statistic=array([4.71825229]), pvalue=array([2.47793248e-06]))
Ttest_indResult(statistic=array([0.90261145]), pvalue=array([0.36689707]))

CO t-tests
Ttest_indResult(statistic=array([-1.87477248]), pvalue=array([0.06093328]))
Ttest_indResult(statistic=array([-1.44180965]), pvalue=array([0.14949457]))
Ttest_indResult(statistic=array([-4.71825229]), pvalue=array([2.47793248e-06]))
Ttest_indResult(statistic=array([-1.10727577]), pvalue=array([0.26829203]))


In [42]:
print(np.mean(vz))
print(np.mean(co))

fake_news    0.032561
dtype: float64
fake_news    0.013752
dtype: float64


### Prop user content is fake news, ONLY FOR FAKE NEWS sharers

In [43]:
tel_fake_news_pos = tel_fake_news[tel_fake_news['fake_news'] > 0]

In [44]:
pe = tel_fake_news_pos[tel_fake_news_pos.index.str.startswith('+51')]['fake_news'].values
cl = tel_fake_news_pos[tel_fake_news_pos.index.str.startswith('+56')]['fake_news'].values
co = tel_fake_news_pos[tel_fake_news_pos.index.str.startswith('+57')]['fake_news'].values
vz = tel_fake_news_pos[tel_fake_news_pos.index.str.startswith('+58')]['fake_news'].values
ec = tel_fake_news_pos[tel_fake_news_pos.index.str.startswith('+593')]['fake_news'].values

fig, ax = plt.subplots()
ax.violinplot([pe, cl, co, vz, ec], showmeans = True)
ax.set_xticklabels(['','PER', 'CHL', 'COL', 'VEN', 'ECU'])
plt.xlabel("Country")
plt.ylabel("Fake News Prevalence")
plt.title("Users Who've Shared Fake News")
plt.savefig('images/ch-misinformation/violin_fakenews_country_prevalence_sharers.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=1.7051998289452537, pvalue=0.14898933598209077)

VZ t-tests
Ttest_indResult(statistic=-0.7841262952679269, pvalue=0.43420250389631265)
Ttest_indResult(statistic=-1.4745370001139, pvalue=0.14287146642057622)
Ttest_indResult(statistic=1.4193907506446761, pvalue=0.15715084067384924)
Ttest_indResult(statistic=0.4652672892833055, pvalue=0.6425245334935901)

CO t-tests
Ttest_indResult(statistic=-1.7953800640299724, pvalue=0.07471984527298495)
Ttest_indResult(statistic=-2.175503521849528, pvalue=0.03162097284105715)
Ttest_indResult(statistic=-1.4193907506446761, pvalue=0.15715084067384924)
Ttest_indResult(statistic=-0.1492532755784232, pvalue=0.8816022616296135)


### Number of fake news

In [45]:
print(np.mean(tel_fake_news_n[tel_fake_news_n.index.str.startswith('+57')]))
print(np.mean(tel_fake_news_n[tel_fake_news_n.index.str.startswith('+58')]))
ttest_ind(tel_fake_news_n[tel_fake_news_n.index.str.startswith('+57')],
          tel_fake_news_n[tel_fake_news_n.index.str.startswith('+58')])

fake_news    0.077793
dtype: float64
fake_news    0.166809
dtype: float64


Ttest_indResult(statistic=array([-4.56394396]), pvalue=array([5.20392321e-06]))

In [46]:
pe = tel_fake_news_n[tel_fake_news_n.index.str.startswith('+51')]
cl = tel_fake_news_n[tel_fake_news_n.index.str.startswith('+56')]
co = tel_fake_news_n[tel_fake_news_n.index.str.startswith('+57')]
vz = tel_fake_news_n[tel_fake_news_n.index.str.startswith('+58')]
ec = tel_fake_news_n[tel_fake_news_n.index.str.startswith('+593')]

y = [np.mean(country)[0] for country in [pe, cl, co, vz, ec]]

plt.figure()
plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("Avg. # of Fake News")
plt.title("All Users from Country")
plt.savefig('images/ch-misinformation/bar_fakenews_country_frequency.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=array([5.96462604]), pvalue=array([8.77767881e-05]))

VZ t-tests
Ttest_indResult(statistic=array([2.13577096]), pvalue=array([0.03283944]))
Ttest_indResult(statistic=array([1.94620977]), pvalue=array([0.05184328]))
Ttest_indResult(statistic=array([4.56394396]), pvalue=array([5.20392321e-06]))
Ttest_indResult(statistic=array([0.60427657]), pvalue=array([0.54576378]))

CO t-tests
Ttest_indResult(statistic=array([-0.96709862]), pvalue=array([0.33358245]))
Ttest_indResult(statistic=array([0.56909404]), pvalue=array([0.56934885]))
Ttest_indResult(statistic=array([-4.56394396]), pvalue=array([5.20392321e-06]))
Ttest_indResult(statistic=array([-1.43608499]), pvalue=array([0.15111595]))


### Number of fake news, only for fake news sharers

In [47]:
tel_fake_news_pos_n = tel_fake_news_n[tel_fake_news_n['fake_news'] > 0]

In [48]:
pe = tel_fake_news_pos_n[tel_fake_news_pos_n.index.str.startswith('+51')]['fake_news'].values
cl = tel_fake_news_pos_n[tel_fake_news_pos_n.index.str.startswith('+56')]['fake_news'].values
co = tel_fake_news_pos_n[tel_fake_news_pos_n.index.str.startswith('+57')]['fake_news'].values
vz = tel_fake_news_pos_n[tel_fake_news_pos_n.index.str.startswith('+58')]['fake_news'].values
ec = tel_fake_news_pos_n[tel_fake_news_pos_n.index.str.startswith('+593')]['fake_news'].values

fig, ax = plt.subplots()
ax.violinplot([pe, cl, co, vz, ec], showmeans = True)
ax.set_xticklabels(['','PER', 'CHL', 'COL', 'VEN', 'ECU'])
plt.xlabel("Country")
plt.ylabel("# of Fake News")
plt.title("Users Who've Shared Fake News")
plt.savefig('images/ch-misinformation/violin_fakenews_country_frequency_sharers.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=0.4134323649610367, pvalue=0.7989197527541476)

VZ t-tests
Ttest_indResult(statistic=0.00853998495162298, pvalue=0.9931975080891522)
Ttest_indResult(statistic=0.9195460610923518, pvalue=0.3595953728339022)
Ttest_indResult(statistic=0.8464394806084964, pvalue=0.3981951398286613)
Ttest_indResult(statistic=-0.06615212843892694, pvalue=0.9473591219462485)

CO t-tests
Ttest_indResult(statistic=-0.5973845156394809, pvalue=0.5512020987118321)
Ttest_indResult(statistic=0.6952615306165695, pvalue=0.4882818472963889)
Ttest_indResult(statistic=-0.8464394806084964, pvalue=0.3981951398286613)
Ttest_indResult(statistic=-0.46073828873098926, pvalue=0.6458133998194324)


# Group dynamics

In [49]:
df_groups['fakeNews'] = df_p.groupby('uid')['fake_news'].mean()
df_groups['fakeNews'] = df_groups['fakeNews'].fillna(0)

In [50]:
df_groups['fakeNews_users'] = (df_p[['tel', 'uid', 'fake_news']].groupby(['tel', 'uid']).sum() > 1).groupby('uid').mean()
df_groups['fakeNews_users'] = df_groups['fakeNews_users'].fillna(0)

In [51]:
df_groups.to_csv('Data/df_groups_nb4b-fakeNews.csv')

In [52]:
print(np.mean(df_groups['fakeNews'] == 0))
print(np.sum(df_groups['fakeNews'] == 0))
print(np.sum(df_groups['fakeNews'] <= 0.1))

0.6436781609195402
112
166


In [53]:
df_groups[df_groups['fakeNews'] > 0]['fakeNews'].hist(bins = 20)
plt.xlabel("Message Prevalence")
plt.ylabel("# of Groups")
plt.title("Groups with Fake News")
plt.savefig('images/ch-misinformation/hist_fakenews_group_prevalence.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [54]:
plt.scatter(df_groups[df_groups['fakeNews'] > 0]['fakeNews'], df_groups[df_groups['fakeNews'] > 0]['fakeNews_users'], alpha = 0.2)
plt.xlabel("Message Prevalence")
plt.ylabel("User Prevalence")
plt.title("Groups with Fake News")
plt.savefig('images/ch-misinformation/scatter_fakenews_group.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [55]:
print(scipy.stats.pearsonr(df_groups['fakeNews'], df_groups['fakeNews_users']))
print(scipy.stats.pearsonr(df_groups[df_groups['fakeNews'] > 0]['fakeNews'], df_groups[df_groups['fakeNews'] > 0]['fakeNews_users']))
# Fake news less likely to go viral?!?!

(0.6787991515260561, 7.614051648119618e-25)
(0.6441870666560023, 1.6137914418074645e-08)


In [56]:
df_groups[df_groups['fakeNews'] > 0.4]
# Small groups, one 2 messages, one 24 messages

Unnamed: 0_level_0,+52,+55,+57,+58,+34,+1,+263,+27,+381,+505,...,p3rdCountry,entropy,degree,activity,hhConc,gini,replies_n,virality,fakeNews,fakeNews_users
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
573123674647-1578400081,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.5,0.0
584167881770-1573754266,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,1.011404,0.0,0.8,0.253472,0.347222,0.0,0.0,0.625,0.666667


In [57]:
df_p[df_p['fake_news']].groupby('uid')['fake_news'].sum().hist()
plt.xlabel("# Fake News Shared")
plt.ylabel("# of Groups")
plt.title("Groups with Fake News")
plt.savefig('images/ch-misinformation/hist_fakenews_group_frequency.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

### Correlates

In [58]:
cols = ['Size', 'pVZ', 'pCO',
       'pUS', 'pPE', 'pCL', 'pEC', 'p3rdCountry', 'entropy',
       'activity', 'degree', 'hhConc',
       'gini', 'virality']

In [59]:
for col in cols:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[col], df_groups['fakeNews']))
    print(scipy.stats.pearsonr(df_groups[col], df_groups['fakeNews_users']))

(-0.04992951328889845, 0.5129336071415392)
(0.00730552665489265, 0.923780124413848)
(0.09132253433290873, 0.23074176166532093)
(0.20518134425467405, 0.006608027641445932)
(-0.023680592463684938, 0.7564389124887922)
(-0.13240184360174437, 0.08158063304611017)
(-0.035606885836921134, 0.6408936792779503)
(-0.007655404627076388, 0.9201416146176552)
(0.03472991820590573, 0.6491420725033092)
(0.06713697100256533, 0.3787466857674571)
(-0.05371775723547723, 0.4814408216803178)
(-0.04321794161231393, 0.5712310248964376)
(0.0061657419926271755, 0.9356437398767903)
(0.025244215025865456, 0.7409114209900581)
(-0.04531863629068106, 0.5526523732828665)
(-0.011485733494061592, 0.8804331355393074)
(0.032591424036658594, 0.6694338348870772)
(0.08046408571294406, 0.2912166718957516)
(-0.0695195294772921, 0.3620218341405534)
(-0.027941828234352352, 0.7143703859154057)
(-0.08035211203817734, 0.29189079828954334)
(-0.021957431761547668, 0.7736640840555922)
(0.009421793041127634, 0.901799124843371)
(-0.0811

In [60]:
reg = smf.ols('fakeNews_users ~ pVZ',
              data = df_groups).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:         fakeNews_users   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     7.559
Date:                Wed, 15 Apr 2020   Prob (F-statistic):            0.00661
Time:                        10:49:58   Log-Likelihood:                 258.48
No. Observations:                 174   AIC:                            -513.0
Df Residuals:                     172   BIC:                            -506.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0034      0.005      0.657      0.5

#### Of groups that share ANY fake news

In [61]:
for col in cols:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[df_groups['fakeNews'] > 0][col], df_groups[df_groups['fakeNews'] > 0]['fakeNews']))
    print(scipy.stats.pearsonr(df_groups[df_groups['fakeNews'] > 0][col], df_groups[df_groups['fakeNews'] > 0]['fakeNews_users']))
    

(-0.3486823695031871, 0.005479595370356749)
(-0.1933604031931211, 0.13212184920483167)
(0.010171695939757988, 0.9374589192566938)
(0.21340710814446245, 0.09582934059837665)
(0.07599419486539552, 0.5571693146750872)
(-0.15175102529251508, 0.23903384082949636)
(-0.04242841287678181, 0.7433424861091564)
(0.05621522147897937, 0.6643070266544209)
(-0.003392525944652537, 0.9791223759957611)
(0.06588765731456397, 0.6108957321141079)
(-0.10751060459189044, 0.4055636430059557)
(-0.08407839611456094, 0.5158777356328844)
(0.0009662557647893838, 0.9940530486123315)
(0.053887033067781295, 0.6774287928332988)
(-0.09826752771259642, 0.4473416672220871)
(-0.029759129371364112, 0.8183981655990866)
(-0.07915858403707808, 0.5408213019170923)
(0.034948856403175875, 0.7874144526072789)
(-0.20766333470041776, 0.10532350745933453)
(-0.11140481731666199, 0.3886664553892663)
(-0.23205282125998192, 0.06954517161817202)
(-0.1226078920713557, 0.34243930470518724)
(0.4710359852320506, 0.00011158690428809644)
(0.08

In [62]:
reg = smf.ols('fakeNews ~ Size + activity + hhConc + gini + virality',
              data = df_groups[df_groups['fakeNews'] > 0]).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               fakeNews   R-squared:                       0.506
Model:                            OLS   Adj. R-squared:                  0.462
Method:                 Least Squares   F-statistic:                     11.49
Date:                Wed, 15 Apr 2020   Prob (F-statistic):           1.17e-07
Time:                        10:52:29   Log-Likelihood:                 71.141
No. Observations:                  62   AIC:                            -130.3
Df Residuals:                      56   BIC:                            -117.5
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.2825      0.051      5.496      0.0

In [63]:
reg = smf.ols('fakeNews ~ Size + activity + hhConc + gini + virality',
              data = df_groups[(df_groups['fakeNews'] > 0) & (df_groups['fakeNews'] < 0.3)]).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               fakeNews   R-squared:                       0.337
Model:                            OLS   Adj. R-squared:                  0.275
Method:                 Least Squares   F-statistic:                     5.487
Date:                Wed, 15 Apr 2020   Prob (F-statistic):           0.000373
Time:                        10:52:34   Log-Likelihood:                 98.298
No. Observations:                  60   AIC:                            -184.6
Df Residuals:                      54   BIC:                            -172.0
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1653      0.034      4.804      0.0

# Grouping fake news together

In [64]:
fake_news_corpus = df_x[df_x['fake_news']]['textlower'].unique()
fake_news_tokenize = [tokenize(c) for c in fake_news_corpus]

In [65]:
print(len(fake_news_corpus))

214


In [66]:
X = vectorizer.fit_transform(fake_news_tokenize)

In [67]:
redirect_from_to = {}

In [68]:
for i, j in zip(np.where(cosine_similarity(X) > 0.8)[0], np.where(cosine_similarity(X) > 0.8)[1]):
    if i < j:
        if i in redirect_from_to:
            redirect_from_to[j] = redirect_from_to[i]
        else:
            redirect_from_to[j] = i

print(len(redirect_from_to))

98


In [69]:
print(redirect_from_to)

{7: 0, 14: 0, 28: 0, 40: 0, 44: 0, 35: 2, 42: 2, 47: 2, 36: 10, 37: 11, 13: 12, 15: 12, 18: 12, 19: 12, 27: 12, 34: 0, 187: 20, 25: 24, 33: 31, 48: 31, 73: 32, 185: 38, 59: 43, 90: 43, 91: 43, 49: 45, 88: 45, 120: 45, 51: 50, 52: 50, 188: 55, 170: 57, 62: 60, 64: 60, 69: 61, 84: 63, 85: 63, 89: 63, 95: 63, 100: 63, 87: 66, 96: 66, 99: 66, 118: 66, 68: 67, 82: 67, 97: 67, 189: 70, 83: 71, 74: 72, 75: 72, 109: 92, 110: 103, 111: 103, 116: 103, 105: 104, 107: 104, 114: 104, 117: 104, 113: 106, 166: 106, 202: 108, 182: 121, 131: 122, 163: 122, 134: 124, 146: 124, 160: 124, 168: 124, 186: 124, 205: 124, 130: 126, 154: 126, 133: 127, 153: 127, 143: 135, 145: 135, 148: 135, 138: 136, 141: 136, 158: 136, 196: 136, 161: 137, 156: 147, 167: 151, 197: 151, 173: 126, 165: 159, 179: 171, 208: 177, 183: 181, 192: 191, 193: 191, 199: 191, 206: 191, 201: 198, 203: 198, 211: 207}


In [70]:
print(df_p[df_p['textlower'] == fake_news_corpus[116]]['text'].iloc[0])
print()
print(df_p[df_p['textlower'] == fake_news_corpus[103]]['text'].iloc[0])

SeñoreS, les  recuerdo que deben evitar tomar  Ibuprofeno, Motrin, Advil y aspirina para síntomas de fiebre por el Covid - 19 En Italia y Francia han descubierto que personas que han fallecido es porque han tomado ibuprofeno y ocasiona que el virus se potencie 5 o más veces. Favor evitar la automedicacion,   divulgar esto y a cuidarnos. 

Buenos días familia como amanecieron, les  recuerdo que deben evitar tomar  Ibuprofeno, Motrin, Advil y aspirina para síntomas de fiebre por el Covid - 19 En Italia y Francia han descubierto que personas que han fallecido es porque han tomado ibuprofeno y ocasiona que el virus se potencie 5 o más veces. Favor evitar la automedicacion,   divulgar esto y a cuidarnos. 


In [71]:
print(df_p[df_p['textlower'] == fake_news_corpus[97]]['text'].iloc[0])
print()
print(df_p[df_p['textlower'] == fake_news_corpus[67]]['text'].iloc[0])


*Consejo del Dr. Yuri Ortega Sotelo +51987453411
El coronavirus es de gran tamaño con un diámetro celular de 400-500 micras, por lo que cualquier máscara impide su entrada, por lo que no es necesario explotar a los farmacéuticos para comerciar con bozales.
El virus no se instala en el aire, sino en el suelo, por lo que no se transmite por el aire.
El virus, cuando cae sobre una superficie de metal, vivirá durante 12 horas, por lo que lavarse bien las manos con agua y jabón será suficiente.
El virus cuando cae sobre las telas permanece durante 9 horas, por lo que lavar la ropa o exponerla al sol durante dos horas es suficiente para matarlo.
El virus vive en las manos durante 10 minutos, por lo que llevar un desinfectante con alcohol en el bolsillo y aplicar es suficiente para prevenirlo.
Si el virus se expone a una temperatura de 26-27 ° C, se matará, no vive en áreas calientes. También es suficiente beber agua caliente y exponerse al sol. Mantenerse alejado del helado y la comida fría 

In [72]:
groups_where_shared = {}
users_who_shared = {}

In [73]:
for i in range(len(fake_news_corpus)):
    groups_where_shared[i] = df_x.loc[df_x['textlower'] == fake_news_corpus[i], 'uid'].tolist()
    users_who_shared[i] = df_x.loc[df_x['textlower'] == fake_news_corpus[i], 'tel'].tolist()

In [74]:
for from_i in redirect_from_to.keys():
    to_i = redirect_from_to[from_i]
    groups_where_shared[to_i] = groups_where_shared[to_i] + groups_where_shared[from_i]
    users_who_shared[to_i] = users_who_shared[to_i] + users_who_shared[from_i]
    
    del groups_where_shared[from_i]
    del users_who_shared[from_i]

In [75]:
list_mergedFakeNews = \
    [[fake_news_corpus[i], groups_where_shared[i], users_who_shared[i]] for i in groups_where_shared.keys()]
df_mergedFakeNews = pd.DataFrame(list_mergedFakeNews, columns = ['textlower', 'groups', 'users'])

In [76]:
df_mergedFakeNews['n_shares'] = df_mergedFakeNews['groups'].apply(len)
df_mergedFakeNews['n_groups'] = df_mergedFakeNews['groups'].apply(lambda x: len(set(x)))
df_mergedFakeNews['n_users'] = df_mergedFakeNews['users'].apply(lambda x: len(set(x)))
df_mergedFakeNews['shares/groups'] = df_mergedFakeNews['n_shares'] / df_mergedFakeNews['n_groups']
df_mergedFakeNews['shares/users'] = df_mergedFakeNews['n_shares'] / df_mergedFakeNews['n_users']

In [77]:
df_mergedFakeNews['n_shares'].hist()
plt.xlabel("Number of Shares")
plt.ylabel("# of Fake News")
plt.title("Unique Pieces of Fake News")
plt.savefig('images/ch-misinformation/hist_fakenews_unique_frequency.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [78]:
df_mergedFakeNews['n_users'].hist()
plt.xlabel("# of Unique Users Shared By")
plt.ylabel("# of Fake News")
plt.title("Unique Pieces of Fake News")
plt.savefig('images/ch-misinformation/hist_fakenews_unique_user.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [79]:
df_mergedFakeNews['n_groups'].hist()
plt.xlabel("# of Unique Groups Shared In")
plt.ylabel("# of Fake News")
plt.title("Unique Pieces of Fake News")
plt.savefig('images/ch-misinformation/hist_fakenews_unique_group.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [80]:
print(np.mean(df_mergedFakeNews['shares/groups']))
print(np.mean(df_mergedFakeNews['shares/users']))

1.2839610700602078
1.063767697819422


In [81]:
Counter(redirect_from_to.values())

Counter({0: 6,
         2: 3,
         10: 1,
         11: 1,
         12: 5,
         20: 1,
         24: 1,
         31: 2,
         32: 1,
         38: 1,
         43: 3,
         45: 3,
         50: 2,
         55: 1,
         57: 1,
         60: 2,
         61: 1,
         63: 5,
         66: 4,
         67: 3,
         70: 1,
         71: 1,
         72: 2,
         92: 1,
         103: 3,
         104: 4,
         106: 2,
         108: 1,
         121: 1,
         122: 2,
         124: 6,
         126: 3,
         127: 2,
         135: 3,
         136: 4,
         137: 1,
         147: 1,
         151: 2,
         159: 1,
         171: 1,
         177: 1,
         181: 1,
         191: 4,
         198: 2,
         207: 1})

In [82]:
print(fake_news_corpus[0])
print()
print(fake_news_corpus[124])

hablando con una otorrinolaringóloga y comentando del coronavirus,ella dijo que definitivamente el virus va a llegar a todo el mundo y que viene lo peor a nivel mundial debido a que no hay vacuna y el virus resiste a los antivirales actuales. su recomendación es que empecemos niños y adultos a tomar 1 tableta diaria de vitamina c y omega (de gnc) y comer frutas y verduras para fortalecer el sistema inmunológico. la cantidad de casos y muertes que se reportan oficialmente no son las reales. otra recomendación importante es lavarse las manos muy seguido, tomar agua, evitar el saludo de mano o de beso desde ya, incluso no dar la paz con la mano en misa... y conforme  esto avance  "evitar lugares publicos y cubrirse la boca" por último, comentó que diariamente están  llegando personas de china a todos los paises y no hay cerco sanitario, esto al menos hasta que ya se declaró como emergencia mundial... esperemos que los gobiernos y sistemas de salud realmente se pongan las pilas por el bien