In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
import statsmodels.formula.api as smf
import scipy.stats
from scipy.stats import ttest_ind
from scipy.stats import f_oneway

In [4]:
df_x = pd.read_csv("Data/df_x_nb4a-mis.csv", index_col = 0)
print(df_x.shape)

(171634, 34)


In [5]:
df_x['text'] = df_x['text'].fillna('')
df_x['textlower'] = df_x['textlower'].fillna('')
df_x['virality'] = df_x['virality'].fillna(0)

### Text similarity imports/functions

In [6]:
import string, re, unidecode
from sklearn.decomposition import LatentDirichletAllocation as LDA

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
stemmer = SnowballStemmer('spanish')
remove_punc = str.maketrans(string.punctuation, len(string.punctuation) * " ")
stopwords_ascii = [unidecode.unidecode(w) for w in stopwords.words('spanish')]

def tokenize(s):
    s = unidecode.unidecode(s)
    s = s.translate(remove_punc)
    tokens = nltk.word_tokenize(s)
    filtered = [w for w in tokens if w not in stopwords_ascii]
    return [stemmer.stem(w) for w in filtered]

In [8]:
def dummy(x):
    return x

vectorizer = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy,
                            token_pattern=None)

count_vectorizer = CountVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy,
                            token_pattern=None)

In [9]:
df_groups = pd.read_csv('Data/df_groups_nb4b-fakeNews.csv', index_col = 0)
print(df_groups.head())

                                  +52  +55   +57  +58  +34   +1  +263  +27  \
uid                                                                          
0526efbcbfd2a4c352206eef2a4dd6da  1.0  1.0   3.0  3.0  0.0  0.0   0.0  0.0   
07d5068cc56fa32bb22935edee3cf10a  0.0  0.0   8.0  3.0  0.0  0.0   0.0  0.0   
1eea1e85e89157f35943c0c1e8de0535  0.0  0.0  10.0  0.0  0.0  0.0   0.0  0.0   
2bcb099b4f7c3ddf444f15d69fce0ed8  0.0  0.0   3.0  0.0  0.0  0.0   0.0  0.0   
35c0a8c5a334567e7087db5c6c8d38c5  0.0  0.0  18.0  0.0  1.0  0.0   0.0  0.0   

                                  +381  +505  ...  p3rdCountry   entropy  \
uid                                           ...                          
0526efbcbfd2a4c352206eef2a4dd6da   0.0   0.0  ...     0.250000  1.255482   
07d5068cc56fa32bb22935edee3cf10a   0.0   0.0  ...     0.000000  0.585953   
1eea1e85e89157f35943c0c1e8de0535   0.0   0.0  ...     0.000000  0.000000   
2bcb099b4f7c3ddf444f15d69fce0ed8   0.0   0.0  ...     0.000000  0.000000 

# Message dynamics

In [10]:
df_x['posMisinfo'] = (df_x['textlower'].apply(lambda x: len(tokenize(x)) >= 5))
df_p = df_x[df_x['posMisinfo']]

In [11]:
print(df_p[df_p['scam']].shape)
print(df_p.shape)

(886, 35)
(44025, 35)


### Replies

In [12]:
reg = smf.ols('replies_n ~ scam', data = df_p).fit()
print(reg.summary())
reg = smf.ols('replies_n ~ scam', data = df_x).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:              replies_n   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     59.45
Date:                Wed, 15 Apr 2020   Prob (F-statistic):           1.28e-14
Time:                        11:17:32   Log-Likelihood:                -86122.
No. Observations:               44025   AIC:                         1.722e+05
Df Residuals:                   44023   BIC:                         1.723e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.5697      0.008     69.146   

In [13]:
print(np.quantile(df_p[df_p['scam']]['replies_n'], .95))
print(np.quantile(df_p[~df_p['scam']]['replies_n'], .95))

1.0
3.0


### Virality

In [14]:
reg = smf.ols('virality ~ scam', data = df_p[df_p['virality'] > 0]).fit()
print(reg.summary())
reg = smf.ols('virality ~ scam', data = df_x[df_x['virality'] > 0]).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               virality   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     25.81
Date:                Wed, 15 Apr 2020   Prob (F-statistic):           3.81e-07
Time:                        11:18:03   Log-Likelihood:                -22059.
No. Observations:               14227   AIC:                         4.412e+04
Df Residuals:                   14225   BIC:                         4.414e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        1.3590      0.010    141.744   

In [15]:
print(np.quantile(df_p[df_p['scam'] & (df_p['virality'] > 0)]['virality'], .95))
print(np.quantile(df_p[~df_p['scam'] & (df_p['virality'] > 0)]['virality'], .95))

1.28
3.5546875


### Size

In [16]:
df_p['charlength'] = df_p['text'].apply(len)
df_p['wordlength'] = df_p['text'].apply(lambda x: x.count(" "))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [17]:
print(scipy.stats.pearsonr(df_p['scam'], df_p['charlength']))
print(scipy.stats.pearsonr(df_p['scam'], df_p['wordlength']))
# Fake news less likely to go viral?!?!

(-0.007029465870105722, 0.1402371773640947)
(-0.013018010780668613, 0.0063047734688692954)


In [18]:
reg = smf.ols('charlength ~ scam', data = df_p).fit()
print(reg.summary())
reg = smf.ols('wordlength ~ scam', data = df_p).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:             charlength   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.175
Date:                Wed, 15 Apr 2020   Prob (F-statistic):              0.140
Time:                        11:19:29   Log-Likelihood:            -3.4904e+05
No. Observations:               44025   AIC:                         6.981e+05
Df Residuals:                   44023   BIC:                         6.981e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      330.2227      3.232    102.159   

### LDA

In [19]:
df_t = df_p[df_p['scam']]
df_t.loc[:,'token'] = df_t.loc[:,'textlower'].apply(tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [20]:
X = count_vectorizer.fit_transform(df_t['token'])

# Tweak the two parameters below
number_topics = 10
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(X)

# Helper function (sourced from somewhere online)
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic %d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Print the topics found by the LDA model
print_topics(lda, count_vectorizer, number_words)


Topic 0:
bon pais prestam http com cupon diner hol exit 000

Topic 1:
internet gb 100 dat gratis obteng ahor https consiguel cualqui

Topic 2:
grup vide bienven prestam https siguient pas va voy javi

Topic 3:
ayud resib us 77 alimentari onu earn 00 invest clic

Topic 4:
prest 000 prestam personal 3 eur tas plaz whatsapp interes

Topic 5:
https whatsapp and oscur com ly bit of to activ

Topic 6:
https 000 sisb netflix period aislamient cupon com entra rap

Topic 7:
netflix period aislamient https pandemi dand gratis deb coronavirus mund

Topic 8:
https com chat diplom whatsapp ayud c z l grup

Topic 9:
tarjet alimentari madr cp https nuev solicitud bon to crypto


# User dynamics

In [21]:
df_x[df_x['scam']]['tel'].value_counts().hist(bins = 40)
plt.xlabel("Number of Scams Shared")
plt.ylabel("# of Users")
plt.savefig('images/ch-misinformation/hist_scam_user_frequency.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(df_x[df_x['scam']]['tel'].unique().shape)

(473,)


In [22]:
df_x[df_x['scam']]['tel'].value_counts().hist(range = (10, 60))
plt.xlabel("Number of Scams Shared")
plt.ylabel("# of Users")
plt.savefig('images/ch-misinformation/hist_scam_user_frequency_extreme.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [23]:
print(np.mean(df_x[df_x['scam']]['tel'].value_counts() == 1))
print(np.mean(df_x[df_x['scam']]['tel'].value_counts() == 2))

0.7082452431289641
0.160676532769556


In [24]:
tel_scam = df_p[['tel', 'scam']].groupby('tel').mean()
tel_scam_n = df_p[['tel', 'scam']].groupby('tel').sum()

In [25]:
print(np.mean(tel_scam == 0))
print(tel_scam.shape)

scam    0.89817
dtype: float64
(4645, 1)


In [26]:
tel_scam[tel_scam > 0].hist(bins = 20)
plt.xlabel("Prevalence of Scams")
plt.ylabel("# of Users")
plt.title(None)
plt.savefig('images/ch-misinformation/hist_scam_user_prevalence.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [27]:
(tel_scam[tel_scam > 0] == 1).sum()

scam    251
dtype: int64

In [28]:
df_user = df_x[['tel', 'fake_news']].groupby('tel').mean()
df_user['scam'] = df_x[['tel', 'scam']].groupby('tel').mean()

In [29]:
print(scipy.stats.pearsonr(df_user['fake_news'], df_user['scam']))

(-0.01469654004754208, 0.19263997673949335)


In [30]:
df_user['1_fake_news'] = df_user['fake_news'] > 0
df_user['1_scam'] = df_user['scam'] > 0

In [31]:
print(scipy.stats.pearsonr(df_user['1_fake_news'], df_user['1_scam']))

(0.02864289709856719, 0.011101099376421833)


In [32]:
print(df_user[df_user['1_fake_news']]['1_scam'].mean())
print(df_user[~df_user['1_fake_news']]['1_scam'].mean())
print()
print(df_user[df_user['1_fake_news']]['scam'].mean())
print(df_user[~df_user['1_fake_news']]['scam'].mean())

0.09385113268608414
0.05880015891934843

0.0082438578675575
0.02745949841726481


In [33]:
print(df_user[df_user['1_scam']]['1_fake_news'].mean())
print(df_user[~df_user['1_scam']]['1_fake_news'].mean())
print()
print(df_user[df_user['1_scam']]['fake_news'].mean())
print(df_user[~df_user['1_scam']]['fake_news'].mean())

0.0613107822410148
0.03790442669554623

0.005375598416855285
0.006604165618343614


## Comparing by country

### User shares scams

In [34]:
x = ['PER', 'CHL', 'COL', 'VEN', 'ECU']

In [35]:
pe = tel_scam[tel_scam.index.str.startswith('+51')] > 0
cl = tel_scam[tel_scam.index.str.startswith('+56')] > 0
co = tel_scam[tel_scam.index.str.startswith('+57')] > 0
vz = tel_scam[tel_scam.index.str.startswith('+58')] > 0
ec = tel_scam[tel_scam.index.str.startswith('+593')] > 0

y = [100 * np.mean(country)[0] for country in [pe, cl, co, vz, ec]]

plt.figure()
plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("% of Users Who've\nShared Scams")
plt.savefig('images/ch-misinformation/bar_scam_country_whoshared.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=array([13.50641294]), pvalue=array([6.1171337e-11]))

VZ t-tests
Ttest_indResult(statistic=array([-5.0778172]), pvalue=array([4.23371005e-07]))
Ttest_indResult(statistic=array([1.38307908]), pvalue=array([0.16687648]))
Ttest_indResult(statistic=array([-6.41054738]), pvalue=array([1.65705885e-10]))
Ttest_indResult(statistic=array([-4.05085605]), pvalue=array([5.40084922e-05]))

CO t-tests
Ttest_indResult(statistic=array([0.04585049]), pvalue=array([0.96343285]))
Ttest_indResult(statistic=array([3.41370748]), pvalue=array([0.00065215]))
Ttest_indResult(statistic=array([6.41054738]), pvalue=array([1.65705885e-10]))
Ttest_indResult(statistic=array([-0.51022416]), pvalue=array([0.60994403]))


In [36]:
print(np.mean(co))
print(np.mean(vz))

scam    0.112211
dtype: float64
scam    0.046193
dtype: float64


### Prop of user content is scams

In [44]:
pe = tel_scam[tel_scam.index.str.startswith('+51')]
cl = tel_scam[tel_scam.index.str.startswith('+56')]
co = tel_scam[tel_scam.index.str.startswith('+57')]
vz = tel_scam[tel_scam.index.str.startswith('+58')]
ec = tel_scam[tel_scam.index.str.startswith('+593')]

y = [np.mean(country)[0] for country in [pe, cl, co, vz, ec]]

plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("Avg. Scam Prevalence")
plt.title("All Users from Country")
plt.savefig('images/ch-misinformation/bar_scam_country_prevalence.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=array([10.88603342]), pvalue=array([8.85346177e-09]))

VZ t-tests
Ttest_indResult(statistic=array([-5.3017984]), pvalue=array([1.29548642e-07]))
Ttest_indResult(statistic=array([0.97057808]), pvalue=array([0.33193773]))
Ttest_indResult(statistic=array([-5.6651217]), pvalue=array([1.5957194e-08]))
Ttest_indResult(statistic=array([-3.43448838]), pvalue=array([0.00061211]))

CO t-tests
Ttest_indResult(statistic=array([-0.82526513]), pvalue=array([0.40929492]))
Ttest_indResult(statistic=array([2.8210517]), pvalue=array([0.00482843]))
Ttest_indResult(statistic=array([5.6651217]), pvalue=array([1.5957194e-08]))
Ttest_indResult(statistic=array([-0.31609025]), pvalue=array([0.75196307]))


In [45]:
print(np.mean(co))
print(np.mean(vz))

scam    0.069888
dtype: float64
scam    0.027205
dtype: float64


### Prop user content is scams, ONLY FOR scams sharers

In [46]:
tel_scam_pos = tel_scam[tel_scam['scam'] > 0]

In [47]:
pe = tel_scam_pos[tel_scam_pos.index.str.startswith('+51')]['scam'].values
cl = tel_scam_pos[tel_scam_pos.index.str.startswith('+56')]['scam'].values
co = tel_scam_pos[tel_scam_pos.index.str.startswith('+57')]['scam'].values
vz = tel_scam_pos[tel_scam_pos.index.str.startswith('+58')]['scam'].values
ec = tel_scam_pos[tel_scam_pos.index.str.startswith('+593')]['scam'].values

fig, ax = plt.subplots()
ax.violinplot([pe, cl, co, vz, ec], showmeans = True)
ax.set_xticklabels(['','PER', 'CHL', 'COL', 'VEN', 'ECU'])
plt.xlabel("Country")
plt.ylabel("Scam Prevalence")
plt.title("Users Who've Shared Scams")
plt.savefig('images/ch-misinformation/violin_scam_country_prevalence_sharers.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=0.9219202769377437, pvalue=0.45114069396255396)

VZ t-tests
Ttest_indResult(statistic=-1.7144514004852853, pvalue=0.08918730406362392)
Ttest_indResult(statistic=-0.3953195856830108, pvalue=0.6941361967726638)
Ttest_indResult(statistic=-0.5990774828065281, pvalue=0.5495889105986987)
Ttest_indResult(statistic=-0.14544044484835367, pvalue=0.884775245096313)

CO t-tests
Ttest_indResult(statistic=-1.6714487456441764, pvalue=0.09568631976829253)
Ttest_indResult(statistic=-0.2899390990405123, pvalue=0.7721142180649859)
Ttest_indResult(statistic=0.5990774828065281, pvalue=0.5495889105986987)
Ttest_indResult(statistic=0.20402370370018746, pvalue=0.8384977075702557)


In [48]:
print(np.mean(co))
print(np.mean(vz))

0.6228222937705186
0.5889375409511393


### Number of scams

In [49]:
pe = tel_scam_n[tel_scam_n.index.str.startswith('+51')]
cl = tel_scam_n[tel_scam_n.index.str.startswith('+56')]
co = tel_scam_n[tel_scam_n.index.str.startswith('+57')]
vz = tel_scam_n[tel_scam_n.index.str.startswith('+58')]
ec = tel_scam_n[tel_scam_n.index.str.startswith('+593')]

y = [np.mean(country)[0] for country in [pe, cl, co, vz, ec]]

plt.bar(x, y)
plt.xlabel("Country")
plt.ylabel("Avg. # of Scams")
plt.title("All Users from Country")
plt.savefig('images/ch-misinformation/bar_scam_country_frequency.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=array([5.76455715]), pvalue=array([0.00012665]))

VZ t-tests
Ttest_indResult(statistic=array([-2.71702545]), pvalue=array([0.00665342]))
Ttest_indResult(statistic=array([0.9484317]), pvalue=array([0.34308479]))
Ttest_indResult(statistic=array([-3.95456543]), pvalue=array([7.8290052e-05]))
Ttest_indResult(statistic=array([-1.94273087]), pvalue=array([0.05226142]))

CO t-tests
Ttest_indResult(statistic=array([-0.13786956]), pvalue=array([0.89035392]))
Ttest_indResult(statistic=array([2.86590534]), pvalue=array([0.00419662]))
Ttest_indResult(statistic=array([3.95456543]), pvalue=array([7.8290052e-05]))
Ttest_indResult(statistic=array([-0.50186754]), pvalue=array([0.61580937]))


In [50]:
print(np.mean(co))
print(np.mean(vz))

scam    0.165959
dtype: float64
scam    0.075278
dtype: float64


### Number of fake news, only for fake news sharers

In [51]:
tel_scam_pos_n = tel_scam_n[tel_scam_n['scam'] > 0]

In [52]:
pe = tel_scam_pos_n[tel_scam_pos_n.index.str.startswith('+51')]['scam'].values
cl = tel_scam_pos_n[tel_scam_pos_n.index.str.startswith('+56')]['scam'].values
co = tel_scam_pos_n[tel_scam_pos_n.index.str.startswith('+57')]['scam'].values
vz = tel_scam_pos_n[tel_scam_pos_n.index.str.startswith('+58')]['scam'].values
ec = tel_scam_pos_n[tel_scam_pos_n.index.str.startswith('+593')]['scam'].values

fig, ax = plt.subplots()
ax.violinplot([pe, cl, co, vz, ec], showmeans = True)
ax.set_xticklabels(['','PER', 'CHL', 'COL', 'VEN', 'ECU'])
plt.xlabel("Country")
plt.ylabel("# of Scams")
plt.title("Users Who've Shared Scams")
plt.savefig('images/ch-misinformation/violin_scam_country_frequency_sharers.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
print(f_oneway(pe, cl, co, vz, ec))

print("\nVZ t-tests")
print(ttest_ind(vz, pe))
print(ttest_ind(vz, cl))
print(ttest_ind(vz, co))
print(ttest_ind(vz, ec))

print("\nCO t-tests")
print(ttest_ind(co, pe))
print(ttest_ind(co, cl))
print(ttest_ind(co, vz))
print(ttest_ind(co, ec))

F_onewayResult(statistic=0.1892092480107084, pvalue=0.9439748689318488)

VZ t-tests
Ttest_indResult(statistic=0.26363625931747764, pvalue=0.792540374846579)
Ttest_indResult(statistic=0.3933101227960766, pvalue=0.6956109443582374)
Ttest_indResult(statistic=0.6414238115944625, pvalue=0.5217539811333425)
Ttest_indResult(statistic=0.1519477755405089, pvalue=0.8796590741260211)

CO t-tests
Ttest_indResult(statistic=-0.26827696424565795, pvalue=0.7886723743722057)
Ttest_indResult(statistic=0.7317508739483741, pvalue=0.4650375297368179)
Ttest_indResult(statistic=-0.6414238115944625, pvalue=0.5217539811333425)
Ttest_indResult(statistic=-0.1661203046356553, pvalue=0.8681939113138112)


In [53]:
print(np.mean(co))
print(np.mean(vz))

1.4789915966386555
1.6296296296296295


# Group dynamics

In [54]:
df_groups['scam'] = df_p.groupby('uid')['scam'].mean()
df_groups['scam'] = df_groups['scam'].fillna(0)

In [55]:
df_groups['scam_users'] = (df_p[['tel', 'uid', 'scam']].groupby(['tel', 'uid']).sum() > 1).groupby('uid').mean()
df_groups['scam_users'] = df_groups['scam_users'].fillna(0)

In [56]:
df_groups.to_csv('Data/df_groups_nb4c-scam.csv')

In [57]:
print(scipy.stats.pearsonr(df_groups['fakeNews'], df_groups['scam']))
print(scipy.stats.pearsonr(df_groups['fakeNews_users'], df_groups['scam_users']))

(-0.030218585113783974, 0.6922320333272)
(0.020737396700866596, 0.7859275906051502)


In [58]:
print(np.mean(df_groups['scam'] == 0))
print(np.sum(df_groups['scam'] == 0))
print(np.sum(df_groups['scam'] <= 0.1))

0.4942528735632184
86
140


In [59]:
df_groups[df_groups['scam'] > 0]['scam'].hist(bins = 20)
plt.xlabel("Message Prevalence")
plt.ylabel("# of Groups")
plt.title("Groups with Scams")
plt.savefig('images/ch-misinformation/hist_scam_group_prevalence.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [60]:
plt.scatter(df_groups[df_groups['scam'] > 0]['scam'], df_groups[df_groups['scam'] > 0]['scam_users'], alpha = 0.2)
plt.xlabel("Message Prevalence")
plt.ylabel("User Prevalence")
plt.title("Groups with Scams")
plt.savefig('images/ch-misinformation/scatter_scam_group.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [62]:
df_groups[df_groups['scam'] >= 0.5]
# Vzs en Bogota, few messages; screwing around, few; 
# business group, few; porn group, moderate; screwing around, few;
# internet money-making, MANY; VZ general, small

Unnamed: 0_level_0,+52,+55,+57,+58,+34,+1,+263,+27,+381,+505,...,degree,activity,hhConc,gini,replies_n,virality,fakeNews,fakeNews_users,scam,scam_users
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
573003168984-1574534069,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.555556,0.166667,0.0,0.0,0.0,0.0,0.5,0.0
573006487893-1552883009,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.555556,0.166667,0.0,0.0,0.0,0.0,1.0,0.0
573117575342-1576375028,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.282051,0.38843,0.386364,0.090909,0.5,0.0,0.0,0.5,0.0
573127205208-1577591043,3.0,0.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.789474,0.168889,0.417778,0.0,0.0,0.0,0.0,1.0,0.0
573215307159-1579143586,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.323529,0.322314,0.295455,0.090909,0.5,0.0,0.0,1.0,0.0
584166621910-1576352626,0.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,4.0,2.243243,0.797068,0.7642,0.0,0.0,0.0,0.0,0.547619,0.333333
584243087672-1572730687,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.555556,0.166667,0.0,0.0,0.0,0.0,1.0,0.0


In [63]:
df_p[df_p['scam']].groupby('uid')['scam'].sum().hist()
plt.xlabel("# of Scams Shared")
plt.ylabel("# of Groups")
plt.title("Groups with Scams")
plt.savefig('images/ch-misinformation/hist_scam_group_frequency.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()
# Very active gaming group, 156 scams shared of 1751 messages

### Correlates

In [64]:
cols = ['Size', 'pVZ', 'pCO',
       'pUS', 'pPE', 'pCL', 'pEC', 'p3rdCountry', 'entropy',
       'activity', 'degree', 'hhConc',
       'gini', 'virality', 'fakeNews', 'fakeNews_users']

In [65]:
for col in cols:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[col], df_groups['scam']))
    print(scipy.stats.pearsonr(df_groups[col], df_groups['scam_users']))

(-0.05268165954373854, 0.48994919706406187)
(0.07682828771971195, 0.3136367909222251)
(-0.07981338991875825, 0.295148642435833)
(-0.14891428342986598, 0.04986683115128725)
(-0.03719922017271353, 0.6260292016791941)
(0.12083803038307778, 0.11221854446326605)
(0.06596458207522252, 0.3871467702969128)
(0.22880227041676204, 0.0023914916965051144)
(-0.04518035221401274, 0.5538662750760072)
(-0.05515037522428668, 0.4698087740645235)
(-0.07534146801515515, 0.32312124914460016)
(-0.08580826433121869, 0.2602467321062162)
(0.2573766943932508, 0.0006069147209940115)
(-0.07551957890358245, 0.3219754040912484)
(0.10564509307563628, 0.16532112177154132)
(-0.019889000184346865, 0.7944868190047599)
(0.16032717911785088, 0.03457444052082779)
(0.07167626990809375, 0.3472844039644378)
(-0.1280190673413149, 0.09228803656794991)
(-0.07995760611315267, 0.294274158609873)
(-0.1279100712825057, 0.09256793402606366)
(0.0021128614940323434, 0.9779255879121966)
(-0.04656275088565788, 0.5417896903769188)
(-0.0716

In [66]:
reg = smf.ols('scam ~ entropy + virality',
              data = df_groups).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:                   scam   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     9.883
Date:                Wed, 15 Apr 2020   Prob (F-statistic):           8.68e-05
Time:                        11:36:16   Log-Likelihood:                 58.612
No. Observations:                 174   AIC:                            -111.2
Df Residuals:                     171   BIC:                            -101.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0750      0.022      3.345      0.0

In [67]:
reg = smf.ols('scam_users ~ pVZ + gini',
              data = df_groups).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:             scam_users   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     9.918
Date:                Wed, 15 Apr 2020   Prob (F-statistic):           8.41e-05
Time:                        11:37:15   Log-Likelihood:                 237.03
No. Observations:                 174   AIC:                            -468.1
Df Residuals:                     171   BIC:                            -458.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0012      0.010      0.120      0.9

#### Of groups that share scams

In [68]:
for col in cols:
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[df_groups['scam'] > 0][col], df_groups[df_groups['scam'] > 0]['scam']))
    print(scipy.stats.pearsonr(df_groups[df_groups['scam'] > 0][col], df_groups[df_groups['scam'] > 0]['scam_users']))

(-0.3488448986200673, 0.0008654818033532204)
(-0.151230179663827, 0.1595804299944502)
(-0.176922191599797, 0.09915115201254245)
(-0.28376022724439287, 0.0073808231808304234)
(-0.029355843730968854, 0.7860040030608096)
(0.22610296681832398, 0.03415640978028331)
(0.10649776739826389, 0.32336565944058177)
(0.4019620245756146, 0.0001036476207749114)
(-0.06801040192148702, 0.5289553261632056)
(-0.08577120543885813, 0.4268656932993031)
(-0.1080803717037857, 0.3161857768953207)
(-0.13109085130650963, 0.2234445933805841)
(0.41812219554123103, 5.043730351047499e-05)
(-0.16178509293333848, 0.1320881405273336)
(0.18620287351996992, 0.08238927653083149)
(-0.03808647604856738, 0.7246116689040462)
(0.052773360392187225, 0.6253231698662911)
(-0.06803603238181134, 0.5287996986789558)
(-0.22491102892993958, 0.03513857942479447)
(-0.15362770518247224, 0.15299035857843743)
(-0.25605164070726516, 0.016043628054341914)
(-0.07906670656105795, 0.4640136968948673)
(0.4584726831824645, 7.054988696686719e-06)
(

In [69]:
reg = smf.ols('scam ~ Size + activity + degree + hhConc + gini + virality',
              data = df_groups[df_groups['scam'] > 0]).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:                   scam   R-squared:                       0.489
Model:                            OLS   Adj. R-squared:                  0.451
Method:                 Least Squares   F-statistic:                     12.92
Date:                Wed, 15 Apr 2020   Prob (F-statistic):           3.46e-10
Time:                        11:41:10   Log-Likelihood:                 32.983
No. Observations:                  88   AIC:                            -51.97
Df Residuals:                      81   BIC:                            -34.62
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5147      0.083      6.232      0.0

# Grouping fake news together

In [70]:
scam_corpus = df_x[df_x['scam']]['textlower'].unique()
scam_tokenize = [tokenize(c) for c in scam_corpus]

In [71]:
print(len(scam_corpus))

247


In [72]:
X = vectorizer.fit_transform(scam_tokenize)

In [73]:
redirect_from_to = {}

In [74]:
for i, j in zip(np.where(cosine_similarity(X) > 0.8)[0], np.where(cosine_similarity(X) > 0.8)[1]):
    if i < j:
        if i in redirect_from_to:
            redirect_from_to[j] = redirect_from_to[i]
        else:
            redirect_from_to[j] = i

print(len(redirect_from_to))

105


In [75]:
print(redirect_from_to)

{59: 3, 8: 4, 16: 4, 20: 4, 24: 4, 31: 4, 68: 4, 43: 12, 90: 17, 36: 18, 41: 18, 44: 18, 94: 19, 30: 23, 46: 23, 99: 28, 56: 38, 154: 38, 52: 45, 54: 45, 57: 45, 81: 69, 226: 69, 79: 74, 114: 76, 116: 76, 117: 76, 118: 76, 121: 76, 122: 76, 123: 76, 124: 76, 127: 76, 128: 76, 129: 76, 131: 76, 132: 76, 133: 76, 134: 76, 135: 76, 235: 80, 111: 87, 112: 87, 115: 93, 110: 95, 98: 97, 104: 97, 150: 97, 215: 109, 120: 76, 125: 76, 126: 76, 137: 76, 143: 76, 146: 76, 159: 76, 160: 76, 204: 76, 130: 119, 210: 119, 142: 136, 141: 139, 196: 140, 152: 145, 156: 145, 157: 145, 158: 145, 162: 145, 163: 145, 164: 145, 165: 145, 167: 145, 169: 145, 170: 145, 172: 145, 173: 145, 175: 145, 176: 145, 177: 145, 178: 145, 182: 145, 183: 145, 185: 145, 187: 145, 188: 145, 189: 145, 190: 145, 236: 145, 245: 145, 180: 149, 192: 149, 179: 145, 201: 166, 184: 145, 195: 171, 205: 171, 197: 186, 198: 186, 211: 193, 200: 199, 228: 171, 220: 209, 216: 214, 229: 171, 242: 240}


In [76]:
print(df_p[df_p['textlower'] == scam_corpus[229]]['text'].iloc[0])
print()
print(df_p[df_p['textlower'] == scam_corpus[171]]['text'].iloc[0])

OFERTA DE PRÉSTAMO DE DINERO
 Somos una empresa que ofrece préstamos para la vivienda, préstamos de inversión, préstamos para automóviles, préstamos personales que van desde  4,000€ a  1,000,000€ con una tasa de interés del 3% sobre capital a corto y largo plazo. Si estás interesado contáctanos por whatsapp: +33752534155

Ofrecer el prestamo
  Somos una empresa que ofrece préstamos para vivienda, préstamos de inversión, préstamos para automóviles, préstamos personales que van desde 5,000 hasta 1,000,000 de dinares kuwaitíes con una tasa de interés del 3% sobre capital a corto y largo plazo.
  Con este préstamo, puede restaurar completamente su hogar, pagar sus impuestos y contribuir a sus necesidades personales y familiares.  Si está interesado, contáctenos a través de WhatsApp: +33752534155


In [77]:
groups_where_shared = {}
users_who_shared = {}

In [78]:
for i in range(len(scam_corpus)):
    groups_where_shared[i] = df_x.loc[df_x['textlower'] == scam_corpus[i], 'uid'].tolist()
    users_who_shared[i] = df_x.loc[df_x['textlower'] == scam_corpus[i], 'tel'].tolist()

In [79]:
for from_i in redirect_from_to.keys():
    to_i = redirect_from_to[from_i]
    groups_where_shared[to_i] = groups_where_shared[to_i] + groups_where_shared[from_i]
    users_who_shared[to_i] = users_who_shared[to_i] + users_who_shared[from_i]
    
    del groups_where_shared[from_i]
    del users_who_shared[from_i]

In [80]:
list_mergedScam = \
    [[scam_corpus[i], groups_where_shared[i], users_who_shared[i]] for i in groups_where_shared.keys()]
df_mergedScam = pd.DataFrame(list_mergedScam, columns = ['textlower', 'groups', 'users'])

In [81]:
df_mergedScam['n_shares'] = df_mergedScam['groups'].apply(len)
df_mergedScam['n_groups'] = df_mergedScam['groups'].apply(lambda x: len(set(x)))
df_mergedScam['n_users'] = df_mergedScam['users'].apply(lambda x: len(set(x)))
df_mergedScam['shares/groups'] = df_mergedScam['n_shares'] / df_mergedScam['n_groups']
df_mergedScam['shares/users'] = df_mergedScam['n_shares'] / df_mergedScam['n_users']

In [82]:
df_mergedScam['n_shares'].hist(bins = 40)
plt.xlabel("Number of Shares")
plt.ylabel("# of Scams")
plt.title("Unique Scams")
plt.savefig('images/ch-misinformation/hist_scam_unique_frequency.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [83]:
df_mergedScam['n_users'].hist(bins = 40)
plt.xlabel("# of Unique Users Sharing")
plt.ylabel("# of Scam Texts")
plt.title("Unique Scams")
plt.savefig('images/ch-misinformation/hist_scam_unique_user.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [84]:
df_mergedScam['n_groups'].hist(bins = 20)
plt.xlabel("# of Unique Groups Shared In")
plt.ylabel("# of Scam Texts")
plt.title("Unique Scams")
plt.savefig('images/ch-misinformation/hist_scam_unique_group.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [85]:
print(np.mean(df_mergedScam['shares/groups']))
print(np.mean(df_mergedScam['shares/users']))

2.5238821162267726
2.644857629988522


In [86]:
Counter(redirect_from_to.values())

Counter({3: 1,
         4: 6,
         12: 1,
         17: 1,
         18: 3,
         19: 1,
         23: 2,
         28: 1,
         38: 2,
         45: 3,
         69: 2,
         74: 1,
         76: 25,
         80: 1,
         87: 2,
         93: 1,
         95: 1,
         97: 3,
         109: 1,
         119: 2,
         136: 1,
         139: 1,
         140: 1,
         145: 28,
         149: 2,
         166: 1,
         171: 4,
         186: 2,
         193: 1,
         199: 1,
         209: 1,
         214: 1,
         240: 1})

In [87]:
print(df_p[df_p['textlower'] == scam_corpus[145]]['text'].iloc[0])
print()
print(df_p[df_p['textlower'] == scam_corpus[76]]['text'].iloc[0])

La OMS y el Gobierno han destinado un BONO de dinero para todos los paises por Motivo de CUARENTENA (CORONA VIRUS) 
Obtenga su BONO gratis en cualquier pais. 
Consiguelo ahora AQUI  
https://bit.ly/Bono-Comida-8

100 GB de datos de Internet sin ninguna recarga 
Obtenga 100 GB de datos de Internet gratis en cualquier red móvil durante 60 días. 
Consiguelo ahora 
https://internet4goffers.com/es
