In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
import hashlib

In [4]:
from scipy.stats import ttest_ind

In [5]:
df_x = pd.read_csv("Data/df_x_nb3b-virality.csv", index_col = 0)
print(df_x.shape)

(171634, 31)


In [6]:
df_x['text'] = df_x['text'].fillna('')
df_x['textlower'] = df_x['textlower'].fillna('')

In [7]:
df_x['hash'] = df_x['textlower'].apply(lambda x: hashlib.md5(x.encode('utf-8')).hexdigest())

### Text similarity imports/functions

In [8]:
import string, re, unidecode

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
stemmer = SnowballStemmer('spanish')
remove_punc = str.maketrans(string.punctuation, len(string.punctuation) * " ")
stopwords_ascii = [unidecode.unidecode(w) for w in stopwords.words('spanish')]

def tokenize(s):
    s = unidecode.unidecode(s)
    s = s.translate(remove_punc)
    tokens = nltk.word_tokenize(s)
    filtered = [w for w in tokens if w not in stopwords_ascii]
    return [stemmer.stem(w) for w in filtered]

In [10]:
def dummy(x):
    return x

vectorizer = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy,
                            token_pattern=None)

# Flagging fake news

In [11]:
fake_news_ext = pd.read_excel("Data/factcheck/FactCheck.xlsx", header=None)
print(len(fake_news_ext))

56


In [12]:
with open('Data/factcheck/nb4_fakenews.txt') as file:
    fake_news_hash = [line.split('\n')[0] for line in file.readlines()]
print(len(fake_news_hash))

64


In [13]:
df_x['fake_news'] = False
df_x.loc[df_x['hash'].isin(fake_news_hash), 'fake_news'] = True
print(df_x[df_x['fake_news']].shape)

(291, 33)


In [14]:
df_t = df_x[df_x['textlower'].str.len() > 0]
df_t = df_t.drop(df_t[df_t['fake_news']].index, axis = 0)
print(df_t.shape)
df_t['token'] = df_t['textlower'].apply(tokenize)
df_t = df_t[df_t['token'].apply(lambda x: len (x) >= 5)]
print(df_t.shape)

(101123, 33)
(43734, 34)


In [15]:
fake_news_wa_texts = df_x[df_x['fake_news']].groupby('hash')['textlower'].first()
n_fake_news = len(fake_news_wa_texts) + len(fake_news_ext)
print(n_fake_news)

120


In [16]:
all_corpus = df_t['token'].append([fake_news_ext[3].apply(tokenize), fake_news_wa_texts.apply(tokenize)])

In [17]:
X = vectorizer.fit_transform(all_corpus)
cs = cosine_similarity(X[:-n_fake_news], X[-n_fake_news:])
df_t['cs_max'] = cs.max(axis = 1)

In [18]:
df_t['flag'] = (df_t['cs_max'] > 0.3) & \
                   (~df_t['textlower'].str.contains('https://t.me/noticentrotelecentro'))

print(df_t['flag'].sum())

497


In [19]:
for i, row in df_t[df_t['flag']].iterrows():
    print(row['textlower'])
    print(str(i) + '\n----------------------------------')

muy buenos días tasa actualizada envios mayor a: 10.000 pesos al 0,0505 50.000 pesos al 0,050 100.000 pesos al 0,0495 250.000 pesos al 0,049
20
----------------------------------
hablando con una otorrinolaringóloga y comentando del coronavirus,ella dijo que definitivamente el virus va a llegar a todo el mundo y que viene lo peor a nivel mundial debido a que no hay vacuna y el virus resiste a los antivirales actuales. su recomendación es que empecemos niños y adultos a tomar 1 tableta diaria de vitamina c y omega (de gnc) y comer frutas y verduras para fortalecer el sistema inmunológico. la cantidad de casos y muertes que se reportan oficialmente no son las reales. otra recomendación importante es lavarse las manos muy seguido, tomar agua, evitar el saludo de mano o de beso desde ya, incluso no dar la paz con la mano en misa... y conforme  esto avance  "evitar lugares publicos y cubrirse la boca" por último, comentó que diariamente están  llegando personas de china a todos los paises y

In [20]:
true_positives_fakenews = [181, 7691, 9570, 20057, 23731, 23826, 23954, 24128, 24131, 24652,
25367, 26074, 26515, 26654, 26813, 26814, 27463, 27491, 31292,
31681, 38414, 39053, 39371, 42948, 43498, 44051, 44775, 44798,
44852, 44941, 45447, 46349, 46860, 47460, 47585, 47816, 48477,
49334, 49834, 50966, 51840, 52690, 53311, 55357, 55444, 56740,
59481, 61150, 63813, 69616, 70342, 71244, 74535, 75478, 75703,
76452, 77796, 78990, 79783, 81201, 81202, 81214, 82091, 83408,
83873, 84175, 84272, 84323, 84869, 84971, 85790, 87019, 87066,
87188, 87189, 88413, 88579, 89210, 89667, 90000, 90224, 90604,
90688, 91463, 92701, 92867, 93054, 93142, 93151,
93444, 94015, 95318, 95586, 95698, 96209, 96402, 96466, 96930,
96961, 96976, 97191, 97599, 98006, 98593, 98853, 98872, 99068,
99174, 99245, 99442, 99484, 100091, 101249, 101785, 101787,
102057, 102173, 102402, 102543, 102784, 102996, 103507, 103508,
105369, 105436, 105447, 106342, 107910, 108272, 108340, 108910,
110265, 114133, 114574, 114997, 116877, 117416, 117794, 117809,
117942, 118278, 118297, 120331, 121396, 122127, 126332, 127096,
127541, 134313, 134314, 134388, 135175, 135630, 136325, 136688,
137320, 138649, 138970, 141883, 144764, 147055, 147062, 148212,
150452, 150950, 151761, 153617, 154030, 155172, 156973, 157083,
157084, 157648, 159200, 159924, 161290, 161777, 162302, 164627,
167221, 171549]

In [21]:
false_positives_fakenews = [i for i in df_t[df_t['flag']].index.values if i not in true_positives_fakenews]

In [22]:
true_positives_cs = df_t.loc[true_positives_fakenews, 'cs_max']
false_positives_cs = df_t.loc[false_positives_fakenews, 'cs_max']

In [23]:
print(true_positives_cs.shape)
print(false_positives_cs.shape)

(181,)
(316,)


In [24]:
plt.figure(figsize = (9, 5))
true_positives_cs.hist(range = [0, 1], bins = 35, alpha = 0.5, label = 'True Positives')
false_positives_cs.hist(range = [0, 1], bins = 35, alpha = 0.5, label = 'False Positives')
plt.title("Detecting Fake News")
plt.xlabel("Max. Cosine Similarity to Known Fake News")
plt.ylabel("# of Messages")
plt.legend()
plt.savefig('images/ch-misinformation/detecting_fakeNews_cs.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [25]:
print(np.mean(true_positives_cs))
print(np.mean(false_positives_cs))
print(ttest_ind(true_positives_cs, false_positives_cs))

0.77497284596757
0.39961985724027876
Ttest_indResult(statistic=20.49533721050219, pvalue=4.7710836991062e-68)


In [26]:
df_x.loc[true_positives_fakenews, 'fake_news'] = True

# Flagging scams

In [27]:
with open('Data/factcheck/nb4_scams.txt') as file:
    scams_hash = [line.split('\n')[0] for line in file.readlines()]
print(len(scams_hash))

84


In [28]:
df_x['scam'] = False
df_x.loc[df_x['hash'].isin(scams_hash), 'scam'] = True
print(df_x[df_x['scam']].shape)

(663, 34)


In [29]:
df_t = df_x[df_x['textlower'].str.len() > 0]
df_t = df_t.drop(df_t[df_t['scam']].index, axis = 0)
print(df_t.shape)
df_t['token'] = df_t['textlower'].apply(tokenize)
df_t = df_t[df_t['token'].apply(lambda x: len (x) >= 5)]
print(df_t.shape)

(100751, 34)
(43362, 35)


In [30]:
scams_texts = df_x[df_x['scam']].groupby('hash')['textlower'].first()
n_scams = len(scams_hash)

In [31]:
all_corpus = df_t['token'].append(scams_texts.apply(tokenize))

In [32]:
X = vectorizer.fit_transform(all_corpus)
cs = cosine_similarity(X[:-n_scams], X[-n_scams:])
df_t['cs_max'] = cs.max(axis = 1)

In [33]:
df_t['flag'] = (df_t['cs_max'] > 0.3) & \
                   (~df_t['text'].str.contains('https://t.me/noticentrotelecentro'))

print(df_t['flag'].sum())

335


In [34]:
for i, row in df_t[df_t['flag']].iterrows():
    print(row['textlower'])
    print(str(i) + '\n----------------------------------')

 cuentas netflix premium ultra hd.... 4 perfiles... por 1 mes, 2 meses, 3 meses, 6 meses y 1 año  escribeme a mi whatsapp 3145000164 animate excelente precios dísfruta de las mejores películas y series de netflix con tu familia y amigos disponible 1, 2 , 3 pantallas de netflix por 1 mes.... a buen precio también te ofrecemos los servicios de:   cuentas de spotify por 3 meses  amazón primevideo por 1 mes
964
----------------------------------
 cobertura vam  migración venezolana + trámites migratorios únase a venezuela al minuto: http://t.me/noticiasvam  brasil es el país con mayor número de refugiados venezolanos reconocidos en latinoamérica brasil les dio ese estatus a más de 37.000 personas que han huido de venezuela por la grave crisis política, económica y social de ese país. http://bit.ly/2toyhgo  cúcuta, la ciudad dónde más han asesinado con armas de fuego a venezolanos la capital nortesantandereana superó a bogotá, donde ocurrieron 37 muertes a bala, seguida de otras ciudades co

In [35]:
false_positives_scams = [964, 3157, 5688, 8450, 8840, 9930, 10111, 10943,
                   13874, 14706, 18209, 18487, 22308, 27194, 
                   27570, 28134, 28928, 30590, 33623, 34719, 39860,
                   
                   41318, 46218, 48130, 49074, 52569, 53112, 56837,
                   56849, 56853, 57077, 57217, 59324, 59326, 63754,

                   65254, 66884, 66996, 67148, 68081, 68428, 68491,
                   68986, 70178, 72129, 72523, 75373, 75593, 78681,
                   78760, 78761, 79566, 81618, 85399, 85828, 86597,
                   91045, 92426, 93350, 93720, 97746, 99597, 99886,
                   100532, 100711, 101258, 103024, 103639, 103963,
                   108925, 109095, 112425, 112602, 113036, 113156,
                   113408, 114326, 114433, 114445, 115772, 115977,
                   116671, 119150, 119804, 122817, 123355, 123421,
                   123677, 128245, 129177, 129584, 130359, 130976,
                   131423, 133870, 134418, 135704, 135802, 136215,
                   139630, 139644, 144581, 144708, 146095, 146609,
                   147125, 147864, 150177, 154666, 157354, 160617,
                   169712]

In [36]:
true_positives_scams = [i for i in df_t[df_t['flag']].index.values if i not in false_positives_scams]

In [37]:
true_positives_cs = df_t.loc[true_positives_scams, 'cs_max']
false_positives_cs = df_t.loc[false_positives_scams, 'cs_max']

In [38]:
print(true_positives_cs.shape)
print(false_positives_cs.shape)

(223,)
(112,)


In [39]:
plt.figure(figsize = (9, 5))
true_positives_cs.hist(range = [0, 1], bins = 35, alpha = 0.5, label = 'True Positives')
false_positives_cs.hist(range = [0, 1], bins = 35, alpha = 0.5, label = 'False Positives')
plt.title("Detecting Scams")
plt.xlabel("Max. Cosine Similarity to Known Scams")
plt.ylabel("# of Messages")
plt.legend()
plt.savefig('images/ch-misinformation/detecting_scam_cs.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [40]:
print(np.mean(true_positives_cs))
print(np.mean(false_positives_cs))
print(ttest_ind(true_positives_cs, false_positives_cs))

0.5628925960793657
0.3544226293943395
Ttest_indResult(statistic=9.656392417655127, pvalue=1.3014595108800633e-19)


In [41]:
df_x.loc[true_positives_scams, 'scam'] = True

In [42]:
df_x.to_csv('Data/df_x_nb4a-mis.csv')