In [1]:
import pandas as pd 
import numpy as np
from collections import defaultdict

In [2]:
# Input files; relative paths are resolved against the notebook's working directory.
# Share records (loaded below into df_news_user).
NEWS_USER_PATH = "../DatosKaggle/BuzzFeed/BuzzFeedNewsUser.csv"
# Sampled follow edges (loaded below into df_user_user, without a header row).
USER_USER_PATH = "../DatosKaggle/BuzzFeed/sample/BuzzFeedUserUser_sample_20210815.csv"
# News catalogue (loaded below into df_news).
NEWS_PATH = "news_id_title_type.csv"

## Carga de Datos

Cargamos el dataset de noticias

In [3]:
# Load the news catalogue.
df_news = pd.read_csv(filepath_or_buffer=NEWS_PATH)
# Shift the index labels from 0..n-1 to 1..n. Presumably this makes the labels
# match the 1-based `id` column seen in the preview; the label-based lookup of
# news ids further down relies on it -- TODO confirm ids are contiguous.
df_news.index += 1
print(f"El dataset de noticias originalmente cuenta con {len(df_news)} observaciones")
display(df_news.head())

El dataset de noticias originalmente cuenta con 182 observaciones


Unnamed: 0,id,title,news_type
1,1,Another Terrorist Attack in NYC…Why Are we STI...,Real
2,2,Hillary Clinton on police shootings: 'too many...,Real
3,3,"Critical counties: Wake County, NC, could put ...",Real
4,4,NFL Superstar Unleashes 4 Word Bombshell on Re...,Real
5,5,Obama in NYC: 'We all have a role to play' in ...,Real


Cargamos el dataset de noticias asociadas a un usuario

In [4]:
# Load the share records (columns news_id, user, share_freq -- see the preview).
df_news_user = pd.read_csv(filepath_or_buffer=NEWS_USER_PATH)
print(f"El dataset de usuarios que comparten noticias originalmente cuenta con {len(df_news_user)} observaciones")
df_news_user.head(2)

El dataset de usuarios que comparten noticias originalmente cuenta con 22779 observaciones


Unnamed: 0,news_id,user,share_freq
0,45,1,1
1,127,2,1


Cargamos el dataset de usuarios que siguen a usuarios

In [5]:
# Load the follow edges. Column names are supplied here, so the first line of
# the file is read as data rather than as a header.
df_user_user = pd.read_csv(
    USER_USER_PATH,
    names=["id_source", "id_target"],
)
print(f"El dataset de usuarios que siguen a usuarios originalmente cuenta con {len(df_user_user)} observaciones")
df_user_user.head(2)

El dataset de usuarios que siguen a usuarios originalmente cuenta con 24306 observaciones


Unnamed: 0,id_source,id_target
0,31,10
1,10,240


## Procesamiento de Datos

Creamos un diccionario llamado ```user_to_following``` que asocia cada usuario con la lista de usuarios a los que sigue

In [6]:
# Adjacency list of the follow graph: id_source -> [id_target, ...], in file order.
# Users that never appear as id_source get no key; read it with
# user_to_following.get(user, []) so that lookups do not insert empty entries.
user_to_following = defaultdict(list)

# Walk the two id columns in lockstep instead of DataFrame.iterrows(), which
# builds a Series for every row (slow on tens of thousands of edges) and can
# upcast values to a common dtype.
for id_source, id_target in zip(df_user_user["id_source"], df_user_user["id_target"]):
    user_to_following[id_source].append(id_target)

- Creamos una lista llamada ```following``` que contiene, para cada fila de `df_news_user`, los ids de los usuarios a los que sigue el usuario de esa fila


- Creamos un diccionario llamado ```user_id_to_news_type``` que asocia cada user_id con los pares (tipo de noticia, frecuencia) de las noticias con las que interactuó


- Creamos una lista llamada ```news_type``` con el tipo de la noticia de cada fila de `df_news_user` y la agregamos al dataframe como columna

In [7]:
# Column-wise pass over df_news_user. This replaces a DataFrame.iterrows()
# loop, which creates one Series per row and performs one .loc lookup per row.

# Type of the news item shared in each row, looked up by label in df_news'
# index (a news_id that is not an index label is an error, as it was with the
# per-row lookup).
news_type = df_news["news_type"].loc[df_news_user["news_id"].tolist()].tolist()

# user id -> [(news_type, share_freq), ...], one tuple per share record.
user_id_to_news_type = defaultdict(list)
for user_id, shared_type, share_freq in zip(
    df_news_user["user"], news_type, df_news_user["share_freq"]
):
    user_id_to_news_type[user_id].append((shared_type, share_freq))

# Accounts followed by the sharer of each row; [] when the user has no
# outgoing edge in user_to_following. .get() avoids inserting keys into the
# defaultdict.
following = [user_to_following.get(user_id, []) for user_id in df_news_user["user"]]

df_news_user["following"] = following
df_news_user['news_type'] = news_type

display(df_news_user.head(n=11))

Unnamed: 0,news_id,user,share_freq,following,news_type
0,45,1,1,[],Real
1,127,2,1,[],Fake
2,115,3,1,[],Fake
3,180,3,1,[],Fake
4,140,4,1,[],Fake
5,5,5,1,[],Real
6,45,6,1,[],Real
7,45,7,1,[],Real
8,45,8,1,[],Real
9,150,9,1,[],Fake


In [8]:
def count_fake_news(following: list, news_by_user=None):
    """Sum the share frequencies of 'Fake' news over a group of users.

    Args:
        following: user ids whose shares are counted.
        news_by_user: optional mapping user id -> iterable of
            (news_type, share_freq) pairs. Defaults to the notebook-level
            ``user_id_to_news_type``.

    Returns:
        The total share frequency of entries whose type is 'Fake'; 0 when
        ``following`` is empty.
    """
    if news_by_user is None:
        news_by_user = user_id_to_news_type
    # A user with no share records contributes nothing. Without the empty
    # default, .get() returns None and iterating it raises TypeError.
    return sum(
        share_freq
        for user in following
        for kind, share_freq in news_by_user.get(user, ())
        if kind == 'Fake'
    )
        
def count_real_news(following: list, news_by_user=None):
    """Sum the share frequencies of 'Real' news over a group of users.

    Args:
        following: user ids whose shares are counted.
        news_by_user: optional mapping user id -> iterable of
            (news_type, share_freq) pairs. Defaults to the notebook-level
            ``user_id_to_news_type``.

    Returns:
        The total share frequency of entries whose type is 'Real'; 0 when
        ``following`` is empty.
    """
    if news_by_user is None:
        news_by_user = user_id_to_news_type
    # A user with no share records contributes nothing. Without the empty
    # default, .get() returns None and iterating it raises TypeError.
    return sum(
        share_freq
        for user in following
        for kind, share_freq in news_by_user.get(user, ())
        if kind == 'Real'
    )

def count_total_news(following: list, news_by_user=None):
    """Count the share records of a group of users, ignoring frequency.

    Args:
        following: user ids whose share records are counted.
        news_by_user: optional mapping user id -> list of
            (news_type, share_freq) pairs. Defaults to the notebook-level
            ``user_id_to_news_type``.

    Returns:
        The number of (news_type, share_freq) entries across all given users;
        each entry counts once regardless of its share_freq.
    """
    if news_by_user is None:
        news_by_user = user_id_to_news_type
    # A user with no share records contributes nothing. Without the empty
    # default, .get() returns None and iterating it raises TypeError.
    return sum(len(news_by_user.get(user, ())) for user in following)
    
def count_total_freq_news(following: list, news_by_user=None):
    """Sum the share frequencies of all news, of any type, over a group of users.

    Args:
        following: user ids whose shares are counted.
        news_by_user: optional mapping user id -> iterable of
            (news_type, share_freq) pairs. Defaults to the notebook-level
            ``user_id_to_news_type``.

    Returns:
        The total share frequency across all given users; 0 when
        ``following`` is empty.
    """
    if news_by_user is None:
        news_by_user = user_id_to_news_type
    # A user with no share records contributes nothing. Without the empty
    # default, .get() returns None and iterating it raises TypeError.
    return sum(
        share_freq
        for user in following
        for _kind, share_freq in news_by_user.get(user, ())
    )
    
# Neighbourhood-level features on df_news_user: fake / real / total counts for
# each sharer and for the accounts that sharer follows. Currently disabled.
#df_news_user['fake_news_shared_by_following'] = df_news_user["following"].apply(lambda x : count_fake_news(x))
#df_news_user['fake_news_shared_by_user'] = df_news_user["user"].apply(lambda x : count_fake_news([x]))

#df_news_user['real_news_shared_by_following'] = df_news_user["following"].apply(lambda x : count_real_news(x))
#df_news_user['real_news_shared_by_user'] = df_news_user["user"].apply(lambda x : count_real_news([x]))

#df_news_user['total_news_shared_by_following'] = df_news_user["following"].apply(lambda x : count_total_news(x))
#df_news_user['total_news_shared_by_user'] = df_news_user["user"].apply(lambda x : count_total_news([x]))

#df_news_user['total_freq_news_shared_by_following'] = df_news_user["following"].apply(lambda x : count_total_freq_news(x))
#df_news_user['total_freq_news_shared_by_user'] = df_news_user["user"].apply(lambda x : count_total_freq_news([x]))

#df_news_user["count_following"] = df_news_user["following"].apply(lambda x : len(x))
#df_news_user.head()

# Edge-level features: fake and real share counts for both endpoints (source
# and target user) of every follow edge. Each row is (new column, id column
# it is derived from, counting function); columns are added in this order.
edge_features = (
    ('fake_news_shared_by_source', "id_source", count_fake_news),
    ('fake_news_shared_by_target', "id_target", count_fake_news),
    ('real_news_shared_by_source', "id_source", count_real_news),
    ('real_news_shared_by_target', "id_target", count_real_news),
)
for feature_name, id_column, count_shares in edge_features:
    # The counters take a list of user ids, so wrap the single id.
    df_user_user[feature_name] = df_user_user[id_column].apply(
        lambda user_id: count_shares([user_id])
    )
df_user_user.head()

Unnamed: 0,id_source,id_target,fake_news_shared_by_source,fake_news_shared_by_target,real_news_shared_by_source,real_news_shared_by_target
0,31,10,0,3,1,0
1,10,240,3,0,0,1
2,437,10,1,3,0,0
3,10,740,3,1,0,0
4,789,10,1,3,0,0


In [9]:
# Report the number of follow edges after the feature columns were added.
print(f"El dataset de usuarios que siguen a usuarios finalmente cuenta con {len(df_user_user)} observaciones")

El dataset de usuarios que siguen a usuarios finalmente cuenta con 24306 observaciones


## Análisis Estadístico

In [10]:
# How many edges have a source user with 0, 1, 2, ... fake-news shares.
df_user_user["fake_news_shared_by_source"].value_counts()

0     11714
1      9206
2      1998
3       689
4       372
5       118
10       57
7        49
13       47
6        40
8         9
11        6
21        1
Name: fake_news_shared_by_source, dtype: int64

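Borramos observaciones duplicadas (paso actualmente deshabilitado: el código de la celda siguiente está comentado, por lo que no se elimina ninguna fila)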

In [11]:
# Disabled step: would keep one row per id_source and sort by it. While this
# stays commented out, df_user_user is saved with every edge.
#df_user_user = df_user_user.drop_duplicates(subset=['id_source']).sort_values(by=['id_source'])
#df_user_user.head()
#len(df_user_user.id_source)

## Guardado de Dataset

In [12]:
# Save the edges together with their feature columns; the index is not written.
df_user_user.to_csv(
    "../DatosKaggle/BuzzFeed/sample/UsersFollowing_20210819.csv",
    index=False,
    encoding='utf-8-sig',
)

In [13]:
# Save the bare edge list as tab-separated "id_source<TAB>id_target" lines,
# with no header row and no index column.
df_user_user[["id_source", "id_target"]].to_csv(
    "../DatosKaggle/BuzzFeed/sample/UsersFollowing_20210819.txt",
    sep='\t',
    header=None,
    index=False,
)