In [1]:
import pandas as pd
import numpy as np
import uuid

In [2]:
# Input files of the BuzzFeed dataset, given as paths relative to the working directory.
# REAL_PATH / FAKE_PATH are the article-content CSVs read with pd.read_csv below.
REAL_PATH = "../DatosKaggle/BuzzFeed/BuzzFeed_real_news_content.csv"
FAKE_PATH = "../DatosKaggle/BuzzFeed/BuzzFeed_fake_news_content.csv"
# NEWS_PATH is a plain-text file read with pd.read_table into a single "news" column.
NEWS_PATH = "../DatosKaggle/BuzzFeed/BuzzFeedNews.txt"

El objetivo de este notebook es tomar el dataset de la red bipartita (noticias-usuarios) y convertir el valor de la columna news_id en un string.
Más puntualmente, se mapea el id de cada noticia a un hash único.

## Carga del dataset de Fake News

In [3]:
# Read the fake and real article tables from their CSV files (same order as before:
# fake first, then real).
df_fake, df_real = (pd.read_csv(csv_path) for csv_path in (FAKE_PATH, REAL_PATH))

In [4]:
# Show the first fake-news row to inspect the raw column layout.
df_fake.head(1)

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Fake_1-Webpage,Proof The Mainstream Media Is Manipulating The...,I woke up this morning to find a variation of ...,http://www.addictinginfo.org/2016/09/19/proof-...,http://addictinginfo.addictinginfoent.netdna-c...,Wendy Gittleson,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,http://addictin...",http://addictinginfo.com/2016/09/19/proof-the-...,"{""publisher"": ""Addicting Info | The Knowledge ..."


In [5]:
# Show the first real-news row to inspect the raw column layout.
df_real.head(1)

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Real_1-Webpage,Another Terrorist Attack in NYC…Why Are we STI...,"On Saturday, September 17 at 8:30 pm EST, an e...",http://eaglerising.com/36942/another-terrorist...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Leonora Cravotta",http://eaglerising.com,{'$date': 1474528230000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36942/another-terrorist...,"{""description"": ""\u201cWe believe at this poin..."


In [6]:
# The "id" column of the Fake and Real dataframes holds values such as "Fake_1-Webpage".
# We keep only the numeric part (1 in that example) and discard every other character.

def fix_id(news_id: str) -> int:
    """Extract the numeric identifier from a raw news id.

    The number is read from between the first "_" and the "-" that follows it,
    so the result does not depend on the label or the suffix having a fixed
    number of characters.

    Args:
        news_id: Raw identifier such as "Fake_1-Webpage".

    Returns:
        The integer found between the two separators (1 for "Fake_1-Webpage").

    Raises:
        ValueError: If a separator is missing or the text between the
            separators is not an integer.
    """
    _label, underscore, remainder = news_id.partition("_")
    number, dash, _suffix = remainder.partition("-")
    # partition() returns an empty separator when it was not found.
    if not underscore or not dash:
        raise ValueError(f"Unexpected news id format: {news_id!r}")
    return int(number)

# Convert the raw string ids of both frames to integers; fix_id already returns an int,
# so it is passed to map directly.
df_fake["id"] = df_fake["id"].map(fix_id)
df_real["id"] = df_real["id"].map(fix_id)

In [7]:
# Sort both frames by id (the original note says they do not come sorted) and
# rebuild the index so it runs 0..n-1 in the new order.
df_fake = (
    df_fake
    .sort_values("id")
    .reset_index(drop=True)
)
df_real = (
    df_real
    .sort_values("id")
    .reset_index(drop=True)
)

Verificamos los resultados

In [8]:
# First fake-news row after the id clean-up and sort.
df_fake.head(1)

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,1,Proof The Mainstream Media Is Manipulating The...,I woke up this morning to find a variation of ...,http://www.addictinginfo.org/2016/09/19/proof-...,http://addictinginfo.addictinginfoent.netdna-c...,Wendy Gittleson,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,http://addictin...",http://addictinginfo.com/2016/09/19/proof-the-...,"{""publisher"": ""Addicting Info | The Knowledge ..."


In [9]:
# First real-news row after the id clean-up and sort.
df_real.head(1)

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,1,Another Terrorist Attack in NYC…Why Are we STI...,"On Saturday, September 17 at 8:30 pm EST, an e...",http://eaglerising.com/36942/another-terrorist...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Leonora Cravotta",http://eaglerising.com,{'$date': 1474528230000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36942/another-terrorist...,"{""description"": ""\u201cWe believe at this poin..."


## Carga del dataset de Noticias

In [10]:
# Read the headerless, tab-separated news list into a single column named "news".
# NOTE(review): df_news is not referenced again in the cells visible here.
df_news = pd.read_csv(NEWS_PATH, sep="\t", header=None, names=["news"])

### Creación de una nueva columna indicando el tipo de noticia

In [11]:
# Tag every row with the kind of news its source file contains.
df_real = df_real.assign(news_type="Real")
df_fake = df_fake.assign(news_type="Fake")

In [12]:
# Confirm the new news_type column on the real-news frame.
df_real.head(2)

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data,news_type
0,1,Another Terrorist Attack in NYC…Why Are we STI...,"On Saturday, September 17 at 8:30 pm EST, an e...",http://eaglerising.com/36942/another-terrorist...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Leonora Cravotta",http://eaglerising.com,{'$date': 1474528230000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36942/another-terrorist...,"{""description"": ""\u201cWe believe at this poin...",Real
1,2,Hillary Clinton on police shootings: 'too many...,"Story highlights ""This, though, is certain: to...",http://cnn.it/2cRxlv5,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,"Mj Lee,Cnn National Politics Reporter",http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,http://www.cnn.com/2016/09/21/politics/hillary...,"{""description"": ""Hillary Clinton kicked off a ...",Real


In [13]:
# Confirm the new news_type column on the fake-news frame.
df_fake.head(2)

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data,news_type
0,1,Proof The Mainstream Media Is Manipulating The...,I woke up this morning to find a variation of ...,http://www.addictinginfo.org/2016/09/19/proof-...,http://addictinginfo.addictinginfoent.netdna-c...,Wendy Gittleson,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,http://addictin...",http://addictinginfo.com/2016/09/19/proof-the-...,"{""publisher"": ""Addicting Info | The Knowledge ...",Fake
1,2,Declassified Docs Show That Obama Admin Create...,Thanks in part to the declassification of Defe...,http://usherald.com/breaking-declassified-docs...,http://usherald.com/wp-content/uploads/2015/05...,Bob Amoroso,http://usherald.com,{'$date': 1432650030000},,http://usherald.com/wp-content/uploads/2015/02...,http://usherald.com/breaking-declassified-docs...,"{""generator"": ""WordPress 4.8.1"", ""og"": {""site_...",Fake


In [14]:
# Shift the fake rows past the real ones so that ids and index labels stay unique once
# both frames are concatenated. The shift is the number of real articles (91 in the
# recorded run) instead of a hard-coded literal.
# Assumes the real ids run 1..len(df_real) without gaps - TODO confirm.
# NOTE: this cell is cumulative; run it only once per kernel session.
real_news_count = len(df_real)
df_fake.index = df_fake.index + real_news_count
df_fake["id"] = df_fake["id"] + real_news_count
df_fake.head()

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data,news_type
91,92,Proof The Mainstream Media Is Manipulating The...,I woke up this morning to find a variation of ...,http://www.addictinginfo.org/2016/09/19/proof-...,http://addictinginfo.addictinginfoent.netdna-c...,Wendy Gittleson,http://www.addictinginfo.org,{'$date': 1474243200000},,"http://i.imgur.com/JeqZLhj.png,http://addictin...",http://addictinginfo.com/2016/09/19/proof-the-...,"{""publisher"": ""Addicting Info | The Knowledge ...",Fake
92,93,Declassified Docs Show That Obama Admin Create...,Thanks in part to the declassification of Defe...,http://usherald.com/breaking-declassified-docs...,http://usherald.com/wp-content/uploads/2015/05...,Bob Amoroso,http://usherald.com,{'$date': 1432650030000},,http://usherald.com/wp-content/uploads/2015/02...,http://usherald.com/breaking-declassified-docs...,"{""generator"": ""WordPress 4.8.1"", ""og"": {""site_...",Fake
93,94,Why is it “RACIST” to Question Someone’s Birth...,The Democrats are using an intimidation tactic...,http://eaglerising.com/36841/why-is-it-racist-...,http://eaglerising.com/wp-content/uploads/2016...,View All Posts,http://eaglerising.com,{'$date': 1474243356000},,http://2lv0hm3wvpix464wwy2zh7d1.wpengine.netdn...,http://eaglerising.com/36841/why-is-it-racist-...,"{""description"": ""There is evidence the birth c...",Fake
94,95,HILLARY ON DISABLED CHILDREN During Easter Egg...,Dolly Kyle has written a scathing “tell all” b...,http://100percentfedup.com/hillary-on-disabled...,http://100percentfedup.com/wp-content/uploads/...,Fed Up,http://100percentfedup.com,{'$date': 1466439263000},,https://www.facebook.com/tr?id=157908891568640...,http://100percentfedup.com/hillary-on-disabled...,"{""googlebot"": ""noimageindex"", ""generator"": ""Po...",Fake
95,96,'Reporters' FLEE When Clintons Get EXPOSED!,The Haitians in the audience have some newswor...,http://clashdaily.com/2016/09/watch-trump-visi...,http://clashdaily.com/wp-content/uploads/2016/...,"Rich Witmer,Doug Giles",http://clashdaily.com,{'$date': 1474208802000},https://www.youtube.com/embed/x5IS6Ya005E?feat...,http://clashdaily.wpengine.netdna-cdn.com/wp-c...,http://clashdaily.com/2016/09/watch-trump-visi...,"{""googlebot"": ""noimageindex"", ""og"": {""site_nam...",Fake


In [15]:
# Stack the real rows first and the fake rows after them.
# The full frame is intentionally not written to 'news_id_title_type.csv' here: a later
# cell writes the id/title/news_type subset to that same file, so a full dump at this
# point would only be overwritten (and would not match what the file name announces).
df_all_news = pd.concat([df_real, df_fake])

In [16]:
# First rows of the combined real + fake frame.
df_all_news.head(2)

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data,news_type
0,1,Another Terrorist Attack in NYC…Why Are we STI...,"On Saturday, September 17 at 8:30 pm EST, an e...",http://eaglerising.com/36942/another-terrorist...,http://eaglerising.com/wp-content/uploads/2016...,"View All Posts,Leonora Cravotta",http://eaglerising.com,{'$date': 1474528230000},,http://constitution.com/wp-content/uploads/201...,http://eaglerising.com/36942/another-terrorist...,"{""description"": ""\u201cWe believe at this poin...",Real
1,2,Hillary Clinton on police shootings: 'too many...,"Story highlights ""This, though, is certain: to...",http://cnn.it/2cRxlv5,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,"Mj Lee,Cnn National Politics Reporter",http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,http://www.cnn.com/2016/09/21/politics/hillary...,"{""description"": ""Hillary Clinton kicked off a ...",Real


Guardamos el dataset solo con las columnas id, título y tipo de noticia

In [17]:
# Export only the identifying columns of every article.
summary_columns = ["id", "title", "news_type"]
news_summary = df_all_news[summary_columns]
news_summary.to_csv('news_id_title_type.csv', index=False, header=True, encoding='utf-8')

## Carga de Dataset Noticias Usuarios

In [18]:
# Load the sampled news/user edge list.
bipartite_sample_path = "../DatosKaggle/BuzzFeed/sample/BuzzFeedNewsUser_sample_20210815.csv"
bipartite = pd.read_csv(bipartite_sample_path)
# Replace whatever header the file carries with the column names used from here on.
bipartite.columns = ["news_id", "user_id"]
bipartite.head()

Unnamed: 0,news_id,user_id
0,140,10
1,150,11
2,21,41
3,42,51
4,51,114


In [19]:
# Compare the number of distinct news ids with the total number of rows.
news_ids = bipartite['news_id']
unique_news_count = len(set(news_ids))
total_news_count = len(news_ids)
print(f"Cantidad de id de noticias unicos: {unique_news_count}")
print(f"Cantidad de id de noticias en total: {total_news_count}")

Cantidad de id de noticias unicos: 37
Cantidad de id de noticias en total: 1314


In [20]:
# Make the index of the combined frame equal to each row's news id, so that
# `.loc[news_id]` looks rows up by id.
# The labels are taken from the "id" column itself instead of adding 1 to the current
# index: the result no longer depends on row position, and re-running this cell gives
# the same index instead of shifting it again.
df_all_news.index = df_all_news["id"].to_numpy()

In [21]:
# Match every news_id of the edge list with its title and news type.
# A single label-based lookup replaces the row-by-row loop; it relies on df_all_news
# being indexed by news id (set in the previous cell). `.loc` with a list of labels
# returns one row per requested label, in the requested order, repeats included.

title_id = bipartite["news_id"].tolist()
matched_news = df_all_news.loc[title_id, ["title", "news_type"]]
title_name = matched_news["title"].tolist()
fake_or_real = matched_news["news_type"].tolist()

In [22]:
# Add the matched title and news type as new columns of the edge list.
bipartite = bipartite.assign(title=title_name, news_type=fake_or_real)

Verificamos la distribución de fake y real news

In [23]:
# Number of edges that point to real vs. fake news.
bipartite.loc[:, "news_type"].value_counts()

Real    696
Fake    618
Name: news_type, dtype: int64

In [25]:
# Map every news_id to a unique hash. Per the original note, R cannot load a network and
# tell two kinds of nodes apart when both use the same data type, so a numeric id cannot
# be used for the news nodes and a string hash is used instead.
# NOTE: the tokens come from uuid4, so they change on every run of this cell.

title_id = set(title_id)


def _assign_unique_hashes(ids):
    """Return a dict giving each id its own 6-character uppercase hex token.

    A uuid4 hex string cut to 6 characters is not guaranteed to be unique, so a
    token that was already handed out is discarded and drawn again.
    """
    assigned = {}
    taken = set()
    for news_id in ids:
        token = uuid.uuid4().hex[:6].upper()
        while token in taken:
            token = uuid.uuid4().hex[:6].upper()
        taken.add(token)
        assigned[news_id] = token
    return assigned


news_id_to_hash = _assign_unique_hashes(title_id)

In [26]:
def get_title_hash(id: int):
    """Look up the hash assigned to a news id.

    Args:
        id: News id used as a key of ``news_id_to_hash``.

    Returns:
        The hash stored for ``id``, or ``None`` when the id has no entry.
    """
    title_hash = news_id_to_hash.get(id)
    return title_hash

# Store the hash of every row's news_id in a new column.
bipartite['news_title_hash'] = bipartite['news_id'].apply(get_title_hash)
bipartite.head()

Unnamed: 0,news_id,user_id,title,news_type,news_title_hash
0,140,10,Federal Agents Make Massive Discovery at South...,Fake,A8E571
1,150,11,"ABC Reveals Trump Paid ""Hundreds Of Millions"" ...",Fake,F466AA
2,21,41,Anthony Weiner Under Federal Investigation for...,Real,9C8FAF
3,42,51,National poll: Clinton leads Trump by 6,Real,B4EA27
4,51,114,Trump labels Clinton 'weak and ineffective' in...,Real,229D27


In [27]:
# Number of distinct hashes present in the edge list.
len(set(bipartite['news_title_hash'].tolist()))

37

Guardamos el dataset

In [28]:
# Write the enriched edge list (ids, title, news type and hash) to disk.
complete_output_path = "../DatosKaggle/BuzzFeed/sample/BipartiteUserNewsComplete_20210815.csv"
bipartite.to_csv(complete_output_path, index=False, encoding='utf-8-sig')

Deleting the news_id, title and news_type columns, as we don't need them

In [29]:
# Keep only the user id and the news hash. Selecting the wanted columns (instead of
# deleting news_id, title and news_type one by one) gives the same frame and lets this
# cell be re-run without a KeyError on columns that are already gone.
bipartite = bipartite[["user_id", "news_title_hash"]]

bipartite.head()

Unnamed: 0,user_id,news_title_hash
0,10,A8E571
1,11,F466AA
2,41,9C8FAF
3,51,B4EA27
4,114,229D27


In [30]:
# Write the reduced edge list (user id + news hash) to a CSV file.
hash_output_path = "../DatosKaggle/BuzzFeed/sample/BuzzFeedNewsUser_newshash_20210815.csv"
bipartite.to_csv(hash_output_path, index=False, encoding='utf-8-sig')