In [2]:
from google.cloud import bigquery
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import GraphSAGE
from torch_geometric.data import Data
import time

# 1. Data pre-processing in BigQuery
1. Get one week of data from this publicly available BigQuery table: gdelt-bq.gdeltv2.geg_gcnlapi
2. Create hosts table, articles table and entities table, with node ids assigned
3. Create 2 edge index tables: host to article and article to entity

TODO: Describe GDELT and GEG dataset, and the time range selected  
TODO: List tools to use: BigQuery, PyTorch, PyG

# 2. Read pre-processed data from BigQuery into Dataframes
Hosts table, article table, entities table, and edge indices between them.

TODO: Describe the graph: 3 types of nodes + 2 types of edges

In [3]:
# BigQuery API client, created for the "US" location. It is the `bq` argument
# expected by the *_from_bq / load_* helper functions in this notebook.
bq = bigquery.Client(location="US")


In [4]:
def nodes_tables_from_bq(bq):
    """Fetch the three node tables from BigQuery as DataFrames.

    Runs one ``SELECT *`` query per node table (hosts, articles, entities),
    each with ``location="US"``, and converts every result to a DataFrame.

    Args:
        bq: Client object exposing ``query(sql, location=...)``; the returned
            job must provide ``to_dataframe()``.

    Returns:
        Tuple ``(hosts, articles, entities)`` of DataFrames, in that order.
    """
    # One full-table query per node type, listed in return order.
    sql_by_node_type = {
        'hosts': """  
        SELECT * 
        FROM `gannett-datarevenue.zz_test_pang.hosts_table` 
        """,
        'articles': """  
        SELECT * 
        FROM `gannett-datarevenue.zz_test_pang.articles_table` 
        """,
        'entities': """  
        SELECT * 
        FROM `gannett-datarevenue.zz_test_pang.entities_table` 
        """,
    }

    frames = []
    for sql in sql_by_node_type.values():
        job = bq.query(sql, location="US")
        frames.append(job.to_dataframe())

    hosts, articles, entities = frames
    return hosts, articles, entities

In [5]:
# use local csv files when possible 
def load_nodes_tables(bq):
    """Load the hosts, articles and entities node tables as DataFrames.

    The three CSV files under ``data/`` act as a local cache. When all of them
    exist they are read with ``pd.read_csv``; otherwise the tables are fetched
    with ``nodes_tables_from_bq(bq)`` and written to the cache.

    Args:
        bq: BigQuery client forwarded to ``nodes_tables_from_bq``. Only used
            when at least one cache file is missing.

    Returns:
        Tuple ``(hosts, articles, entities)`` of DataFrames.
    """
    csv_paths = (
        'data/hosts_table.csv',
        'data/articles_table.csv',
        'data/entities_table.csv',
    )

    if all(os.path.isfile(path) for path in csv_paths):
        print("Read from local csv files.")
        hosts, articles, entities = (pd.read_csv(path) for path in csv_paths)
    else:
        print("Read from BigQuery.")
        hosts, articles, entities = nodes_tables_from_bq(bq)
        # The cache directory may not exist on a fresh checkout.
        os.makedirs('data', exist_ok=True)
        for table, path in zip((hosts, articles, entities), csv_paths):
            # index=False: otherwise the row index is written as an extra
            # column and comes back as "Unnamed: 0" on the next cached read.
            table.to_csv(path, index=False)

    return hosts, articles, entities

In [6]:
# Load nodes tables into dataframes. The local csv cache is used when present;
# otherwise the tables are queried through the BigQuery client `bq`.
hosts_table, articles_table, entities_table = load_nodes_tables(bq=bq)

# Sort the dataframes by node id. Later cells build the feature matrices row by
# row from these tables and add per-type offsets to the node ids in the edge
# tables, so row i of each table has to describe node id i.
hosts_table = hosts_table.sort_values(by=['host_node_id'], ascending=True).reset_index(drop=True)
articles_table = articles_table.sort_values(by=['article_node_id'], ascending=True).reset_index(drop=True)
entities_table = entities_table.sort_values(by=['entity_node_id'], ascending=True).reset_index(drop=True)

# Fail early if the ids of any table are not exactly 0..n-1 after sorting.
for node_table, id_column in (
    (hosts_table, 'host_node_id'),
    (articles_table, 'article_node_id'),
    (entities_table, 'entity_node_id'),
):
    assert np.array_equal(node_table[id_column].to_numpy(), np.arange(len(node_table))), \
        f"{id_column} must be exactly 0..n-1"

print(hosts_table.shape, articles_table.shape, entities_table.shape)

# Display the first 3 rows
display(hosts_table.head(3))
display(articles_table.head(3))
display(entities_table.head(3))


Read from local csv files.
(20033, 3) (1048852, 7) (1289112, 4)


Unnamed: 0.1,Unnamed: 0,host_node_id,host
0,3605,0,www.berchtesgadener-anzeiger.de
1,18421,1,www.toponline.ch
2,14060,2,news.yahoo.com


Unnamed: 0.1,Unnamed: 0,article_node_id,host,url,lang,magnitude,score
0,63332,0,www.workersliberty.org,https://www.workersliberty.org/index.php/audio,en,15.9,0.0
1,695255,1,www.wkyc.com,https://www.wkyc.com/article/news/nation-world...,en,24.6,-0.4
2,678766,2,townhall.com,https://townhall.com/tipsheet/mattvespa/2023/1...,en,13.2,-0.4


Unnamed: 0.1,Unnamed: 0,entity_node_id,mid,type
0,710406,0,/m/0138vk,LOCATION
1,239295,1,/m/02phmc9,LOCATION
2,1224603,2,/m/04gsnk1,ORGANIZATION


## 2.1 Nodes
Node features:
1. Hosts (news outlets, identified by host) - random node features, because hosts do not have rich features
2. Articles (identified by url) - lang, magnitude, score
3. Entities (identified by mid) - type

Dimension of node features = 12


In [7]:
# node feature dim (shared by hosts, articles and entities)
node_feature_dim = 12

# Use random node features for hosts, because hosts do not have rich features.
# A dedicated seeded generator makes the features identical after every
# kernel restart without touching torch's global RNG state.
host_feature_generator = torch.Generator().manual_seed(0)
host_features = torch.randn(
    hosts_table.shape[0],
    node_feature_dim,
    generator=host_feature_generator,
)
host_features.shape

torch.Size([20033, 12])

In [8]:
# features for article: one-hot encoded lang, then magnitude and score
unique_langs, article_langs = np.unique(articles_table['lang'].values, return_inverse=True)

# Request float32 explicitly: pandas >= 2.0 returns boolean dummies by default,
# and mixing those with float columns yields an object array that
# torch.from_numpy cannot convert.
lang_one_hot = pd.get_dummies(article_langs, dtype=np.float32).to_numpy()
article_numeric = articles_table[['magnitude', 'score']].to_numpy(dtype=np.float32)

article_features = torch.from_numpy(np.hstack([lang_one_hot, article_numeric]))
# All node types are concatenated into one matrix later, so widths must agree.
assert article_features.shape[1] == node_feature_dim, \
    f"article features have {article_features.shape[1]} columns, expected {node_feature_dim}"
article_features.shape

torch.Size([1048852, 12])

In [9]:
# features for entities: one-hot encoded type, zero-padded to node_feature_dim
unique_types, entity_types = np.unique(entities_table['type'].values, return_inverse=True)

# Request float32 explicitly: pandas >= 2.0 returns boolean dummies by default,
# which combined with integer padding columns would give an object array that
# torch.from_numpy cannot convert.
type_one_hot = torch.from_numpy(pd.get_dummies(entity_types, dtype=np.float32).to_numpy())

# Derive the padding width instead of hard-coding five zero columns.
entity_padding = node_feature_dim - type_one_hot.shape[1]
assert entity_padding >= 0, \
    f"{type_one_hot.shape[1]} entity types do not fit into {node_feature_dim} feature columns"

# Append `entity_padding` zero columns on the right of the last dimension.
entity_features = F.pad(type_one_hot, (0, entity_padding))
entity_features.shape

torch.Size([1289112, 12])

In [10]:
# Stack the per-type feature matrices row-wise into one node feature matrix:
# hosts first, then articles, then entities.
node_feature_blocks = [host_features, article_features, entity_features]
combined_nodes_features = torch.cat(node_feature_blocks, dim=0)
combined_nodes_features.shape

torch.Size([2357997, 12])

## 2.2 Edge Indices
Two types of edges: 
1. host to article
2. article to entity


In [11]:
def edge_indices_from_bq(bq):
    """Fetch the two edge index tables from BigQuery as DataFrames.

    Runs one ``SELECT *`` query per edge table (host-to-article, then
    article-to-entity), each with ``location="US"``.

    Args:
        bq: Client object exposing ``query(sql, location=...)``; the returned
            job must provide ``to_dataframe()``.

    Returns:
        Tuple ``(host_article, article_entity)`` of DataFrames.
    """
    # Full-table queries, listed in return order.
    edge_queries = (
        """  
        SELECT * 
        FROM `gannett-datarevenue.zz_test_pang.edge_index_host_to_article` 
        """,
        """  
        SELECT * 
        FROM `gannett-datarevenue.zz_test_pang.edge_index_article_to_entity` 
        """,
    )

    edge_frames = []
    for sql in edge_queries:
        job = bq.query(sql, location="US")
        edge_frames.append(job.to_dataframe())

    host_article, article_entity = edge_frames
    return host_article, article_entity
    

In [12]:
# use local csv files when possible 
def load_edge_indices(bq):
    """Load the host-to-article and article-to-entity edge tables.

    The two CSV files under ``data/`` act as a local cache. When both exist
    they are read with ``pd.read_csv``; otherwise the tables are fetched with
    ``edge_indices_from_bq(bq)`` and written to the cache.

    Args:
        bq: BigQuery client forwarded to ``edge_indices_from_bq``. Only used
            when at least one cache file is missing.

    Returns:
        Tuple ``(host_article, article_entity)`` of DataFrames.
    """
    csv_paths = (
        'data/edge_index_host_to_article.csv',
        'data/edge_index_article_to_entity.csv',
    )

    if all(os.path.isfile(path) for path in csv_paths):
        print("Read from local csv files.")
        host_article, article_entity = (pd.read_csv(path) for path in csv_paths)
    else:
        print("Read from BigQuery.")
        host_article, article_entity = edge_indices_from_bq(bq)
        # The cache directory may not exist on a fresh checkout.
        os.makedirs('data', exist_ok=True)
        # index=False: otherwise the row index is written as an extra column
        # and comes back as "Unnamed: 0" on the next cached read.
        host_article.to_csv(csv_paths[0], index=False)
        article_entity.to_csv(csv_paths[1], index=False)

    return host_article, article_entity

In [13]:
# Load edge indices into dataframes. The local csv cache is used when present;
# otherwise the tables are queried through the BigQuery client `bq`.
host_article, article_entity = load_edge_indices(bq=bq)
print(host_article.shape, article_entity.shape)

# Assign global node ids to nodes: hosts keep their ids, article ids are
# shifted past the hosts, entity ids past hosts and articles.
article_node_id_offset = hosts_table.shape[0]
entity_node_id_offset = hosts_table.shape[0] + articles_table.shape[0]

host_article['src'] = host_article['host_node_id']
host_article['trg'] = host_article['article_node_id'] + article_node_id_offset
host_article = host_article.drop(columns=['host_node_id', 'article_node_id'])

article_entity['src'] = article_entity['article_node_id'] + article_node_id_offset
article_entity['trg'] = article_entity['entity_node_id'] + entity_node_id_offset
article_entity = article_entity.drop(columns=['entity_node_id', 'article_node_id'])

# Combine the edges
combined_edges = pd.concat([host_article, article_entity])
print(combined_edges.shape)

# Sanity checks on the global ids. Use the vectorized column max/min: the
# Python builtin max() would iterate over tens of millions of numpy scalars.
total_num_nodes = hosts_table.shape[0] + articles_table.shape[0] + entities_table.shape[0]
assert combined_edges['trg'].max() == total_num_nodes - 1
assert combined_edges['src'].min() >= 0 and combined_edges['trg'].min() >= 0
assert combined_edges['src'].max() < total_num_nodes


Read from local csv files.
(1048852, 2) (39171153, 2)
(40220005, 2)


In [14]:
# Build `edge_index` in COO format: row 0 holds the source node ids,
# row 1 the target node ids, one column per edge.
src, trg = (
    torch.from_numpy(combined_edges[endpoint].to_numpy())
    for endpoint in ('src', 'trg')
)
edge_index = torch.stack((src, trg), dim=0)
print(edge_index.shape)

expected_num_edges = host_article.shape[0] + article_entity.shape[0]
assert edge_index.size() == (2, expected_num_edges)


torch.Size([2, 40220005])


# 3. Create Data object

In [15]:
# Wrap the combined node features and the COO edge list in one PyG `Data` object.
# NOTE(review): edge_index was built only from host -> article and
# article -> entity pairs, i.e. one direction per edge. If information is meant
# to flow back toward hosts during message passing, reverse edges are probably
# needed (`torch_geometric.transforms` is imported as `T` but not used in the
# cells shown here) — TODO confirm the intended direction.
data = Data(x=combined_nodes_features, edge_index=edge_index)
data

Data(x=[2357997, 12], edge_index=[2, 40220005])

# 4. GraphSAGE Model

TODO: add paper URL, briefly describe model architecture

In [16]:
# Mini-batch loader over `data` for link-level training. The loader is created
# before `data` is moved to the device further down.
# NOTE(review): batch_size=4096000 is a very large hard-coded value — TODO
# confirm it is intentional and fits in memory.
# num_neighbors has one entry per hop, presumably matching num_layers=2 of the
# model below — TODO confirm.
train_loader = LinkNeighborLoader(
    data,
    batch_size=4096000,
    shuffle=True,
    neg_sampling_ratio=1.0,
    num_neighbors=[10, 10],
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Move only the 'x' and 'edge_index' attributes of `data` to the device.
data = data.to(device, 'x', 'edge_index')

# GraphSAGE encoder: input width taken from the node feature matrix,
# 64 hidden channels, 2 layers.
model = GraphSAGE(
    data.num_node_features,
    hidden_channels=64,
    num_layers=2,
).to(device)

# Adam optimizer over all model parameters with learning rate 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

cpu


In [17]:
def train():
    """Run one training epoch over ``train_loader`` and return the mean loss.

    For every batch the model embeds the batch nodes, each labelled edge is
    scored by the dot product of its two endpoint embeddings, and the scores
    are trained against ``batch.edge_label`` with binary cross-entropy on
    logits.

    Uses the notebook-level ``model``, ``train_loader``, ``optimizer`` and
    ``device``.

    Returns:
        float: Loss averaged over all scored edges of the epoch, or ``0.0``
        when the loader yields no batches.
    """
    model.train()

    total_loss = 0.0
    total_examples = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        h = model(batch.x, batch.edge_index)
        h_src = h[batch.edge_label_index[0]]
        h_dst = h[batch.edge_label_index[1]]
        # Dot product of the endpoint embeddings is the edge logit.
        pred = (h_src * h_dst).sum(dim=-1)
        loss = F.binary_cross_entropy_with_logits(pred, batch.edge_label)
        loss.backward()
        optimizer.step()
        # `loss` is a per-batch mean, so weight it by the number of scored
        # edges before accumulating.
        total_loss += float(loss) * pred.size(0)
        total_examples += pred.size(0)

    # Normalize by the number of scored edges, the same unit the sum was
    # accumulated in. Dividing by the node count would mis-scale the loss.
    return total_loss / total_examples if total_examples else 0.0

In [None]:
# Number of passes over the training loader.
NUM_EPOCHS = 100

times = []
for epoch in range(1, NUM_EPOCHS + 1):
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, ')
    times.append(time.perf_counter() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

TODO: get the top 10 news outlets in each of the 5 different regions.  
First visualize their positions in the reduced-dimension embedding plots of all hosts,  
and see whether they fall in the same clusters.  
https://pressgazette.co.uk/media-audience-and-business-data/media_metrics/most-popular-websites-news-world-monthly-2/  


In [18]:
# Hosts from the top 50 news sites
def top_hosts_from_bq(bq):
    """Fetch the ``top_hosts`` table from BigQuery as a DataFrame.

    Args:
        bq: Client object exposing ``query(sql, location=...)``; the returned
            job must provide ``to_dataframe()``.

    Returns:
        DataFrame with every row and column of the ``top_hosts`` table.
    """
    sql = """  
        SELECT * 
        FROM `gannett-datarevenue.zz_test_pang.top_hosts` 
        """
    return bq.query(sql, location="US").to_dataframe()

# use local csv files when possible 
def load_top_hosts(bq):
    """Load the top-hosts table as a DataFrame.

    ``data/top_hosts.csv`` acts as a local cache. When it exists it is read
    with ``pd.read_csv``; otherwise the table is fetched with
    ``top_hosts_from_bq(bq)`` and written to the cache.

    Args:
        bq: BigQuery client forwarded to ``top_hosts_from_bq``. Only used when
            the cache file is missing.

    Returns:
        DataFrame holding the top-hosts table.
    """
    csv_path = 'data/top_hosts.csv'

    if os.path.isfile(csv_path):
        print("Read from local csv files.")
        top_hosts = pd.read_csv(csv_path)
    else:
        print("Read from BigQuery.")
        top_hosts = top_hosts_from_bq(bq)
        # The cache directory may not exist on a fresh checkout.
        os.makedirs('data', exist_ok=True)
        # index=False: otherwise the row index is written as an extra column
        # and comes back as "Unnamed: 0" on the next cached read.
        top_hosts.to_csv(csv_path, index=False)

    return top_hosts
    

In [26]:
# Hosts of the top 50 news sites, ordered by host node id with a fresh 0..n-1 index.
top_hosts = (
    load_top_hosts(bq)
    .sort_values(by=['host_node_id'], ascending=True)
    .reset_index(drop=True)
)


Read from local csv files.


In [27]:
# Show the top-hosts table sorted by host_node_id.
top_hosts

Unnamed: 0.1,Unnamed: 0,host_node_id,host,top_site
0,123,2,news.yahoo.com,news.yahoo.com
1,105,3,abcnews.go.com,abcnews.go.com
2,24,7,www.foxnews.com,foxnews.com
3,107,8,bangaloremirror.indiatimes.com,indiatimes.com
4,34,125,www.huffpost.com,huffpost.com
...,...,...,...,...
146,81,19084,vikingswire.usatoday.com,usatoday.com
147,77,19335,ravenswire.usatoday.com,usatoday.com
148,136,19367,au.finance.yahoo.com,finance.yahoo.com
149,92,19590,raiderswire.usatoday.com,usatoday.com


In [21]:
# Inspect the host feature matrix. These are the random input features created
# with torch.randn earlier in the notebook, not embeddings produced by the model.
host_features

tensor([[ 1.0724, -2.2048, -0.2910,  ...,  1.4194, -0.2917,  1.8048],
        [ 0.3646, -0.0420, -0.5573,  ..., -0.8322, -0.5942,  0.6134],
        [-1.0792,  0.6814, -1.1628,  ...,  1.1982, -0.8369, -1.4425],
        ...,
        [-0.6074, -1.4191,  1.1454,  ..., -1.4777, -0.1048,  1.2570],
        [ 1.6267, -1.0371,  0.0356,  ..., -2.5519,  0.3613, -0.0524],
        [ 0.7078, -0.2421,  0.2402,  ..., -0.9356,  0.3039,  1.6956]])

In [25]:
# Select the feature rows of the top hosts through an explicit index tensor
# built from their host node ids, and show the resulting shape.
top_host_ids = torch.as_tensor(top_hosts['host_node_id'].to_numpy(), dtype=torch.long)
host_features[top_host_ids].shape

torch.Size([151, 12])