In [14]:
import pandas as pd
from sklearn.metrics import *
from tqdm import tqdm
from utils import Hetero_Data_Processor_Transfer_Learning
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [15]:
# Dataset names handed to the transfer-learning processor.
train_dataset = 'charlie_hebdo'
test_dataset = 'ottawashooting'

# Cut-off passed to the processor: 3 * 24 * 60 = 4320
# (presumably minutes, i.e. three days -- TODO confirm against utils).
time_cut = 3 * 24 * 60

processor = Hetero_Data_Processor_Transfer_Learning(
    train_dataset,
    test_dataset,
    time_cut=time_cut,
    test_size=0.4,
)
data = processor.process()

rumour
1    180
0    163
Name: count, dtype: int64


In [16]:
# Bare last expression: display the processed graph object returned by processor.process().
data

HeteroData(
  id={
    x=[2859, 106],
    y=[2859],
    train_mask=[2859],
    val_mask=[2859],
    test_mask=[2859],
  },
  reply_user_id={ x=[26437, 104] },
  (id, retweet, reply_user_id)={ edge_index=[2, 26437] },
  (reply_user_id, rev_retweet, id)={ edge_index=[2, 26437] }
)

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, to_hetero

class GAT(torch.nn.Module):
    """Two stacked GATConv layers followed by a linear read-out.

    Args:
        dim_h: output width of the first attention layer.
        dim_i: output width of the second attention layer.
        dim_out: width of the final linear layer's output.
    """

    def __init__(self, dim_h,dim_i, dim_out):
        super().__init__()
        # Layers are created in the same order as before so parameter
        # registration (and therefore random initialisation order) is unchanged.
        # in_channels=(-1, -1): source/target input sizes are inferred on first use.
        self.conv1 = GATConv(in_channels=(-1, -1), out_channels=dim_h, add_self_loops=False)
        self.conv2 = GATConv(in_channels=dim_h, out_channels=dim_i, add_self_loops=False)
        self.linear = nn.Linear(in_features=dim_i, out_features=dim_out)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, edge_index):
        """Apply conv -> ReLU -> dropout twice, then the linear layer."""
        hidden = self.conv1(x, edge_index)
        hidden = self.dropout(hidden.relu())
        hidden = self.conv2(hidden, edge_index)
        hidden = self.dropout(hidden.relu())
        return self.linear(hidden)

@torch.no_grad()
def test(mask):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
    acc = (pred[mask] == data['id'].y[mask]).sum() / mask.sum()
    return float(acc)

In [18]:

# Build the GAT and convert it for the heterogeneous graph described by `data`.
model = to_hetero(GAT(dim_h=64, dim_i=32, dim_out=2), data.metadata(), aggr='sum')

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
model = model.to(device)

for epoch in range(500):
    # One full-batch optimisation step on the training nodes.
    model.train()
    optimizer.zero_grad()
    mask = data['id'].train_mask
    out = model(data.x_dict, data.edge_index_dict)['id']
    loss = F.cross_entropy(out[mask], data['id'].y[mask])
    loss.backward()
    optimizer.step()

    # Report progress only every 50th epoch.
    if epoch % 50 != 0:
        continue
    train_acc = test(data['id'].train_mask)
    val_acc = test(data['id'].val_mask)
    print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Final accuracy on the held-out test nodes.
test_acc = test(data['id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')
    


Epoch:   0 | Train Loss: 0.8086 | Train Acc: 66.97% | Val Acc: 57.56%
Epoch:  50 | Train Loss: 0.4204 | Train Acc: 80.68% | Val Acc: 72.09%
Epoch: 100 | Train Loss: 0.3491 | Train Acc: 84.86% | Val Acc: 76.16%
Epoch: 150 | Train Loss: 0.2978 | Train Acc: 87.60% | Val Acc: 77.91%
Epoch: 200 | Train Loss: 0.2552 | Train Acc: 89.51% | Val Acc: 79.65%
Epoch: 250 | Train Loss: 0.2271 | Train Acc: 91.61% | Val Acc: 79.65%
Epoch: 300 | Train Loss: 0.1900 | Train Acc: 93.32% | Val Acc: 81.98%
Epoch: 350 | Train Loss: 0.1596 | Train Acc: 94.79% | Val Acc: 81.40%
Epoch: 400 | Train Loss: 0.1322 | Train Acc: 95.75% | Val Acc: 80.81%
Epoch: 450 | Train Loss: 0.1104 | Train Acc: 96.74% | Val Acc: 80.81%
Test accuracy: 73.10%


In [20]:
# Macro-averaged precision/recall of the trained GAT.
# NOTE(review): the mask is the union of the test AND validation nodes, so the
# numbers printed as "Test ..." below are not computed on the same nodes as
# `test_acc` (test_mask only) -- confirm this is intended.
test_mask = data['id'].test_mask | data['id'].val_mask

# Pure inference: make sure dropout is disabled and no autograd graph is built.
model.eval()
with torch.no_grad():
    pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)

# scikit-learn needs host-side arrays; `data`/`model` were moved to CUDA when
# available, and CUDA tensors cannot be converted to NumPy implicitly.
true_labels = data['id'].y[test_mask].cpu().numpy()
pred_labels = pred[test_mask].cpu().numpy()

test_precision = precision_score(true_labels, pred_labels, average='macro')
test_recall = recall_score(true_labels, pred_labels, average='macro')
print(f'Test Recall: {test_recall*100:.2f}%')
print(f'Test Precision: {test_precision*100:.2f}%')

Test Recall: 77.61%
Test Precision: 77.90%


In [8]:
import mlflow
import mlflow.pytorch

# Track runs in the local SQLite database, then select the experiment
# that subsequent runs are recorded under.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("GAT Network 2024-12-21 2 layers ferguson")

<Experiment: artifact_location='/workspaces/rumour-detection-gnn/mlruns/16', creation_time=1734873281509, experiment_id='16', last_update_time=1734873281509, lifecycle_stage='active', name='GAT Network 2024-12-21 2 layers ferguson', tags={}>

In [None]:
#
# Sweep over `time_cut`: for each value, rebuild the graph, train a fresh
# 2-layer GAT and record the run in MLflow.
train_dataset = 'charlie_hebdo'
test_dataset = 'ferguson'

# Hyperparameters are named once so the values used to build/train the model
# and the values logged to MLflow cannot drift apart.
DIM_H, DIM_I, DIM_OUT = 64, 32, 2
LEARNING_RATE = 0.001
NUM_EPOCHS = 400
EVAL_EVERY = 300

# Loop-invariant, so resolved once.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for time_cut in range(4320, (60*6*24), 60):
    print(time_cut)
    processor = Hetero_Data_Processor_Transfer_Learning(train_dataset, test_dataset, time_cut=time_cut, test_size=0.7)
    data = processor.process()

    model = GAT(dim_h=DIM_H, dim_i=DIM_I, dim_out=DIM_OUT)
    model = to_hetero(model, data.metadata(), aggr='sum')

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    data, model = data.to(device), model.to(device)

    with mlflow.start_run():
        mlflow.log_param("dim_h", DIM_H)
        mlflow.log_param("dim_i", DIM_I)
        mlflow.log_param("dim_out", DIM_OUT)
        mlflow.log_param("learning_rate", LEARNING_RATE)
        mlflow.log_param("epochs", NUM_EPOCHS)
        mlflow.log_param("train_dataset", train_dataset)
        mlflow.log_param("test_dataset", test_dataset)
        # Kept as a metric (as before) so existing runs stay comparable.
        mlflow.log_metric("time_cut", time_cut)

        for epoch in range(NUM_EPOCHS):
            model.train()
            optimizer.zero_grad()
            out = model(data.x_dict, data.edge_index_dict)['id']
            mask = data['id'].train_mask
            loss = F.cross_entropy(out[mask], data['id'].y[mask])
            loss.backward()
            optimizer.step()

            if epoch % EVAL_EVERY == 0:
                print(epoch)
                train_acc = test(data['id'].train_mask)
                val_acc = test(data['id'].val_mask)
                print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')
                # Log at the step the values were actually measured.
                mlflow.log_metric("train_loss", loss.item(), step=epoch)
                mlflow.log_metric("train_acc", train_acc, step=epoch)
                mlflow.log_metric("val_acc", val_acc, step=epoch)

        # Re-evaluate after the last epoch: previously the accuracies from the
        # last in-loop evaluation (epoch 300) were logged under the final step.
        train_acc = test(data['id'].train_mask)
        val_acc = test(data['id'].val_mask)
        test_acc = test(data['id'].test_mask)
        print(f'Test accuracy: {test_acc*100:.2f}%')

        # NOTE(review): precision/recall use test AND validation nodes, unlike
        # `test_acc` above -- confirm this is intended.
        test_mask = data['id'].test_mask | data['id'].val_mask
        model.eval()
        with torch.no_grad():
            pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
        # scikit-learn needs host-side arrays; tensors may live on CUDA here.
        true_labels = data['id'].y[test_mask].cpu().numpy()
        pred_labels = pred[test_mask].cpu().numpy()
        test_precision = precision_score(true_labels, pred_labels, average='macro')
        test_recall = recall_score(true_labels, pred_labels, average='macro')
        print(f'Test Recall: {test_recall*100:.2f}%')
        print(f'Test Precision: {test_precision*100:.2f}%')

        mlflow.log_metric("train_loss", loss.item(), step=epoch)
        mlflow.log_metric("train_acc", train_acc, step=epoch)
        mlflow.log_metric("val_acc", val_acc, step=epoch)
        mlflow.log_metric("test_acc", test_acc)
        mlflow.log_metric("test_precision", test_precision)
        mlflow.log_metric("test_recall", test_recall)

        mlflow.pytorch.log_model(model, f"GAT_model_{time_cut}")


In [21]:
from torch_geometric.nn import HANConv, Linear
from torch import nn

class HAN(nn.Module):
    """Two stacked HANConv layers followed by a linear classifier on ``'id'`` nodes.

    Args:
        dim_in: input channel spec forwarded to the first HANConv.
        dim_out: width of the final linear layer's output.
        dim_h: output width of both HANConv layers.
        heads: number of attention heads for both HANConv layers.
        metadata: graph metadata forwarded to HANConv. When omitted, falls back
            to ``data.metadata()`` from the notebook-level ``data`` (the
            previous, implicit behaviour), so existing calls keep working.
    """

    def __init__(self, dim_in, dim_out, dim_h=64, heads=4, metadata=None):
        super().__init__()
        if metadata is None:
            # Backward-compatible default: read the global graph object.
            metadata = data.metadata()
        self.han = HANConv(dim_in, dim_h, heads=heads, dropout=0.2, metadata=metadata)
        self.han2 = HANConv(dim_h, dim_h, heads=heads, dropout=0.2, metadata=metadata)
        self.linear = nn.Linear(dim_h, dim_out)

    def forward(self, x_dict, edge_index_dict):
        """Return the linear layer's output for the ``'id'`` node type."""
        out = self.han(x_dict, edge_index_dict)
        out = self.han2(out, edge_index_dict)
        out = self.linear(out['id'])
        return out
    
@torch.no_grad()
def test(mask):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)
    acc = (pred[mask] == data['id'].y[mask]).sum() / mask.sum()
    return float(acc)

In [22]:
# Instantiate the HAN classifier; dim_in=-1 is forwarded to HANConv as-is.
model = HAN(dim_in=-1, dim_out=2)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
model = model.to(device)

for epoch in range(250):
    # One full-batch optimisation step on the training nodes.
    model.train()
    optimizer.zero_grad()
    mask = data['id'].train_mask
    out = model(data.x_dict, data.edge_index_dict)
    loss = F.cross_entropy(out[mask], data['id'].y[mask])
    loss.backward()
    optimizer.step()

    # Report progress only every 20th epoch.
    if epoch % 20 != 0:
        continue
    train_acc = test(data['id'].train_mask)
    val_acc = test(data['id'].val_mask)
    print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Final accuracy on the held-out test nodes.
test_acc = test(data['id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Epoch:   0 | Train Loss: 0.7939 | Train Acc: 30.45% | Val Acc: 41.28%
Epoch:  20 | Train Loss: 0.5910 | Train Acc: 71.22% | Val Acc: 58.72%
Epoch:  40 | Train Loss: 0.4654 | Train Acc: 79.09% | Val Acc: 66.86%
Epoch:  60 | Train Loss: 0.3772 | Train Acc: 82.47% | Val Acc: 71.51%
Epoch:  80 | Train Loss: 0.3504 | Train Acc: 84.34% | Val Acc: 76.16%
Epoch: 100 | Train Loss: 0.3286 | Train Acc: 85.37% | Val Acc: 78.49%
Epoch: 120 | Train Loss: 0.3106 | Train Acc: 86.29% | Val Acc: 77.91%
Epoch: 140 | Train Loss: 0.2927 | Train Acc: 87.32% | Val Acc: 79.65%
Epoch: 160 | Train Loss: 0.2721 | Train Acc: 88.00% | Val Acc: 80.23%
Epoch: 180 | Train Loss: 0.2542 | Train Acc: 88.83% | Val Acc: 81.98%
Epoch: 200 | Train Loss: 0.2293 | Train Acc: 89.71% | Val Acc: 82.56%
Epoch: 220 | Train Loss: 0.2106 | Train Acc: 91.18% | Val Acc: 83.72%
Epoch: 240 | Train Loss: 0.1915 | Train Acc: 92.09% | Val Acc: 83.14%
Test accuracy: 71.93%


In [24]:
# Macro-averaged precision/recall of the trained HAN.
# NOTE(review): the mask is the union of the test AND validation nodes, so the
# numbers printed as "Test ..." below are not computed on the same nodes as
# `test_acc` (test_mask only) -- confirm this is intended.
test_mask = data['id'].test_mask | data['id'].val_mask

# Pure inference: make sure dropout is disabled and no autograd graph is built.
model.eval()
with torch.no_grad():
    pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)

# scikit-learn needs host-side arrays; `data`/`model` were moved to CUDA when
# available, and CUDA tensors cannot be converted to NumPy implicitly.
true_labels = data['id'].y[test_mask].cpu().numpy()
pred_labels = pred[test_mask].cpu().numpy()

test_precision = precision_score(true_labels, pred_labels, average='macro')
test_recall = recall_score(true_labels, pred_labels, average='macro')
print(f'Test Recall: {test_recall*100:.2f}%')
print(f'Test Precision: {test_precision*100:.2f}%')

Test Recall: 78.23%
Test Precision: 80.01%
