In [1]:
from collections import namedtuple
from itertools import product

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from torch_geometric.datasets import UPFD
from torch_geometric.loader import DataLoader
from tqdm.notebook import tqdm
from box import Box

from models import UPFDNet, UPFDSingle, MultiFeatureNet, ParallelFeatureNet
from dataset import MultiFeatureDataset, get_data
from utils import train, validate, test, train_multi, validate_multi, test_multi

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
# Hyperparameters shared by every experiment below. They are read as
# attributes (config.lr, config.epochs, ...); `l2` is passed to Adam as its
# weight_decay.
config = Box(dict(
    batch_size=128,
    epochs=35,
    lr=0.01,
    l2=0.01,
    hidden_dim=128,
    dropout=0.2,
))

In [4]:
# One row of a results table: which experiment ran, the metrics returned by
# the test step, and the mean/std of the per-epoch train and validation losses.
Result = namedtuple(
    'Result',
    [
        'name', 'feature', 'conv_layer',
        'accuracy', 'recall', 'precision', 'f1', 'roc_auc',
        'train_mean', 'train_std',
        'val_mean', 'val_std',
    ],
)

# UPFDNet Experiments

In [5]:
# Experiment grid: every dataset name x feature type x conv layer.
root = './datasets/UPFD'
names = ['politifact', 'gossipcop']
features = ['content', 'bert', 'profile', 'spacy']
conv_layers = ['gcn', 'sage', 'gat', 'parma']

# Same ordering as itertools.product: the rightmost factor varies fastest.
combinations = [
    (dataset_name, feature_type, layer_type)
    for dataset_name in names
    for feature_type in features
    for layer_type in conv_layers
]

In [6]:
results = []

# Train and evaluate a fresh UPFDNet for every (dataset, feature, conv layer)
# combination and collect one Result per run.
for name, feature, conv_layer in tqdm(combinations, desc="Combination"):
    train_dataset = UPFD(root=root, name=name, feature=feature, split='train')
    val_dataset = UPFD(root=root, name=name, feature=feature, split='val')
    test_dataset = UPFD(root=root, name=name, feature=feature, split='test')

    # Only the training split is shuffled; val/test keep a fixed order.
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)

    # Input/output sizes come from the dataset itself.
    model = UPFDNet(conv_layer=conv_layer,
                    in_dim=train_dataset.num_features,
                    hidden_dim=config.hidden_dim,
                    out_dim=train_dataset.num_classes,
                    concat=True,
                    dropout=config.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=config.l2)

    # Per-epoch losses; only their mean/std end up in the results table.
    train_losses = []
    val_losses = []

    for epoch in tqdm(range(config.epochs), desc="Training"):
        train_loss = train(model, optimizer, train_loader, device)
        train_losses.append(train_loss)

        val_loss = validate(model, val_loader, device)
        val_losses.append(val_loss)

    # The test split is evaluated once, after the final epoch.
    acc, recall, precision, f1, roc_auc = test(model, test_loader, device)
    train_mean, train_std = np.mean(train_losses), np.std(train_losses)
    val_mean, val_std = np.mean(val_losses), np.std(val_losses)

    curr_result = Result(name=name, feature=feature, conv_layer=conv_layer, accuracy=acc, recall=recall, precision=precision, f1=f1, roc_auc=roc_auc, train_mean=train_mean, train_std=train_std, val_mean=val_mean, val_std=val_std)
    results.append(curr_result)

Combination:   0%|          | 0/32 [00:00<?, ?it/s]

In [7]:
# Rank the collected runs from highest to lowest accuracy.
sorted(results, key=lambda run: run.accuracy, reverse=True)

[Result(name='gossipcop', feature='bert', conv_layer='gat', accuracy=0.9696811291165708, recall=0.9796450939457203, precision=0.9605936540429887, f1=0.9700258397932816, roc_auc=0.9696654789100329, train_mean=0.19738492453027343, train_std=0.15418018112730295, val_mean=0.19631492103849138, val_std=0.13622124563453017),
 Result(name='gossipcop', feature='content', conv_layer='gat', accuracy=0.9683742812336644, recall=0.9650313152400835, precision=0.9716237519705728, f1=0.9683163131709872, roc_auc=0.9683795319655915, train_mean=0.47219475333889327, train_std=0.2514528047130513, val_mean=0.5144080175885133, val_std=0.3031630096850024),
 Result(name='gossipcop', feature='spacy', conv_layer='sage', accuracy=0.9584422373235756, recall=0.9702505219206681, precision=0.9479857215706272, f1=0.9589889089502193, roc_auc=0.9584236902797058, train_mean=0.17093955313875564, train_std=0.07788061144900925, val_mean=0.16229910671710965, val_std=0.05793061496084811),
 Result(name='gossipcop', feature='ber

In [8]:
# Turn the collected Result tuples into a table and write it to CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf='./results/upfdnet.csv')

# UPFDSingle Experiments

In [None]:
results = []

# Sweep the full (dataset, feature, conv layer) grid with UPFDSingle and
# record one Result per trained model.
for name, feature, conv_layer in tqdm(combinations, desc="Combination"):
    # One UPFD dataset per split, built in train / val / test order.
    train_dataset, val_dataset, test_dataset = (
        UPFD(root=root, name=name, feature=feature, split=split_name)
        for split_name in ('train', 'val', 'test')
    )

    # Shuffle only the training batches.
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    val_loader, test_loader = (
        DataLoader(held_out, batch_size=config.batch_size, shuffle=False)
        for held_out in (val_dataset, test_dataset)
    )

    # UPFDSingle is given the feature type by name rather than an input size.
    model = UPFDSingle(
        conv_layer=conv_layer,
        feature_type=feature,
        hidden_dim=config.hidden_dim,
        out_dim=train_dataset.num_classes,
        dropout=config.dropout,
    ).to(device)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config.lr,
        weight_decay=config.l2,
    )

    # Loss history for this run, one entry per epoch.
    train_losses, val_losses = [], []

    for epoch in tqdm(range(config.epochs), desc="Training"):
        train_loss = train(model, optimizer, train_loader, device)
        train_losses.append(train_loss)

        val_loss = validate(model, val_loader, device)
        val_losses.append(val_loss)

    # Evaluate on the test split once training has finished.
    acc, recall, precision, f1, roc_auc = test(model, test_loader, device)
    train_mean = np.mean(train_losses)
    train_std = np.std(train_losses)
    val_mean = np.mean(val_losses)
    val_std = np.std(val_losses)

    curr_result = Result(
        name=name,
        feature=feature,
        conv_layer=conv_layer,
        accuracy=acc,
        recall=recall,
        precision=precision,
        f1=f1,
        roc_auc=roc_auc,
        train_mean=train_mean,
        train_std=train_std,
        val_mean=val_mean,
        val_std=val_std,
    )
    results.append(curr_result)

Combination:   0%|          | 0/32 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

In [None]:
# Rank the collected runs from highest to lowest accuracy.
sorted(results, key=lambda run: run.accuracy, reverse=True)

In [None]:
# Turn the collected Result tuples into a table and write it to CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf='./results/upfdsingle.csv')

# MultiFeatureNet Experiments

In [7]:
results = []

# Pairs of feature types that are fed to the model together.
feature_combs = [
    ['content', 'bert'],
    ['bert', 'profile'],
    ['profile', 'spacy']
]

# Number of target classes per dataset, read from a UPFD train split so this
# cell does not rely on a `train_dataset` variable left behind by another cell.
# Assumes the class count does not depend on the feature type -- TODO confirm.
num_classes_by_name = {
    dataset_name: UPFD(root=root, name=dataset_name, feature=feature_combs[0][0], split='train').num_classes
    for dataset_name in names
}

# Train and evaluate a fresh MultiFeatureNet for every
# (dataset, conv layer, feature pair) combination.
# The loop variable is `feature_comb` so that the notebook-level `features`
# list is not overwritten by this cell.
for name, conv_layer, feature_comb in tqdm(list(product(names, conv_layers, feature_combs)), desc="Combinations"):
    train_multidataset = MultiFeatureDataset(features=feature_comb, name=name, split='train')
    val_multidataset = MultiFeatureDataset(features=feature_comb, name=name, split='val')
    test_multidataset = MultiFeatureDataset(features=feature_comb, name=name, split='test')

    # Only the training split is shuffled; val/test keep a fixed order.
    multi_trainloader = DataLoader(train_multidataset, batch_size=config.batch_size, shuffle=True)
    multi_valloader = DataLoader(val_multidataset, batch_size=config.batch_size, shuffle=False)
    multi_testloader = DataLoader(test_multidataset, batch_size=config.batch_size, shuffle=False)

    # Dropout comes from the shared config, like in the single-feature experiments.
    model = MultiFeatureNet(features=feature_comb,
                            conv_layer=conv_layer,
                            hidden_dim=config.hidden_dim,
                            out_dim=num_classes_by_name[name],
                            dropout=config.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=config.l2)

    # Per-epoch losses; only their mean/std end up in the results table.
    train_losses = []
    val_losses = []

    for epoch in tqdm(range(config.epochs), desc="Training"):
        train_loss = train_multi(model, optimizer, multi_trainloader, device)
        train_losses.append(train_loss)

        val_loss = validate_multi(model, multi_valloader, device)
        val_losses.append(val_loss)

    # The test split is evaluated once, after the final epoch.
    acc, recall, precision, f1, roc_auc = test_multi(model, multi_testloader, device)
    train_mean, train_std = np.mean(train_losses), np.std(train_losses)
    val_mean, val_std = np.mean(val_losses), np.std(val_losses)

    # `feature` holds the whole feature pair (a list) for these runs.
    curr_result = Result(name=name, feature=feature_comb, conv_layer=conv_layer, accuracy=acc, recall=recall, precision=precision, f1=f1, roc_auc=roc_auc, train_mean=train_mean, train_std=train_std, val_mean=val_mean, val_std=val_std)
    results.append(curr_result)

Combinations:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

In [8]:
# Rank the collected runs from highest to lowest accuracy.
sorted(results, key=lambda run: run.accuracy, reverse=True)

[Result(name='politifact', feature=['content', 'bert'], conv_layer='sage', accuracy=0.8280542986425339, recall=0.8053097345132744, precision=0.8504672897196262, f1=0.8272727272727273, roc_auc=0.8285807931825632, train_mean=0.4392655481185232, train_std=0.2817128741260854, val_mean=0.720392210994448, val_std=0.21353789129461576),
 Result(name='politifact', feature=['content', 'bert'], conv_layer='gcn', accuracy=0.8054298642533937, recall=0.8141592920353983, precision=0.8070175438596491, f1=0.8105726872246696, roc_auc=0.8052277941658472, train_mean=0.5883523566382272, train_std=0.19124905806111944, val_mean=0.7141901799610683, val_std=0.29838252170877055),
 Result(name='politifact', feature=['profile', 'spacy'], conv_layer='gcn', accuracy=0.8054298642533937, recall=0.8230088495575221, precision=0.8017241379310345, f1=0.8122270742358079, roc_auc=0.8050229432972795, train_mean=0.43450512758323123, train_std=0.2002433282197705, val_mean=0.6843502887657711, val_std=0.2489312646282863),
 Resu

In [9]:
# Turn the collected Result tuples into a table and write it to CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf='./results/multifeature.csv')

# ParallelFeatureNet Experiments

In [8]:
results = []

# Pairs of feature types that are fed to the model together.
feature_combs = [
    ['content', 'bert'],
    ['bert', 'profile'],
    ['profile', 'spacy']
]

# Number of target classes per dataset, read from a UPFD train split so this
# cell does not rely on a `train_dataset` variable left behind by another cell.
# Assumes the class count does not depend on the feature type -- TODO confirm.
num_classes_by_name = {
    dataset_name: UPFD(root=root, name=dataset_name, feature=feature_combs[0][0], split='train').num_classes
    for dataset_name in names
}

# Train and evaluate a fresh ParallelFeatureNet for every
# (dataset, conv layer, feature pair) combination.
# The loop variable is `feature_comb` so that the notebook-level `features`
# list is not overwritten by this cell.
for name, conv_layer, feature_comb in tqdm(list(product(names, conv_layers, feature_combs)), desc="Combinations"):
    train_multidataset = MultiFeatureDataset(features=feature_comb, name=name, split='train')
    val_multidataset = MultiFeatureDataset(features=feature_comb, name=name, split='val')
    test_multidataset = MultiFeatureDataset(features=feature_comb, name=name, split='test')

    # Only the training split is shuffled; val/test keep a fixed order.
    multi_trainloader = DataLoader(train_multidataset, batch_size=config.batch_size, shuffle=True)
    multi_valloader = DataLoader(val_multidataset, batch_size=config.batch_size, shuffle=False)
    multi_testloader = DataLoader(test_multidataset, batch_size=config.batch_size, shuffle=False)

    # Dropout comes from the shared config, like in the single-feature experiments.
    model = ParallelFeatureNet(features=feature_comb,
                               conv_layer=conv_layer,
                               hidden_dim=config.hidden_dim,
                               out_dim=num_classes_by_name[name],
                               dropout=config.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr, weight_decay=config.l2)

    # Per-epoch losses; only their mean/std end up in the results table.
    train_losses = []
    val_losses = []

    for epoch in tqdm(range(config.epochs), desc="Training"):
        train_loss = train_multi(model, optimizer, multi_trainloader, device)
        train_losses.append(train_loss)

        val_loss = validate_multi(model, multi_valloader, device)
        val_losses.append(val_loss)

    # The test split is evaluated once, after the final epoch.
    acc, recall, precision, f1, roc_auc = test_multi(model, multi_testloader, device)
    train_mean, train_std = np.mean(train_losses), np.std(train_losses)
    val_mean, val_std = np.mean(val_losses), np.std(val_losses)

    # `feature` holds the whole feature pair (a list) for these runs.
    curr_result = Result(name=name, feature=feature_comb, conv_layer=conv_layer, accuracy=acc, recall=recall, precision=precision, f1=f1, roc_auc=roc_auc, train_mean=train_mean, train_std=train_std, val_mean=val_mean, val_std=val_std)
    results.append(curr_result)

Combinations:   0%|          | 0/24 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

Training:   0%|          | 0/35 [00:00<?, ?it/s]

In [9]:
# Rank the collected runs from highest to lowest accuracy.
sorted(results, key=lambda run: run.accuracy, reverse=True)

[Result(name='gossipcop', feature=['profile', 'spacy'], conv_layer='sage', accuracy=0.9231573444851019, recall=0.9008350730688935, precision=0.9431693989071038, f1=0.9215162840363054, roc_auc=0.9231924056443944, train_mean=0.41325491858380187, train_std=0.157092479251925, val_mean=0.3728804755210877, val_std=0.12467048910101622),
 Result(name='gossipcop', feature=['profile', 'spacy'], conv_layer='gat', accuracy=0.797961317302666, recall=0.6617954070981211, precision=0.9102656137832017, f1=0.7663946811725597, roc_auc=0.7981751904600553, train_mean=0.5782906344012608, train_std=0.050314190506717676, val_mean=0.5278170565196446, val_std=0.06888627621532191),
 Result(name='politifact', feature=['profile', 'spacy'], conv_layer='gat', accuracy=0.7963800904977375, recall=0.6902654867256637, precision=0.8863636363636364, f1=0.7761194029850746, roc_auc=0.7988364470665356, train_mean=0.3998149220432554, train_std=0.1592850138686564, val_mean=0.5820902381624494, val_std=0.13794624485975798),
 Res

In [10]:
# Turn the collected Result tuples into a table and write it to CSV.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf='./results/parallelfeature.csv')

In [None]:
# Loss curves for the most recently trained configuration only: train_losses
# and val_losses are reset at the start of every experiment-loop iteration,
# so they hold the per-epoch losses of the last run.
fig, ax = plt.subplots()
ax.plot(train_losses, label="Training loss")
ax.plot(val_losses, label="Validation loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training vs. validation loss (last trained configuration)")
ax.legend()
plt.show()

In [None]:
# Accuracy returned by the test step of the last configuration that finished
# in the most recently executed experiment loop.
acc

In [None]:
# Mean and standard deviation of the per-epoch training losses of the last
# configuration that finished in the most recently executed experiment loop.
train_mean, train_std

In [None]:
# Mean and standard deviation of the per-epoch validation losses of the last
# configuration that finished in the most recently executed experiment loop.
val_mean, val_std