## Federated Data Pipeline

In [1]:
import torch
from torch import nn

In [2]:
class Client:
    """One federated client: a single user's ratings plus its local embeddings.

    Attributes are plain instance fields; ``items_rated`` is the frame passed
    to the constructor and ``items_embeddings`` is the slice of the shared
    item-embedding table restricted to the movies this user rated.
    """

    def __init__(self, client_id, data):
        """Store the user's ratings and create empty embedding slots.

        Args:
            client_id: Identifier kept as ``self.id``.
            data: Ratings frame for this user. The methods below read its
                ``movieId`` and ``rating`` columns.
        """
        self.items_embeddings = dict()
        self.user_embedding = None
        self.neighbor_embeddings = []
        self.items_rated = data
        self.old_embeddings = None
        self.id = client_id

    def initialize_embeddings(self, items_embeddings, embedding_dim=256):
        """Create a fresh user embedding and take a local copy of item embeddings.

        Args:
            items_embeddings: Frame with ``movieId`` and ``embeddings`` columns.
            embedding_dim: Width of the Xavier-initialised user embedding;
                defaults to 256, the value that used to be hard-coded.
        """
        self.user_embedding = torch.nn.init.xavier_uniform_(torch.empty(1, embedding_dim))
        self.update_embeddings(items_embeddings)

    def update_embeddings(self, items_embedding):
        """Keep only the embedding rows whose ``movieId`` this user has rated."""
        rated_mask = items_embedding['movieId'].isin(self.items_rated["movieId"])
        self.items_embeddings = items_embedding[rated_mask]

    def get_embeddings(self):
        """Return the locally held item-embedding frame."""
        return self.items_embeddings

    def get_rated_items_list(self):
        """Return the ``movieId`` column of this user's ratings."""
        return self.items_rated["movieId"]

    def generate_graph(self):
        """Build the star graph (user node 0, item nodes 1..n) for this client.

        Ratings and embeddings are joined on ``movieId`` so that item node
        ``i + 1`` and label ``y[i]`` always refer to the same movie, whatever
        row order the two frames happen to have. Rated movies without an
        embedding row are left out of the graph.

        Returns:
            tuple: ``(x, y, edge_index)`` where ``x`` is a float tensor with the
            user embedding in row 0 followed by one row per item, ``y`` holds
            the ratings in the same item order, and ``edge_index`` is a
            ``(2, 2n)`` long tensor with user->item edges followed by
            item->user edges.
        """
        # Inner join keeps the row order of the ratings frame.
        rated = self.items_rated.merge(
            self.items_embeddings[['movieId', 'embeddings']],
            on='movieId',
            how='inner',
        )
        item_count = len(rated)

        user_side = [0] * item_count
        item_side = list(range(1, item_count + 1))
        edge_index = torch.tensor([user_side + item_side,
                                   item_side + user_side], dtype=torch.long)

        # Stack per-row tensors instead of calling torch.tensor on a list of
        # ndarrays, which is the slow path PyTorch warns about.
        rows = [self.user_embedding.detach().cpu()[0].to(torch.float)]
        rows.extend(torch.as_tensor(emb, dtype=torch.float) for emb in rated['embeddings'])
        x = torch.stack(rows)

        y = torch.tensor(rated['rating'].values)

        return x, y, edge_index

    def __str__(self):
        """Show the client as the repr of its ratings frame."""
        return repr(self.items_rated)
        

In [3]:
class Server:
    """Central holder of the item catalogue and its shared embedding table."""

    def __init__(self, items):
        """Keep the item catalogue; embeddings stay empty until generated.

        Args:
            items: Frame of items; ``generate_item_embeddings`` reads its
                ``movieId`` column and its row count.
        """
        self.items = items
        self.items_embeddings = dict()

    def generate_item_embeddings(self, embedding_dim=256):
        """Create one Xavier-initialised embedding per catalogue row.

        The table is built positionally from the ``movieId`` values, so the
        result has exactly one row per item even when ``self.items`` carries a
        non-default index (an index-aligned concat would yield NaNs there).

        Args:
            embedding_dim: Width of each embedding vector; defaults to 256,
                the value that used to be hard-coded.

        Returns:
            pandas.DataFrame: Columns ``movieId`` and ``embeddings`` (one
            float32 ndarray per row). Also stored on ``self.items_embeddings``.
        """
        weights = torch.nn.init.xavier_uniform_(
            torch.empty(self.items.shape[0], embedding_dim)
        )
        table = pd.DataFrame({"movieId": self.items["movieId"].to_numpy()})
        table["embeddings"] = list(weights.numpy())
        self.items_embeddings = table
        return self.items_embeddings

    def get_item_embeddings(self):
        """Return the current item-embedding table."""
        return self.items_embeddings
        

In [4]:
import numpy as np
import pandas as pd
import random

class Pipeline:
    """Wires one Server and a set of Clients together from the MovieLens CSVs."""

    def __init__(self):
        """Start with no server and no clients."""
        self.server = None
        self.clients = None

    def initialize_clients(self, client_count=128, sample_users=False):
        """Create one Client per selected user and seed it with item embeddings.

        Args:
            client_count: Number of clients to create.
            sample_users: When False (default) the ``client_count`` smallest
                user ids present in the ratings file are used, which for
                contiguous ids starting at 1 matches the previous
                ``1..client_count`` behaviour. When True the users are drawn
                with ``random.sample`` (the earlier code drew a sample but
                discarded it).

        Returns:
            pandas.DataFrame: Single column ``clients`` holding the Client
            objects; also stored on ``self.clients``.

        Raises:
            RuntimeError: If ``initialize_server`` has not been called yet.
            ValueError: If ``client_count`` exceeds the number of users.
        """
        if self.server is None:
            raise RuntimeError("initialize_server() must be called before initialize_clients()")

        ratings = pd.read_csv('../ml-latest-small/ratings.csv').drop(columns='timestamp')

        # Split once per user instead of re-scanning the whole frame per client.
        ratings_by_user = {int(uid): frame for uid, frame in ratings.groupby('userId')}
        user_ids = sorted(ratings_by_user)
        if client_count > len(user_ids):
            raise ValueError(
                f"client_count={client_count} exceeds the {len(user_ids)} users available"
            )

        if sample_users:
            chosen_ids = random.sample(user_ids, client_count)
        else:
            chosen_ids = user_ids[:client_count]

        item_embeddings = self.server.get_item_embeddings()
        created = []
        for user_id in chosen_ids:
            client = Client(user_id, ratings_by_user[user_id])
            client.initialize_embeddings(item_embeddings)
            created.append(client)

        self.clients = pd.DataFrame(created, columns=['clients'])
        return self.clients

    def initialize_server(self):
        """Load the movie catalogue, create the Server and its embeddings.

        Returns:
            The item-embedding table produced by the server.
        """
        items = pd.read_csv('../ml-latest-small/movies.csv')
        self.server = Server(items)
        return self.server.generate_item_embeddings()

    def get_embeddings(self):
        """Return the server's current item-embedding table."""
        return self.server.get_item_embeddings()


## GAT Implementation

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATv2Conv, GATConv
from torch.autograd import Variable

In [6]:
class GAT(torch.nn.Module):
    """Single-layer graph-attention model that scores a user against its items.

    Node 0 of the input graph is treated as the user and every other node as
    an item; the prediction for an item is the ReLU of the dot product between
    the user's and the item's hidden representations.
    """

    def __init__(self, in_channels=256, hidden_channels=256, heads=4, dropout=0.2):
        """Build the attention layer.

        Args:
            in_channels: Width of the input node features.
            hidden_channels: Output width per attention head.
            heads: Number of attention heads of the layer.
            dropout: Probability used both for the input feature dropout and
                for the attention layer's own dropout.

        The defaults reproduce the values that used to be hard-coded.
        """
        super().__init__()
        self.hidden_channels = hidden_channels
        self.headsv1 = heads
        # Not used by forward(); kept so existing attribute access still works.
        self.headsv2 = 1
        self.dropout = dropout

        self.conv1 = GATConv(in_channels=in_channels, out_channels=self.hidden_channels,
                             heads=self.headsv1, dropout=self.dropout)

    def forward(self, data):
        """Run the layer and score every item node against node 0.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            tuple: ``(x_in, y)``. ``x_in`` is a detached alias of ``data.x``
            with ``requires_grad=True`` so the caller can read the gradient
            with respect to the input embeddings; ``y`` is a float64 tensor
            with one non-negative score per item node.
        """
        features, edge_index = data.x, data.edge_index
        # Leaf tensor sharing storage with data.x (replaces the legacy Variable wrapper).
        x_in = features.detach().requires_grad_(True)
        hidden = F.dropout(x_in, p=self.dropout, training=self.training)
        hidden = self.conv1(hidden, edge_index)
        hidden = F.elu(hidden)

        # Row 0 (user) is broadcast against rows 1.. (items).
        y = hidden[0, :] * hidden[1:, :]
        y = torch.sum(y, dim=1, dtype=float)
        y = F.relu(y)
        # Under torch.no_grad() y has no graph and retain_grad() would raise.
        if y.requires_grad:
            y.retain_grad()
        return x_in, y
    
    

## Driver Code for Training on Client

In [7]:
import torch
from torch_geometric.data import Data

In [8]:
# The server has to exist first: initialize_clients() hands every client the
# server's item-embedding table.
pipeline = Pipeline()
server = pipeline.initialize_server()  # this is the item-embedding table itself
client_frame = pipeline.initialize_clients()
clients = client_frame['clients'].values

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# print(f"Edge count: {data.num_edges}\nNode Count: {data.num_nodes}\nNode Features: {data.num_features}")

In [9]:
# Train the GAT on the first client's graph. Besides the optimizer step on the
# model weights, the input node embeddings stored in data.x receive a plain
# gradient-descent step every epoch.
lr = 0.01
n_epochs = 300
log_every = 50

model = GAT().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
criterion = nn.MSELoss()  # built once; the loop turns it into RMSE
model.train()

x, y, edge_index = clients[0].generate_graph()
data = Data(x=x, edge_index=edge_index).to(device)
# Targets must live on the same device as the model output, otherwise the
# loss raises a device-mismatch error when CUDA is in use.
y = y.to(device)

for epoch in range(n_epochs):
    optimizer.zero_grad()
    # x_in is the grad-enabled alias of data.x returned by the model.
    x_in, out = model(data)

    loss = torch.sqrt(criterion(out, y))
    if epoch % log_every == 0:
        print(f"Epoch {epoch} Loss:\n{loss}")

    loss.backward()
    # Only the model parameters are clipped; the embedding gradient is used as is.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()
    with torch.no_grad():
        data.x -= lr * x_in.grad
        
    

  x = torch.tensor(x, dtype=torch.float)


Epoch 0 Loss:
4.430629292758412
Epoch 50 Loss:
0.6968105855535429
Epoch 100 Loss:
0.3507655472768589
Epoch 150 Loss:
0.43328798527692974
Epoch 200 Loss:
0.27433861626532857
Epoch 250 Loss:
0.4359075767340519


## Evaluating Model

In [10]:
# NOTE: `data` and `y` are the same graph and ratings the training cell
# optimised on, so the figure printed below is accuracy on the training data.
model.eval()
_, pred = model(data)
# Round the scores to whole ratings; detach() drops the autograd graph.
pred = torch.round(pred.detach())

print("Actual: ", y[:10])
print("Predicted: ", pred[:10])

# Compare on the predictions' device so this also works when the model runs on CUDA.
y_on_device = y.to(pred.device)
correct = float(pred.eq(y_on_device).sum().item())
acc = correct / len(y)
print('Accuracy: {:.4f}'.format(acc))

Actual:  tensor([4., 4., 4., 5., 5., 3., 5., 4., 5., 5.], dtype=torch.float64)
Predicted:  tensor([4., 4., 4., 5., 5., 3., 5., 4., 5., 5.], dtype=torch.float64)
Accuracy: 1.0000
