In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from torch.nn.parameter import Parameter
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split

In [2]:
# Definir la clase del modelo GAT
          ## primero de la capa
class GraphAttentionLayer(torch.nn.Module):
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.alpha = alpha
        self.concat = concat
        
        self.W = torch.nn.Parameter(torch.zeros(size=(in_features, out_features)))
        self.a = torch.nn.Parameter(torch.zeros(size=(2*out_features, 1)))
        self.leakyrelu = torch.nn.LeakyReLU(self.alpha)
        
        self.init_parameters()
        
    def init_parameters(self):
        torch.nn.init.xavier_normal_(self.W.data, gain=1.414)
        torch.nn.init.xavier_normal_(self.a.data, gain=1.414)
        
    def forward(self, input, adj):
        print(input.shape)
        print(self.W.shape)
        h = torch.matmul(input, self.W)
        N = h.size()[0]
        
        a_input = torch.cat([h.repeat(1, N).view(N*N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2*self.out_features)
        e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))
        
        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        
        h_prime = torch.matmul(attention, h)
        
        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

In [3]:
    ## y despues del modelo
class GAT(torch.nn.Module):
        def __init__(self, n_features, n_classes, n_heads, dropout, alpha):
            super(GAT, self).__init__()
            self.dropout = dropout
            self.attentions = []
            self.out_att = GraphAttentionLayer(n_features, n_classes, dropout=dropout, alpha=alpha, concat=True)
        
            for _ in range(n_heads-1):
                self.attentions.append(GraphAttentionLayer(n_features, n_classes, dropout=dropout, alpha=alpha, concat=True))
        
        def forward(self, x, adj):
            x = F.dropout(x, self.dropout, training=self.training)
            x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
            x = F.dropout(x, self.dropout, training=self.training)
            print(x.shape)
            print(adj)
            x = F.elu(self.out_att(x, adj))
            return x

In [4]:
# Definir una clase para cargar los datos
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        x = self.features[index]
        y = self.labels[index]
        return x, y

    def __len__(self):
        return len(self.features)

In [5]:
# Cargar los datos desde los archivos CSV
df_features = pd.read_csv('trabajo raiz/large_twitch_features.csv')[:99]
df_edges = pd.read_csv('trabajo raiz/large_twitch_edges.csv')[:99]


In [6]:
print(df_features.head(50))

       views  mature  life_time  created_at  updated_at  numeric_id  \
0       7879       1        969  2016-02-16  2018-10-12           0   
1        500       0       2699  2011-05-19  2018-10-08           1   
2     382502       1       3149  2010-02-27  2018-10-12           2   
3        386       0       1344  2015-01-26  2018-10-01           3   
4       2486       0       1784  2013-11-22  2018-10-11           4   
5       4987       1       1288  2015-04-03  2018-10-12           5   
6        234       0        358  2017-09-14  2018-09-07           6   
7        775       1        577  2017-03-14  2018-10-12           7   
8      69020       0       1781  2013-11-22  2018-10-08           8   
9      32073       0        499  2017-05-31  2018-10-12           9   
10        94       0       1879  2013-08-17  2018-10-09          10   
11  10254468       0       1715  2014-01-31  2018-10-12          11   
12      2178       0        908  2016-04-16  2018-10-11          12   
13    

In [7]:
# Construir el grafo a partir de los datos de vértices y aristas
graph = nx.from_pandas_edgelist(df_edges, 'numeric_id_1', 'numeric_id_2', create_using=nx.Graph())
print(graph)

Graph with 100 nodes and 99 edges


In [8]:
##Codificacion de 'created_at', 'updated_at' y 'language'
from sklearn.preprocessing import OneHotEncoder

# Seleccionar las columnas de características categóricas
categorical_columns = ['created_at', 'updated_at', 'language']

# Codificar las columnas categóricas utilizando one-hot encoding
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df_features[categorical_columns])

# Obtener las columnas de características numéricas
numeric_columns = ['views', 'mature', 'life_time', 'numeric_id', 'dead_account']

# Combinar las características numéricas y codificadas en un solo array
features = np.concatenate([df_features[numeric_columns].values, encoded_features], axis=1)
print(features)

[[7.87900e+03 1.00000e+00 9.69000e+02 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [5.00000e+02 0.00000e+00 2.69900e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [3.82502e+05 1.00000e+00 3.14900e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 ...
 [2.57000e+02 0.00000e+00 7.75000e+02 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [4.77890e+04 1.00000e+00 3.23400e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]
 [2.15400e+03 1.00000e+00 2.07500e+03 ... 0.00000e+00 0.00000e+00
  0.00000e+00]]




In [9]:
# Normalizar características
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

In [10]:
# Convertir las características a tensores de PyTorch
features = torch.tensor(features, dtype=torch.float32)

In [11]:
# Convertir las etiquetas a tensores de PyTorch
labels = torch.tensor(df_features['affiliate'].values, dtype=torch.long)

In [12]:
# Obtener la matriz de adyacencia como un tensor de PyTorch
adj = nx.adjacency_matrix(graph)
print(adj.shape)
adj = torch.FloatTensor(adj.todense())
adj = torch.tensor(adj, dtype=torch.float32)

(100, 100)


  adj = torch.tensor(adj, dtype=torch.float32)


In [13]:
# Dividir los datos en conjuntos de entrenamiento, validación y prueba
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

In [14]:
# Crear las instancias del conjunto de datos
train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

In [15]:
# Crear los objetos DataLoader para cargar los datos en lotes
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [16]:
# Definir los parámetros del modelo y el entrenamiento
n_features = len(features[0])
n_classes = len(np.unique(labels))
n_heads = 8
dropout = 0.6
alpha = 0.2
learning_rate = 0.01
weight_decay = 5e-4
num_epochs = 100


In [17]:
# Crear el modelo GAT
model = GAT(n_features=n_features, n_classes=n_classes, n_heads=n_heads, dropout=dropout, alpha=alpha)


In [18]:
# Definir la función de pérdida y el optimizador
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [19]:
# Entrenamiento del modelo
model.train()
for epoch in range(num_epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data, adj)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

    # Validación durante el entrenamiento
    model.eval()
    with torch.no_grad():
        val_loss = 0
        correct = 0
        total = 0
        for data, target in val_loader:
            output = model(data, adj)
            val_loss += criterion(output, target).item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total

        print('Epoch: {}, Val Loss: {:.4f}, Val Accuracy: {:.2f}%'.format(epoch+1, val_loss, val_accuracy))

    model.train()

torch.Size([64, 131])
torch.Size([131, 2])


RuntimeError: The size of tensor a (100) must match the size of tensor b (64) at non-singleton dimension 1

In [22]:
def predict(model, features, adj):
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        probabilities = F.softmax(output, dim=1)
        _, predicted_labels = torch.max(probabilities, dim=1)
    return predicted_labels

In [23]:
predict(model, test_dataset, adj)

TypeError: dropout(): argument 'input' (position 1) must be Tensor, not CustomDataset

In [21]:
# Evaluación del modelo en el conjunto de prueba
model.eval()
with torch.no_grad():
    test_loss = 0
    correct = 0
    total = 0
    for data, target in test_loader:
        output = model(data, adj)
        test_loss += criterion(output, target).item()
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

    test_loss /= len(test_loader)
    test_accuracy = 100 * correct / total

    print('Test Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(test_loss, test_accuracy))

torch.Size([10, 131])
torch.Size([131, 2])


RuntimeError: The size of tensor a (100) must match the size of tensor b (10) at non-singleton dimension 1