In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split

In [2]:
# Definir la clase del modelo GCN
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(input_dim, hidden_dim)
        self.gc2 = GraphConvolution(hidden_dim, output_dim)

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1)

In [3]:
# Definir la clase para la capa de convolución de grafos
class GraphConvolution(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GraphConvolution, self).__init__()
        self.weight = nn.Parameter(torch.FloatTensor(input_dim, output_dim))
        self.bias = nn.Parameter(torch.FloatTensor(output_dim))

    def forward(self, x, adj):
        x = torch.matmul(x, self.weight)
        ##print(x.shape)       para comprobar las dimensiones de x 
        ##                      y ver que se puede mult con adj
        x = torch.matmul(adj, x)
        x = x + self.bias
        return x

In [4]:
# Cargar los datos de vértices y aristas desde los archivos CSV
vertices_data = pd.read_csv('trabajo raiz/large_twitch_features.csv')[:100]
aristas_data = pd.read_csv('trabajo raiz/large_twitch_edges.csv')[:99]
print(vertices_data)
print(aristas_data)

     views  mature  life_time  created_at  updated_at  numeric_id  \
0     7879       1        969  2016-02-16  2018-10-12           0   
1      500       0       2699  2011-05-19  2018-10-08           1   
2   382502       1       3149  2010-02-27  2018-10-12           2   
3      386       0       1344  2015-01-26  2018-10-01           3   
4     2486       0       1784  2013-11-22  2018-10-11           4   
..     ...     ...        ...         ...         ...         ...   
95    1108       1       1163  2015-08-05  2018-10-11          95   
96     257       0        775  2016-08-19  2018-10-03          96   
97   47789       1       3234  2009-12-04  2018-10-12          97   
98    2154       1       2075  2013-02-01  2018-10-08          98   
99    2060       1       3272  2009-10-04  2018-09-19          99   

    dead_account language  affiliate  
0              0       EN          1  
1              0       EN          0  
2              0       EN          1  
3              

In [5]:
# Construir el grafo a partir de los datos de vértices y aristas
graph = nx.from_pandas_edgelist(aristas_data, 'numeric_id_1', 'numeric_id_2', create_using=nx.Graph())
print(graph)

Graph with 100 nodes and 99 edges


In [6]:
# Obtener las características y etiquetas de los vértices
labels = vertices_data['affiliate'].values
print(labels.size)

100


In [19]:
##Codificacion de 'created_at', 'updated_at' y 'language'
from sklearn.preprocessing import OneHotEncoder

# Seleccionar las columnas de características categóricas
categorical_columns = ['created_at', 'updated_at', 'language']

# Codificar las columnas categóricas utilizando one-hot encoding
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(vertices_data[categorical_columns])

# Obtener las columnas de características numéricas
numeric_columns = ['views', 'mature', 'life_time', 'numeric_id', 'dead_account']

# Combinar las características numéricas y codificadas en un solo array
features = np.concatenate([vertices_data[numeric_columns].values, encoded_features], axis=1)
print(features[0])

[7.879e+03 1.000e+00 9.690e+02 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000



In [8]:
# Dividir los datos en conjuntos de entrenamiento, validación y prueba
train_indices, test_indices = train_test_split(range(len(features)), test_size=0.2, random_state=42)
train_indices, val_indices = train_test_split(train_indices, test_size=0.2, random_state=42)

In [9]:
# Preparar los datos en tensores de PyTorch
features = torch.FloatTensor(features)
labels = torch.LongTensor(labels)
adj = nx.adjacency_matrix(graph)
print(adj.shape)
adj = torch.FloatTensor(adj.todense())

train_features = features[train_indices]
train_labels = labels[train_indices]
train_adj = adj[train_indices, :][:, train_indices]

val_features = features[val_indices]
val_labels = labels[val_indices]
val_adj = adj[val_indices, :][:, val_indices]

test_features = features[test_indices]
test_labels = labels[test_indices]
test_adj = adj[test_indices, :][:, test_indices]

(100, 100)


In [10]:
# Definir los parámetros del modelo y del entrenamiento
input_dim = features.shape[1]
print(features.shape)
hidden_dim = 64
output_dim = len(np.unique(labels))
print(output_dim)
lr = 0.01
epochs = 100

torch.Size([100, 133])
2


In [11]:
# Crear el modelo GCN
model = GCN(input_dim, hidden_dim, output_dim)

In [12]:
# Definir la función de pérdida y el optimizador
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [13]:
# Función de entrenamiento
def train(model, features, adj, labels, train_indices, epochs, lr):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(features, adj)
        loss = F.cross_entropy(output[train_indices], labels[train_indices])
        loss.backward()
        optimizer.step()
        
        print('Epoch: {:04d} | Loss: {:.4f}'.format(epoch+1, loss.item()))

In [14]:
train(model, features, adj, labels, train_indices, epochs, lr)

Epoch: 0001 | Loss: nan
Epoch: 0002 | Loss: nan
Epoch: 0003 | Loss: nan
Epoch: 0004 | Loss: nan
Epoch: 0005 | Loss: nan
Epoch: 0006 | Loss: nan
Epoch: 0007 | Loss: nan
Epoch: 0008 | Loss: nan
Epoch: 0009 | Loss: nan
Epoch: 0010 | Loss: nan
Epoch: 0011 | Loss: nan
Epoch: 0012 | Loss: nan
Epoch: 0013 | Loss: nan
Epoch: 0014 | Loss: nan
Epoch: 0015 | Loss: nan
Epoch: 0016 | Loss: nan
Epoch: 0017 | Loss: nan
Epoch: 0018 | Loss: nan
Epoch: 0019 | Loss: nan
Epoch: 0020 | Loss: nan
Epoch: 0021 | Loss: nan
Epoch: 0022 | Loss: nan
Epoch: 0023 | Loss: nan
Epoch: 0024 | Loss: nan
Epoch: 0025 | Loss: nan
Epoch: 0026 | Loss: nan
Epoch: 0027 | Loss: nan
Epoch: 0028 | Loss: nan
Epoch: 0029 | Loss: nan
Epoch: 0030 | Loss: nan
Epoch: 0031 | Loss: nan
Epoch: 0032 | Loss: nan
Epoch: 0033 | Loss: nan
Epoch: 0034 | Loss: nan
Epoch: 0035 | Loss: nan
Epoch: 0036 | Loss: nan
Epoch: 0037 | Loss: nan
Epoch: 0038 | Loss: nan
Epoch: 0039 | Loss: nan
Epoch: 0040 | Loss: nan
Epoch: 0041 | Loss: nan
Epoch: 0042 | Lo

In [15]:
## Función de predicción
def predict(model, features, adj, test_indices):
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        predictions = output[test_indices].argmax(dim=1)
    return predictions


In [16]:
print(features)
print(adj.shape)
print(test_indices)
predict(model, features, adj, test_indices)

tensor([[7.8790e+03, 1.0000e+00, 9.6900e+02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [5.0000e+02, 0.0000e+00, 2.6990e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [3.8250e+05, 1.0000e+00, 3.1490e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [4.7789e+04, 1.0000e+00, 3.2340e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.1540e+03, 1.0000e+00, 2.0750e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [2.0600e+03, 1.0000e+00, 3.2720e+03,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])
torch.Size([100, 100])
[83, 53, 70, 45, 44, 39, 22, 80, 10, 0, 18, 30, 73, 33, 90, 4, 76, 77, 12, 31]


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [17]:
## Evaluar el modelo
def evaluate_model(model, features, adj, labels, test_indices):
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        predictions = output[test_indices].argmax(dim=1)
        accuracy = torch.sum(predictions == labels[test_indices]).item() / len(test_indices)
    return accuracy

# Evaluar el modelo en el conjunto de prueba
accuracy = evaluate_model(model, features, adj, labels, test_indices)
print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.55
