In [1]:
import os
import pickle
import numpy as np
import gzip

import torch
from torch_geometric.data import Dataset, Data, DataLoader
from torch_geometric.utils import dense_to_sparse
from torch.utils.data import random_split

from torch_geometric.data.data import BaseData

In [5]:
# https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html
class influenceDataset(Dataset):
    """PyG ``Dataset`` that keeps one ``Data`` object per sample on disk.

    Every raw file ``<raw_dir>/<name>.pkl.gz`` is paired with the edge list
    ``<graph_dir>/<name>.txt``. Each sample found in a raw file is written to
    its own ``data_<idx>.pt`` file inside ``processed_dir``.

    Args:
        root: Dataset root directory, forwarded to ``Dataset.__init__``.
        graph_dir: Directory that holds the ``<name>.txt`` edge-list files.
    """

    # Extension shared by all raw sample archives.
    _RAW_SUFFIX = '.pkl.gz'

    def __init__(self, root, graph_dir):
        # These attributes are assigned before super().__init__() because the
        # base class may run process() from inside its constructor.
        self.graph_dir = graph_dir
        self._num_samples = None  # lazily computed by len()
        super().__init__(root)

    @property
    def processed_file_names(self):
        # If data_0.pt already exists the dataset counts as processed, so
        # process() is not run again.
        return ['data_0.pt']

    @staticmethod
    def _read_edge_list(path):
        """Parse an edge-list text file into ``(edge_index, edge_attr)`` tensors.

        The first line holds two integers; the second one is the number of
        edge lines that follow. Each edge line is ``u v p`` (two integers and
        a float).
        """
        sources, targets, weights = [], [], []
        with open(path, 'r') as f:
            _, num_edges = map(int, f.readline().split())
            for _ in range(num_edges):
                u, v, p = f.readline().split()
                sources.append(int(u))
                targets.append(int(v))
                weights.append([float(p)])
        edge_index = torch.tensor([sources, targets], dtype=torch.long)
        edge_attr = torch.tensor(weights)
        return edge_index, edge_attr

    def process(self):
        """Convert every usable raw archive into per-sample ``data_<idx>.pt`` files.

        Files in ``raw_dir`` that do not end in ``.pkl.gz`` are ignored, and
        archives without a matching ``<graph_dir>/<name>.txt`` are skipped
        with a warning, so unrelated files placed in ``raw_dir`` no longer
        abort processing.
        """
        import warnings

        idx = 0
        # sorted() keeps the sample numbering independent of the arbitrary
        # order in which os.listdir() reports directory entries.
        for raw_name in sorted(os.listdir(self.raw_dir)):
            if not raw_name.endswith(self._RAW_SUFFIX):
                continue
            stem = raw_name[:-len(self._RAW_SUFFIX)]
            graph_path = os.path.join(self.graph_dir, stem + '.txt')
            if not os.path.isfile(graph_path):
                warnings.warn(f'skipping {raw_name}: graph file {graph_path} not found')
                continue

            edge_index, edge_attr = self._read_edge_list(graph_path)

            # NOTE(security): pickle.load can execute arbitrary code; only
            # process archives that come from a trusted source.
            with gzip.open(os.path.join(self.raw_dir, raw_name), 'rb') as f:
                seeds, probs = pickle.load(f)
            # assumes seeds and probs are array-likes with one row per sample
            # -- TODO confirm against the raw-data generator
            seeds = torch.from_numpy(np.expand_dims(seeds, axis=-1)).float()
            probs = torch.from_numpy(np.sum(probs, axis=-1, keepdims=True)).float()

            for i in range(len(seeds)):
                # The extra leading axis on y makes the loss computation
                # straightforward once samples are batched.
                target = torch.unsqueeze(probs[i], 0)
                data = Data(x=seeds[i], edge_index=edge_index, edge_attr=edge_attr, y=target)
                torch.save(data, os.path.join(self.processed_dir, f'data_{idx}.pt'))
                idx += 1

        # The set of processed files changed; force len() to recount.
        self._num_samples = None

    def len(self):
        """Return the number of consecutively numbered ``data_<i>.pt`` files.

        Counting stops at the first missing index, so every index below the
        returned value can be passed to ``get``. The result is cached because
        listing a large directory on every call would be expensive.
        """
        count = getattr(self, '_num_samples', None)
        if count is None:
            existing = set(os.listdir(self.processed_dir))
            count = 0
            while f'data_{count}.pt' in existing:
                count += 1
            self._num_samples = count
        return count

    def get(self, idx):
        """Load and return the sample stored in ``data_<idx>.pt``."""
        return torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))

    # When y holds one entry per data object (y.size(0) equals the number of
    # samples), the base Dataset derives the class count from the number of
    # unique y values, which is not meaningful for regression. The property is
    # therefore pinned to 1.
    @property
    def num_classes(self):
        return 1

In [3]:
def get_dataloader(batch_size, seed=None, root='/data/URP', graph_dir='/data/URP/graphs'):
    """Build train/validation/test loaders over ``influenceDataset``.

    The dataset is split 80% / 10% / remainder at random.

    Args:
        batch_size: Number of graphs per batch for all three loaders.
        seed: Optional integer. When given, the random split is drawn from a
            dedicated ``torch.Generator`` seeded with it, so the same split is
            reproduced on every call. ``None`` keeps the previous behaviour of
            using the global RNG.
        root: Dataset root directory passed to ``influenceDataset``.
        graph_dir: Directory with the graph edge-list files.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``. Only
        the training loader shuffles.
    """
    dataset = influenceDataset(root, graph_dir)

    total = len(dataset)
    train_num = int(total * 0.8)
    val_num = int(total * 0.1)
    # The test split absorbs the rounding remainder so the sizes sum to total.
    test_num = total - train_num - val_num

    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, lengths=[train_num, val_num, test_num], **split_kwargs
    )

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_dataloader, val_dataloader, test_dataloader

In [4]:
# TODO: merge this data loading into the raw-data generation step later and save the result to a file
def get_dataloader_10000_noprocessing(batch_size,
                                      graph_path='/data/URP/graphs/Celebrity_train_JI.txt',
                                      raw_path='/data/URP/raw/Celebrity_train_JI.pkl.gz'):
    """Build train/validation/test loaders fully in memory, without a Dataset.

    Reads one edge-list file and one gzip-compressed pickle of
    ``(seeds, probs)``, creates one ``Data`` object per sample (all sharing
    the same edge tensors) and splits them 80% / 10% / remainder in file
    order.

    Args:
        batch_size: Number of graphs per batch for all three loaders.
        graph_path: Edge-list text file. The first line holds two integers,
            the second being the number of ``u v p`` edge lines that follow.
        raw_path: Gzip-compressed pickle that unpickles to ``(seeds, probs)``.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``. Only
        the training loader shuffles.
    """
    sources, targets, weights = [], [], []
    with open(graph_path, 'r') as f:
        _, num_edges = map(int, f.readline().split())
        for _ in range(num_edges):
            u, v, p = f.readline().split()
            sources.append(int(u))
            targets.append(int(v))
            weights.append([float(p)])
    edge_index = torch.tensor([sources, targets])
    edge_attr = torch.tensor(weights)

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # archives that come from a trusted source.
    with gzip.open(raw_path, 'rb') as f:
        seeds, probs = pickle.load(f)
    # assumes seeds and probs are array-likes with one row per sample
    # -- TODO confirm against the raw-data generator
    seeds = torch.from_numpy(np.expand_dims(seeds, axis=-1)).float()
    probs = torch.from_numpy(np.sum(probs, axis=-1, keepdims=True)).float()

    # The extra leading axis on y makes the loss computation straightforward
    # once samples are batched.
    datas = [
        Data(x=seeds[i], edge_index=edge_index, edge_attr=edge_attr,
             y=torch.unsqueeze(probs[i], 0))
        for i in range(len(seeds))
    ]

    train_num = int(len(datas) * 0.8)
    val_num = int(len(datas) * 0.1)
    val_end = train_num + val_num

    # The split is sequential (not randomised): the first 80% of the samples
    # are used for training, the next 10% for validation, the rest for test.
    train_dataloader = DataLoader(datas[:train_num], batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(datas[train_num:val_end], batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(datas[val_end:], batch_size=batch_size, shuffle=False)

    return train_dataloader, val_dataloader, test_dataloader

In [None]:
def get_data_350(path='Celebrity_train_JI_350.pkl.gz'):
    """Load and return the object stored in a gzip-compressed pickle file.

    Args:
        path: Location of the ``.pkl.gz`` file. The default is resolved
            relative to the current working directory.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # archives that come from a trusted source.
    with gzip.open(path, 'rb') as f:
        datas = pickle.load(f)
    return datas

In [58]:
# Smoke test (disabled): prints every training batch, its graph count and its targets.
# The loop is kept inside a string literal, so running this cell does not execute it.
# NOTE(review): train_dataloader is not assigned in any cell visible here; create it
# with one of the loader functions above before re-enabling the loop.
# NOTE(review): the saved output below shows y as float64, while the loaders above cast
# targets with .float(), so the output presumably comes from an earlier version -- confirm.
"""
for batch in train_dataloader:
    print(batch)
    print(batch.num_graphs)
    print(batch.y)
"""

DataBatch(x=[156960, 1], edge_index=[2, 576780], edge_attr=[576780, 1], y=[20], batch=[156960], ptr=[21])
20
tensor([ 331.5699, 5738.3150, 7460.2946, 5248.2631, 2935.6470, 1930.2823,
        1106.2486, 5479.7685,  845.4149, 4362.1423, 7679.0883,  855.9194,
        5212.6871,  702.1163, 4452.2191, 5147.9294, 6752.9028, 5307.6616,
        4554.9217, 4804.9743], dtype=torch.float64)
DataBatch(x=[156960, 1], edge_index=[2, 576780], edge_attr=[576780, 1], y=[20], batch=[156960], ptr=[21])
20
tensor([5470.3094, 2750.6111, 5894.7667, 3520.1980, 7524.7225, 4587.5165,
        1003.6427, 4570.4463, 3457.7579, 3942.4710, 7183.4812, 2528.2794,
        6156.4984, 1894.2717, 4166.1275, 1239.1298,  755.1534, 4898.6982,
        7141.5593, 6122.2534], dtype=torch.float64)
DataBatch(x=[156960, 1], edge_index=[2, 576780], edge_attr=[576780, 1], y=[20], batch=[156960], ptr=[21])
20
tensor([7248.0818, 2074.6164, 1631.9656,  603.4873, 6324.7113, 7751.5885,
        7505.9574, 7177.8660, 7158.8710, 3675.0412,

In [6]:
# Sanity check (disabled): builds the dataset and prints its size.
# The code is kept inside a string literal, so running this cell does not execute it.
"""
dataset = influenceDataset('/data/URP','/data/URP/graphs')
print(len(dataset))
"""

347
