In [1]:
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GATv2Conv,DynamicEdgeConv, EdgeConv,RGCNConv, TransformerConv, GINConv, global_mean_pool
import torch 
import torch.nn as nn
import pandas as pd
import numpy as np
import warnings
import sys
sys.path.append('..')
from dataset.data import get_dataset
from utils.table_to_graph import Table2GraphTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch_geometric.data import DataLoader
from torch_geometric.data import Data
import argparse,os,random
from collections import Counter
from typing import Dict, List, Tuple
# Notebook stand-in for the command-line interface: build the Namespace
# directly, with the same (or similar) parameters the original ArgumentParser
# exposes.
args = argparse.Namespace(**{
    # --- run / optimisation settings ---
    "random_seed": 42,
    "train_epochs": 200,
    "batch_size": 64,
    # --- model shape ---
    "input_dim": 768,
    "hidden_dim": 128,
    "num_layers": 4,
    "dropout_rate": 0.3,
    "threshold": 0.5,
    "heads": 8,
    "model": 'NORM_GNN',
    # --- datasets ---
    "source_dataset_name": 'cleveland',
    "target_dataset_name": 'hungarian',
    "few_shot": 4,
    "dataset_seed": 4,
    "source_lr": 0.0001,
    "llm_model": 'gpt2',
    "use_gpu": True,
    # Free-form note describing the experiment.
    "des": None,
    # Optional list of baselines, e.g. 'Logistic_Regression' or 'XGBoost'.
    "baseline": [],
    # --- storage locations ---
    "graph_path": "/storage/personal/eungyeop/dataset/graph",
    "table_path": "/storage/personal/eungyeop/dataset/table",
    "model_type": 'NORM_GNN',
    "graph_type": 'star',
    "FD": 'N',
    # True when the --label flag would have been passed.
    "label": False,
})


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Reproducibility setup -------------------------------------------------
# PYTHONHASHSEED is read at interpreter start-up, so setting it here is
# inherited by child processes rather than changing hashing in this kernel.
os.environ['PYTHONHASHSEED'] = str(args.random_seed)
# With deterministic algorithms enabled, cuBLAS on CUDA >= 10.2 needs a fixed
# workspace size; without this variable CUDA matmuls raise a RuntimeError.
# setdefault keeps any value the user already exported. It has to be set
# before the first CUDA operation, which is why it lives in this early cell.
os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

# Seed every RNG the notebook draws from with the same configured seed.
torch.manual_seed(args.random_seed)
np.random.seed(args.random_seed)
random.seed(args.random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

# Pickled graph file for the target dataset; label-augmented graphs carry an
# extra "label_" tag in the file name.
# NOTE(review): `file_path` is reassigned by the CSV-loading cell further down.
file_path = os.path.join(
    args.graph_path,
    f"{args.graph_type}_{args.FD}_{'label_' if args.label else ''}{args.target_dataset_name}.pkl",
)



In [3]:
# Per-dataset label specification: dataset name -> [label column, class names].
# The position of a class name in its list is the integer id it is encoded to.
dataset_and_class = {
    "adult": ["income", ["no", "yes"]],
    "bank": ["Class", ["no", "yes"]],
    "blood": ["Class", ["no", "yes"]],
    "car": ["class", ["unacceptable", "acceptable", "good", "very good"]],
    "communities": ["ViolentCrimesPerPop", ["high", "medium", "low"]],
    "credit-g": ["class", ["no", "yes"]],
    "diabetes": ["Outcome", ["no", "yes"]],
    "myocardial": ["ZSN", ["no", "yes"]],
    # Heart-disease tables all share the same binary target column.
    "cleveland": ["target_binary", ["no", "yes"]],
    "hungarian": ["target_binary", ["no", "yes"]],
    "switzerland": ["target_binary", ["no", "yes"]],
    "heart_statlog": ["target_binary", ["no", "yes"]],
    "heart": ["target_binary", ["no", "yes"]],
}

In [4]:
def preprocessing(DATASETS: pd.DataFrame, data_name: str) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a raw table into features and integer-encoded labels.

    The label column and its class names are looked up in the module-level
    ``dataset_and_class`` table; each class name is encoded as its position
    in that dataset's class list.

    Args:
        DATASETS: Raw table that contains the label column for ``data_name``.
        data_name: Key into ``dataset_and_class``.

    Returns:
        ``(X, y)`` where ``X`` is ``DATASETS`` without the label column and
        ``y`` is the integer-encoded label Series, both re-indexed from 0.

    Raises:
        AssertionError: If ``data_name`` is not a key of ``dataset_and_class``.
        KeyError: If the label column is missing from ``DATASETS``.
        ValueError: If the label column holds a value (including a missing
            value) that is not one of the dataset's class names.
    """
    assert data_name in dataset_and_class, f"{data_name} is not a valid dataset name"

    class_name, class_values = dataset_and_class[data_name]
    class_mapping = {label: idx for idx, label in enumerate(class_values)}

    raw_labels = DATASETS[class_name]
    encoded = raw_labels.map(class_mapping)

    # Series.map turns every label missing from the mapping into NaN, which
    # would otherwise surface as an opaque cast error in astype(int) below.
    unmapped = encoded.isna().to_numpy()
    if unmapped.any():
        unknown = sorted({str(value) for value in raw_labels[unmapped]})
        raise ValueError(
            f"Column '{class_name}' of dataset '{data_name}' contains labels "
            f"outside {class_values}: {unknown}"
        )

    X = DATASETS.drop(class_name, axis=1).reset_index(drop=True)
    y = encoded.astype(int).reset_index(drop=True)

    return X, y

In [5]:
# Load the source dataset's CSV and split it into features / encoded labels.
# The table root comes from the shared config so that changing
# args.table_path is honoured here; joining with "" appends the trailing
# separator, so base_path keeps the exact value of the former literal.
base_path = os.path.join(args.table_path, "")
data_source = "label_table" if args.label else "origin_table"
file_path = os.path.join(base_path, data_source, f"{args.source_dataset_name}.csv")
# NOTE(review): despite its name, `cleveland` holds whichever table
# args.source_dataset_name selects.
cleveland = pd.read_csv(file_path)
X, y = preprocessing(cleveland, args.source_dataset_name)
print(X.head())
print(y.head())

    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak
0  63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3
1  67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5
2  67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6
3  37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5
4  41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4
0    0
1    1
2    1
3    0
4    0
Name: target_binary, dtype: int64


In [6]:
# Row counts of the feature table and the label vector, one per line.
print(len(X), len(y), sep="\n")

303
303


In [10]:
# Scratch cell: print every index below for_num - 3 to check the range bounds.
for_num = 5

for i in range(0, for_num - 3, 1):
    print(i)

0
1


In [7]:
# Build the table-to-graph converter for the source dataset.
# NOTE(review): the output recorded for this cell is
#   TypeError: __init__() got an unexpected keyword argument 'graph_type'
# so this call does not match the Table2GraphTransformer signature that was
# imported when the cell last ran. The later cells presumably reused a
# `transformer` created earlier in the kernel session -- confirm the accepted
# keywords against utils/table_to_graph.py before relying on this cell.
# NOTE(review): lm_model / n_components repeat the values of args.llm_model /
# args.input_dim -- presumably the same settings; verify and de-duplicate.
transformer = Table2GraphTransformer(
    include_edge_attr=True, 
    graph_type=args.graph_type,
    lm_model="gpt2", 
    n_components=768, 
    n_jobs=1,
    use_attention_init=None,
    use_hypergraph=None,
    corr_threshold=0.5,
    FD=args.FD,
    dataset_name=args.source_dataset_name,
)

TypeError: __init__() got an unexpected keyword argument 'graph_type'

In [21]:
# Convert the feature table and labels into graph objects.
# Presumably one graph per input row (the recorded len(graphs) is 303, the
# same as len(X)) -- verify against Table2GraphTransformer.fit_transform.
graphs = transformer.fit_transform(X, y)

In [23]:
# Number of graphs returned by fit_transform.
print(len(graphs))

303


In [26]:
# Pull the scalar label stored on each graph, then count the distinct classes
# actually present in the data.
class_labels = list(map(lambda graph: graph.y.item(), graphs))
num_classes = len(set(class_labels))

In [28]:
# Stratified 60/20/20 split: hold out 20% as the test set, then carve 25% of
# the remaining 80% off as the validation set. Both splits use the shared seed.
train_val_data, test_data = train_test_split(
    graphs,
    test_size=0.2,
    stratify=class_labels,
    random_state=args.random_seed,
)
# Labels of the train+validation pool, needed to stratify the second split.
train_labels = list(map(lambda graph: graph.y.item(), train_val_data))
train_data, val_data = train_test_split(
    train_val_data,
    test_size=0.25,
    stratify=train_labels,
    random_state=args.random_seed,
)
print(len(train_data), len(val_data), len(test_data), sep="\n")


181
61
61


In [36]:
# Draw a few-shot support set of `shot` graphs from the training split,
# spread as evenly as possible over the classes.
shot = args.few_shot
# Every class gets the base amount; `remainder` randomly chosen classes get
# one extra sample so the total is exactly `shot`.
base_samples_per_class, remainder = divmod(shot, num_classes)
extra_samples = random.sample(range(num_classes), remainder)

support_data = []
for cls in range(num_classes):
    cls_data = [graph for graph in train_data if graph.y.item() == cls]
    sample_num = base_samples_per_class + int(cls in extra_samples)

    if len(cls_data) >= sample_num:
        # Enough examples: sample without replacement.
        selected_data = random.sample(cls_data, k=sample_num)
    else:
        # Too few examples: fall back to sampling with replacement.
        warnings.warn(f"Class {cls} has fewer samples ({len(cls_data)}) than required ({sample_num}). Using replacement sampling.")
        selected_data = random.choices(cls_data, k=sample_num)

    support_data += selected_data

# NOTE(review): this replaces the full training split with the support set,
# so re-running the cell samples from the already reduced `train_data`.
train_data = support_data

In [42]:
# Label histogram of the current training set (class id -> number of graphs).
class_dist = Counter(graph.y.item() for graph in train_data)

In [43]:
# One loader per split, all with the configured batch size; only the training
# loader reshuffles its samples.
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=args.batch_size, shuffle=reshuffle)
    for split, reshuffle in (
        (train_data, True),
        (val_data, False),
        (test_data, False),
    )
)




In [None]:
def prepare_tabular_dataloaders(args, dataset_name, few_shot = False):
    """Function form of the notebook's seeding and dataset-table setup.

    In the portion visible here the function makes the run deterministic and
    seeds the Python, NumPy and PyTorch RNGs from ``args.random_seed``, then
    defines a local ``dataset_and_class`` table.

    NOTE(review): as far as this view shows, the function returns nothing and
    never reads ``dataset_name`` or ``few_shot`` -- it looks like an
    unfinished port of the exploratory cells above; confirm whether the
    loading / splitting / DataLoader steps are still to be added.

    Args:
        args: Namespace providing at least ``random_seed``.
        dataset_name: Unused in the visible code -- presumably a key of
            ``dataset_and_class``; verify.
        few_shot: Unused in the visible code -- presumably toggles few-shot
            sampling of the training split; verify.
    """
    # Reproducibility: deterministic cuDNN / algorithms and a common seed.
    os.environ['PYTHONHASHSEED'] = str(args.random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)

    torch.manual_seed(args.random_seed)
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
    # Local copy of the label specification: dataset name ->
    # [label column, class names]; it shadows the notebook-level table of the
    # same name inside this function.
    dataset_and_class = {
    "adult" : ['income', ['no','yes']],
    "bank" : ['Class', ['no','yes']],
    "blood" : ['Class',['no','yes']],
    "car" : ['class',['unacceptable','acceptable','good','very good']],
    "communities" : ['ViolentCrimesPerPop',['high','medium','low']],
    "credit-g" : ['class', ['no','yes']],
    "diabetes": ['Outcome',['no','yes']],
    "myocardial" : ['ZSN',['no','yes']],
    "cleveland": ['target_binary', ['no','yes']],
    "hungarian": ['target_binary', ['no','yes']],
    "switzerland": ['target_binary', ['no','yes']],
    "heart_statlog": ['target_binary', ['no','yes']],
    "heart": ['target_binary', ['no','yes']]
    }
    
