<a href="https://colab.research.google.com/github/caltdreamer/GNN/blob/main/Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup (Colab): install the PyTorch Geometric stack and the other
# packages this notebook needs, then mount Google Drive.
import os
import torch
# Export the installed torch version as the TORCH environment variable so the
# pip commands below can reference it as ${TORCH} when building the wheel-index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# torch-scatter / torch-sparse are looked up in the PyG wheel index for this torch version.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# NOTE(review): the next two lines both install torch-geometric (first from the
# GitHub repository, then from PyPI); one of them is likely redundant — confirm
# which source is intended.
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
!pip install -q torch-geometric
!pip install umap-learn
# NOTE(review): nothing in the cells visible here imports from the cloned `arc`
# directory — confirm whether the clone is still needed.
!git clone https://github.com/msesia/arc.git
!pip install conditionalconformal
# Mount Google Drive at /content/drive; the final results pickle is written under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the project's pinned requirements, make the project directory the
# working directory, and ensure its `model` sub-directory exists.
# NOTE(review): no cell visible here creates /content/conformalized-gnn-master;
# presumably it is uploaded or copied in manually beforehand — confirm.
!pip install -r /content/conformalized-gnn-master/requirements.txt
# The training cell below invokes `train.py` by relative path, so it relies on this directory change.
%cd /content/conformalized-gnn-master
!mkdir -p /content/conformalized-gnn-master/model

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

def wsc(X, y, S, delta=0.1, M=1000, random_state=2020, verbose=False):
    """Search random directions for the slab with the lowest empirical coverage.

    For each of ``M`` unit directions ``v`` the samples are sorted by their
    projection ``X @ v``. Candidate slabs are index ranges ``[ai, bi]`` of the
    sorted projections with ``bi - ai >= round(delta * n)``; the slab with the
    smallest fraction of covered samples is kept. The best slab over all
    directions is returned.

    Parameters
    ----------
    X : array of shape (n, p)
        Feature matrix; rows are projected onto each sampled direction.
    y : indexable of length n
        Labels; sample ``i`` counts as covered when ``y[i] in S[i]``.
    S : indexable of length n
        Prediction sets/intervals supporting the ``in`` operator.
    delta : float
        Minimum slab width, as a fraction of ``n``.
    M : int
        Number of random directions to try.
    random_state : int
        Seed for the generator that samples the directions.
    verbose : bool
        If true, wrap the loop over directions in a ``tqdm`` progress bar.

    Returns
    -------
    wsc_star : float
        Lowest slab coverage found (``1`` if no slab had coverage below 1).
    v_star : ndarray of shape (p,)
        Direction that achieved it.
    a_star, b_star : float
        Smallest and largest projection values of the selected slab.
    """
    rng = np.random.default_rng(random_state)
    n = len(y)
    # Whether each sample is covered does not depend on the direction, so the
    # (Python-level) membership loop runs once instead of once per direction.
    cover = np.array([y[i] in S[i] for i in range(n)])
    ai_max = int(np.round((1.0 - delta) * n))
    min_width = int(np.round(delta * n))

    def wsc_v(v):
        """Return (lowest coverage, lower bound, upper bound) along direction v."""
        z = np.dot(X, v)
        z_order = np.argsort(z)
        z_sorted = z[z_order]
        cover_ordered = cover[z_order]
        # Defaults describe the full range, used when no slab has coverage < 1.
        ai_best = 0
        bi_best = n - 1
        cover_min = 1
        for ai in range(ai_max):
            bi_min = min(ai + min_width, n)
            # coverage[k] is the covered fraction of sorted samples ai..ai+k.
            coverage = np.cumsum(cover_ordered[ai:n]) / np.arange(1, n - ai + 1)
            # Slabs narrower than the minimum width are excluded by forcing
            # their coverage to the maximum value.
            coverage[:bi_min - ai] = 1
            bi_star = ai + np.argmin(coverage)
            cover_star = coverage[bi_star - ai]
            if cover_star < cover_min:
                ai_best = ai
                bi_best = bi_star
                cover_min = cover_star
        return cover_min, z_sorted[ai_best], z_sorted[bi_best]

    def sample_sphere(n, p):
        """Draw n directions uniformly on the unit sphere in R^p (one per row)."""
        v = rng.normal(size=(p, n))
        v /= np.linalg.norm(v, axis=0)
        return v.T

    V = sample_sphere(M, p=X.shape[1])

    direction_ids = tqdm(range(M)) if verbose else range(M)
    results = [wsc_v(V[m]) for m in direction_ids]

    # First direction attaining the minimum coverage wins (np.argmin tie rule).
    idx_star = np.argmin(np.array([res[0] for res in results]))
    wsc_star, a_star, b_star = results[idx_star]
    v_star = V[idx_star]
    return wsc_star, v_star, a_star, b_star

def wsc_unbiased(X, y, S, delta=0.1, M=1000, test_size=0.75, random_state=2020, verbose=False):
    """Estimate worst-slab coverage on data not used to choose the slab.

    The data are split into a train and a test part. The adversarial slab
    (direction and bounds) is found with ``wsc`` on the train part, and the
    coverage inside that slab is then measured on the test part.

    Parameters
    ----------
    X, y, S :
        Features, labels and prediction sets, as accepted by ``wsc``.
    delta, M, verbose :
        Forwarded unchanged to ``wsc``.
    test_size : float
        Fraction of samples held out for the final coverage estimate
        (passed to ``train_test_split``).
    random_state : int
        Seed for the direction sampling inside ``wsc``. The train/test split
        is seeded with the attempt index instead (see below), so this value
        does not influence the split.

    Returns
    -------
    float
        Mean coverage of the test samples whose projection falls inside the
        selected slab. ``np.mean`` of an empty selection yields ``nan`` when
        no test sample falls inside the slab.
    """
    def wsc_vab(X, y, S, v, a, b):
        """Coverage of the samples whose projection onto v lies in [a, b]."""
        n = len(y)
        cover = np.array([y[i] in S[i] for i in range(n)])
        z = np.dot(X, v)
        in_slab = (z >= a) & (z <= b)
        return np.mean(cover[in_slab])

    # Re-draw the split (seeded by the attempt index) until the train part
    # contains at least one uncovered sample.
    max_attempts = 5000
    for attempt in range(max_attempts):
        X_train, X_test, y_train, y_test, S_train, S_test = train_test_split(
            X, y, S, test_size=test_size, random_state=attempt)
        cover = np.array([y_train[i] in S_train[i] for i in range(len(y_train))])
        if not all(cover):
            break

    if all(cover):
        # Every attempt produced a fully covered train part.
        print('May cause problem')

    # Find adversarial parameters
    wsc_star, v_star, a_star, b_star = wsc(X_train, y_train, S_train, delta=delta, M=M,
                                           random_state=random_state, verbose=verbose)
    # Estimate coverage
    coverage = wsc_vab(X_test, y_test, S_test, v_star, a_star, b_star)
    return coverage


In [None]:
datasets = ['Anaheim','ChicagoSketch','county_election_2016','county_education_2012','county_income_2012','county_unemployment_2012']
dataset_results = {dataset: {} for dataset in datasets}
for dataset_name in datasets:
      marginal_coverage_all = []
      conditional_coverage_all = []
      for z in range(10):
          !python train.py --model GraphSAGE \
                      --dataset {dataset_name} \
                      --device cpu \
                      --alpha 0.1 \
                      --num_runs 1 \
                      --conf_correct_model QR \
                      --data_seed {z+500}
          import pickle
          file_path = '/content/conformalized-gnn-master/data_splits.pkl'
          with open(file_path, 'rb') as file:
            data_splits = pickle.load(file)
          file_path = f'/content/conformalized-gnn-master/pred/{dataset_name}_GraphSAGE_QR.pkl'
          with open(file_path, 'rb') as file:
            results1 = pickle.load(file)
          file_path = '/content/conformalized-gnn-master/labelsandfeatures.pkl'
          with open(file_path, 'rb') as file:
            data_labels = pickle.load(file)

          #print(data_labels.get('labels')[~results1[0].get('conf_gnn').get('Raw')[4][0]])
          current_data_labels = data_labels.get('labels')
          current_data_features = data_labels.get('features')
          current_calib_test_mask = data_labels.get('calib_test_mask')
          #print(current_calib_test_mask.shape)
          current_test_mask_outof_calibtest = ~current_calib_test_mask
          current_test_features = current_data_features[current_calib_test_mask][~results1[0].get('conf_gnn').get('Raw')[4][0]]
          current_test_labels = current_data_labels[~results1[0].get('conf_gnn').get('Raw')[4][0]]
          current_lbs = results1[0].get('conf_gnn').get('Raw')[2][0][0]
          current_ubs = results1[0].get('conf_gnn').get('Raw')[2][0][1]

          average_length = np.mean(current_ubs - current_lbs)
          intervals = [pd.Interval(left=lb, right=ub, closed='right') for lb, ub in zip(current_lbs, current_ubs)]
          marg_coverage = np.mean([current_test_labels[i] in intervals[i] for i in range(len(current_test_labels))])
          print(marg_coverage)
          coverage_flag = [current_test_labels[i] in intervals[i] for i in range(len(current_test_labels))]
          wsc_coverage = wsc_unbiased(current_test_features, current_test_labels, intervals,delta=0.1,M=1000)
          print(wsc_coverage)
          dataset_results[dataset_name][f'Iteration_{z}'] = {
            'Marginal_Coverage': marg_coverage,
            'current_test_features':current_test_features,
            'coverage_flag':coverage_flag,
            'Average length':average_length,
            'current_data_features':current_data_features,
            'current_calib_test_mask':current_calib_test_mask,
            'current_test_mask_outof_calibtest':current_test_mask_outof_calibtest,
            'current_test_labels':current_test_labels}
          dataset_results_marginal[dataset_name].append(marg_coverage)
          dataset_results_conditional[dataset_name].append(wsc_coverage)


# Save the entire dictionary to a pickle file
file_path = '/content/drive/MyDrive/Regression_CFGNN_results.txt'
with open(file_path, 'wb') as file:
    pickle.dump(dataset_results, file)