In [None]:
import torch
import requests
from tabpfn import TabPFNClassifier
import os
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import random_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import random
# Pick the compute device once for the whole notebook:
# CUDA when available, otherwise Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

import warnings
# Suppress every warning of category UserWarning from here on.
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Download the three CSV files used by this notebook and store them under
# Data/Assignment1. A timeout is passed to every request because requests
# otherwise waits indefinitely on a server that stops responding.
Noise_0_data = requests.get(
    "http://AdityaAhuja01.pythonanywhere.com/data/df_syn_train_0_0_.csv",
    timeout=60,
)
Noise_Low_data = requests.get(
    "http://AdityaAhuja01.pythonanywhere.com/data/df_synA_train_shuffled.csv",
    timeout=60,
)
Noise_High_data = requests.get(
    "http://AdityaAhuja01.pythonanywhere.com/data/df_synA_test_hard_shuffled_sample.csv",
    timeout=60,
)

# Only write anything when all three downloads succeeded, so the folder never
# ends up holding a partial set of files.
if all(
    response.status_code == 200
    for response in (Noise_0_data, Noise_Low_data, Noise_High_data)
):
    datafolder = "Data/Assignment1"
    os.makedirs(datafolder, exist_ok=True)

    for filename, response in (
        ("data_0_noise", Noise_0_data),
        ("data_Low_noise", Noise_Low_data),
        ("data_High_noise", Noise_High_data),
    ):
        with open(os.path.join(datafolder, filename), "wb") as f:
            # Write the bytes exactly as received instead of decoding with a
            # guessed charset and re-encoding.
            f.write(response.content)
else:
    print("Error in fetching data")

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset over a pandas DataFrame.

    Args:
        dataframe: source DataFrame holding feature and label columns.
        noise: tag describing the data's noise level; returned unchanged by
            ``get_noise``.
        transform: optional callable applied to each feature row in
            ``__getitem__``.
        target_transform: optional callable applied to each label in
            ``__getitem__``.
        drop: optional column label(s) removed from the feature matrix. When
            omitted, every column (the target included) is used as a feature.
        target: name of the label column. Required in practice: leaving it as
            None makes the column lookup fail.
    """

    def __init__(self, dataframe, noise, transform=None, target_transform=None, drop=None, target=None):
        self.dataframe = dataframe
        # Identity check: `!= None` would invoke element-wise comparison on
        # array-like arguments.
        if drop is not None:
            self.X = dataframe.drop(drop, axis=1).values
        else:
            self.X = dataframe.values

        self.y = dataframe[target].values
        self.transform = transform
        self.target_transform = target_transform
        self.noise = noise

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``, with transforms applied."""
        item, label = self.X[idx], self.y[idx]
        # The transforms were previously stored but never used.
        if self.transform is not None:
            item = self.transform(item)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return item, label

    def get_noise(self):
        """Return the noise tag given at construction."""
        return self.noise

In [None]:
# Load the three CSV files from Data/Assignment1 (the paths written by the
# download cell) into one DataFrame per noise level.
Noise_0_dataframe = pd.read_csv("Data/Assignment1/data_0_noise")
Noise_Low_dataframe = pd.read_csv("Data/Assignment1/data_Low_noise")
Noise_High_dataframe = pd.read_csv("Data/Assignment1/data_High_noise")

In [None]:
# Lists of the distinct label values, in the order Series.unique() returns
# them. A label's position in its list is used as its integer code.
# The noise-free frame gets its own "era" list; both lists for the noisy data
# are built from the low-noise frame only.
class_index = list(Noise_0_dataframe["era"].unique())
class_index_noise = list(Noise_Low_dataframe["era"].unique())
class_index_t10v_noise = list(Noise_Low_dataframe["target_10_val"].unique())

def encode(value, class_index = class_index):
    """Return the integer code of ``value``: its position in ``class_index``.

    ``class_index`` defaults to the "era" labels of the noise-free frame.
    ``list.index`` raises ValueError when ``value`` is not in the list.
    """
    position = class_index.index(value)
    return position

def encode_noise(value, class_index = class_index_noise):
    """Return the integer code of ``value``: its position in ``class_index``.

    ``class_index`` defaults to the "era" labels of the low-noise frame.
    ``list.index`` raises ValueError when ``value`` is not in the list.
    """
    position = class_index.index(value)
    return position

def encode_noise_t10v(value, class_index = class_index_t10v_noise):
    """Return the integer code of ``value``: its position in ``class_index``.

    ``class_index`` defaults to the "target_10_val" labels of the low-noise
    frame. ``list.index`` raises ValueError when ``value`` is not in the list.
    """
    position = class_index.index(value)
    return position


# Replace the raw label columns with their integer codes, overwriting the
# original columns in place.
# The high-noise frame is encoded with the class lists built from the
# low-noise frame, so a label that occurs only in the high-noise data makes
# list.index raise ValueError.
# NOTE(review): this cell is not idempotent - running it a second time feeds
# the already-encoded integers back into the encoders.
Noise_0_dataframe["era"] = Noise_0_dataframe["era"].apply(encode)
Noise_Low_dataframe["era"] = Noise_Low_dataframe["era"].apply(encode_noise)
Noise_High_dataframe["era"] = Noise_High_dataframe["era"].apply(encode_noise)
Noise_Low_dataframe["target_10_val"] = Noise_Low_dataframe["target_10_val"].apply(encode_noise_t10v)
Noise_High_dataframe["target_10_val"] = Noise_High_dataframe["target_10_val"].apply(encode_noise_t10v)

In [None]:
# Columns that are never used as features. The noisy files carry two extra
# bookkeeping columns ("row_num", "data_type") that the noise-free file lacks.
_CLEAN_DROP_COLUMNS = ["day", "era", "target_10_val", "target_5_val"]
_NOISY_DROP_COLUMNS = ["row_num", "day", "era", "target_10_val", "target_5_val", "data_type"]


def _split_train_test(dataset, train_fraction=0.8):
    """Randomly split ``dataset`` into (train, test) subsets.

    The test size is computed as the remainder rather than as a second
    truncated product: int(n * 0.8) + int(n * 0.2) is not always n, and
    random_split rejects lengths that do not add up to len(dataset).
    Whenever the two truncated products do add up to n, the sizes produced
    here are the same ones.
    """
    n_train = int(len(dataset) * train_fraction)
    return random_split(dataset, [n_train, len(dataset) - n_train])


# One dataset per (noise level, target) pair.
Noise_0_dataset_era = CustomDataset(Noise_0_dataframe, "0", drop=_CLEAN_DROP_COLUMNS, target="era")
Noise_Low_dataset_era = CustomDataset(Noise_Low_dataframe, "Low", drop=_NOISY_DROP_COLUMNS, target="era")
Noise_High_dataset_era = CustomDataset(Noise_High_dataframe, "High", drop=_NOISY_DROP_COLUMNS, target="era")
Noise_Low_dataset_t10v = CustomDataset(Noise_Low_dataframe, "Low", drop=_NOISY_DROP_COLUMNS, target="target_10_val")
Noise_High_dataset_t10v = CustomDataset(Noise_High_dataframe, "High", drop=_NOISY_DROP_COLUMNS, target="target_10_val")

# 80/20 train/test split of every dataset, in the same order as before.
Noise_0_train_era, Noise_0_test_era = _split_train_test(Noise_0_dataset_era)
Noise_Low_train_era, Noise_Low_test_era = _split_train_test(Noise_Low_dataset_era)
Noise_High_train_era, Noise_High_test_era = _split_train_test(Noise_High_dataset_era)
Noise_Low_train_t10v, Noise_Low_test_t10v = _split_train_test(Noise_Low_dataset_t10v)
Noise_High_train_t10v, Noise_High_test_t10v = _split_train_test(Noise_High_dataset_t10v)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [None]:
# The target_10_val training loaders use batch_size=1000 because, per the
# original author's note, TabPFN accepts at most 1000 rows; the fitting
# function for that target trains one model per training batch.

# Set up one train and one test DataLoader per (noise level, target) pair.
# Every loader, including the test loaders, is created with shuffle=True.
Noise_0_era_train_loader = DataLoader(Noise_0_train_era, batch_size=512, shuffle=True)
Noise_0_era_test_loader = DataLoader(Noise_0_test_era, batch_size=512, shuffle=True)
########################################################################################
Noise_Low_era_train_loader = DataLoader(Noise_Low_train_era, batch_size=512, shuffle=True)
Noise_Low_era_test_loader = DataLoader(Noise_Low_test_era, batch_size=512, shuffle=True)
########################################################################################
Noise_High_era_train_loader = DataLoader(Noise_High_train_era, batch_size=512, shuffle=True)
Noise_High_era_test_loader = DataLoader(Noise_High_test_era, batch_size=512, shuffle=True)
########################################################################################
Noise_Low_t10v_train_loader = DataLoader(Noise_Low_train_t10v, batch_size=1000, shuffle=True)
Noise_Low_t10v_test_loader = DataLoader(Noise_Low_test_t10v, batch_size=512, shuffle=True)
########################################################################################
Noise_High_t10v_train_loader = DataLoader(Noise_High_train_t10v, batch_size=1000, shuffle=True)
Noise_High_t10v_test_loader = DataLoader(Noise_High_test_t10v, batch_size=512, shuffle=True)
########################################################################################


In [None]:
def predicition_assembler(predictions,probabilities):
    """Combine per-model predictions into one label per sample.

    ``predictions[m][s]`` is the label model ``m`` chose for sample ``s`` and
    ``probabilities[m][s]`` is the probability it assigned to that label.
    For each sample the probabilities are summed per label and the label with
    the largest sum is returned; ties go to the label that was seen first.

    Returns a list with one winning label per sample.
    """
    model_count = len(predictions)
    sample_count = len(predictions[0])
    winners = []
    for sample in range(sample_count):
        score_by_label = {}
        for model in range(model_count):
            label = predictions[model][sample]
            score_by_label.setdefault(label, 0)
            score_by_label[label] += probabilities[model][sample]
        best_label = max(score_by_label, key=score_by_label.get)
        winners.append(best_label)
    return winners

def _sum_probability_vote(predictions, probabilities):
    """Return, per sample, the label with the largest summed winning probability.

    ``predictions[m][s]`` / ``probabilities[m][s]`` are model ``m``'s label and
    winning probability for sample ``s``. Kept private to ``fit_test_tabpfn``
    so that it does not depend on the module-level name
    ``predicition_assembler``, which a later cell rebinds to a function with a
    different, five-argument signature.
    """
    winners = []
    for sample in range(len(predictions[0])):
        score_by_label = {}
        for model in range(len(predictions)):
            label = predictions[model][sample]
            score_by_label[label] = score_by_label.get(label, 0) + probabilities[model][sample]
        winners.append(max(score_by_label, key=score_by_label.get))
    return winners


def fit_test_tabpfn(train_dataloader,test_dataloader,no_of_models_to_ensemble=1,ensemble_config=1):
    """Fit one TabPFN classifier per training batch and print test accuracy.

    Args:
        train_dataloader: iterable of ``(data, target)`` batches; one
            classifier is fitted on each batch.
        test_dataloader: iterable of ``(data, target)`` batches to evaluate.
        no_of_models_to_ensemble: number of fitted classifiers sampled at
            random for every test batch. Capped at the number of fitted
            classifiers.
        ensemble_config: forwarded to ``TabPFNClassifier`` as
            ``N_ensemble_configurations``.

    Prints the accuracy over all test samples; returns nothing.
    """
    all_tabpfns=[]
    for data,target in tqdm(train_dataloader, desc="FITTING"):
        classifier = TabPFNClassifier(device=device, N_ensemble_configurations=ensemble_config)
        classifier.fit(data,target, overwrite_warning=True)
        all_tabpfns.append(classifier)

    # random.sample raises ValueError when asked for more items than exist, so
    # cap the request the same way fit_test_tabpfn_for_era does.
    no_of_models_to_ensemble = min(no_of_models_to_ensemble, len(all_tabpfns))

    total=0
    correct=0

    for data,target in test_dataloader:
        each_model_prob=[]
        each_model_pred=[]
        # A fresh random subset of the fitted models votes on every batch.
        random_models = random.sample(all_tabpfns, no_of_models_to_ensemble)
        for model in random_models:
            y_pred,p_pred=model.predict(data,return_winning_probability=True)
            each_model_prob.append(p_pred)
            each_model_pred.append(y_pred)
        y_pred_summ=_sum_probability_vote(each_model_pred,each_model_prob)
        total+=len(y_pred_summ)
        correct+=sum(1 for p, t in zip(y_pred_summ, target) if p == t)
    print(f"Accuracy: {correct/total:.4f}")

In [None]:
# Low-noise data, target_10_val: one model per training batch, 30 randomly
# chosen models vote on each test batch, 1 TabPFN ensemble configuration each.
fit_test_tabpfn(train_dataloader=Noise_Low_t10v_train_loader,test_dataloader=Noise_Low_t10v_test_loader,no_of_models_to_ensemble=30,ensemble_config=1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Accuracy: 0.7847




In [None]:
# High-noise data, target_10_val: one model per training batch, 30 randomly
# chosen models vote on each test batch, 2 TabPFN ensemble configurations each.
fit_test_tabpfn(train_dataloader=Noise_High_t10v_train_loader,test_dataloader=Noise_High_t10v_test_loader,no_of_models_to_ensemble=30,ensemble_config=2)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Accuracy: 0.6160




In [None]:
def add_to_dict(dict,key,value):
    """Add ``value`` to ``dict[key]``, starting from 0 when the key is new.

    The mapping is updated in place and also returned.
    """
    dict.setdefault(key, 0)
    dict[key] += value
    return dict

def _mean_confidence_by_label(votes, threshold):
    """Average the vote weights per label.

    ``votes`` is a sequence of ``(label, probability)`` pairs. With a numeric
    ``threshold``, a probability below it counts as weight 0, and zero-weight
    votes do not count toward a label's average. With ``threshold=None`` every
    vote counts. Labels that end up with no counted vote are left out.
    """
    sums = {}
    counts = {}
    for label, probability in votes:
        weight = probability
        if threshold is not None and probability < threshold:
            weight = 0
        # Every label enters `sums`, even with weight 0, so that ties are
        # still broken by first appearance.
        sums[label] = sums.get(label, 0) + weight
        if threshold is None or weight != 0:
            counts[label] = counts.get(label, 0) + 1
    return {label: sums[label] / counts[label] for label in sums if label in counts}


def predicition_assembler(predictions1,probabilities1,predictions2,probabilities2,split,threshold = 0):
    """Combine the votes of two groups of models into one label per sample.

    ``predictions1[m][s]`` / ``probabilities1[m][s]`` are the label and winning
    probability of the m-th first-group model for sample ``s``. The second
    group's labels are shifted by ``split + 1`` so both groups share one label
    space. Per sample, each label's score is the mean probability of the votes
    it received, ignoring votes below ``threshold``; the label with the highest
    score wins.

    Previously a label whose votes were all ignored raised KeyError (it had a
    sum but no count). Such labels are now skipped, and if every vote for a
    sample is ignored the unfiltered votes decide instead.

    Returns a list with one winning label per sample; an empty list when no
    models are given.
    """
    final_winner=[]
    no_of_models=len(predictions1)
    if no_of_models == 0:
        return final_winner
    batch_size=len(predictions1[0])
    for i in range(batch_size):
        votes = []
        for j in range(no_of_models):
            votes.append((predictions1[j][i], probabilities1[j][i]))
            votes.append((predictions2[j][i] + split + 1, probabilities2[j][i]))

        mean_by_label = _mean_confidence_by_label(votes, threshold)
        if not mean_by_label:
            # Nothing passed the threshold: fall back to the raw votes.
            mean_by_label = _mean_confidence_by_label(votes, None)

        final_winner.append(max(mean_by_label,key=mean_by_label.get))
    return final_winner

def _fit_era_split_models(train_dataloader, first_label, last_label, ensemble_config, desc, max_rows=1000):
    """Fit TabPFN classifiers on the rows whose label is in [first_label, last_label].

    Matching rows are buffered across batches. Whenever the buffer holds at
    least ``max_rows`` rows, or the last batch has been read, one classifier is
    fitted on the first ``max_rows`` buffered rows. Labels are shifted so that
    ``first_label`` becomes 0. Rows still buffered after the last fit are not
    used.

    Returns the list of fitted classifiers (possibly empty).
    """
    models = []
    buffered_data = torch.empty(0)
    buffered_target = torch.empty(0)
    last_batch_idx = len(train_dataloader) - 1
    for batch_idx, (data, target) in enumerate(tqdm(train_dataloader, desc=desc)):
        # A boolean mask replaces the former sentinel label 9, which collided
        # with a real class once `split` reached 9.
        in_range = (target >= first_label) & (target <= last_label)
        buffered_data = torch.cat([buffered_data, data[in_range]], dim=0)
        buffered_target = torch.cat([buffered_target, target[in_range] - first_label], dim=0)

        if len(buffered_data) < max_rows and batch_idx != last_batch_idx:
            continue
        if len(buffered_data) == 0:
            # Nothing to fit on (can happen on the last batch).
            continue

        to_fit_data, buffered_data = buffered_data[:max_rows], buffered_data[max_rows:]
        to_fit_target, buffered_target = buffered_target[:max_rows], buffered_target[max_rows:]
        # Build a classifier only when it is actually going to be fitted.
        classifier = TabPFNClassifier(device=device, N_ensemble_configurations=ensemble_config)
        classifier.fit(to_fit_data, to_fit_target)
        models.append(classifier)
    return models


def fit_test_tabpfn_for_era(train_dataloader,test_dataloader,no_of_models_to_ensemble=1,ensemble_config=1,split=6):
    """Train two groups of TabPFN classifiers on disjoint label ranges and print test accuracy.

    Group 1 is trained on labels ``0..split``; group 2 on labels
    ``split+1..2*split+1``, relabelled to ``0..split``. Labels outside both
    ranges are not used for training. For every test batch, randomly chosen
    (group 1, group 2) model pairs predict, and ``predicition_assembler``
    merges their votes into one label per sample.

    Args:
        train_dataloader: iterable of ``(data, target)`` batches supporting
            ``len()``; it is iterated twice, once per group.
        test_dataloader: iterable of ``(data, target)`` batches to evaluate.
        no_of_models_to_ensemble: number of model pairs sampled per test
            batch; capped at the number of available pairs.
        ensemble_config: forwarded to ``TabPFNClassifier`` as
            ``N_ensemble_configurations``.
        split: highest label handled by group 1.

    Raises:
        ValueError: if either group ended up without a fitted model.

    Prints the accuracy over all test samples; returns nothing.
    """
    split_1 = _fit_era_split_models(
        train_dataloader, 0, split, ensemble_config, desc="FITTING FOR SPLIT1"
    )
    split_2 = _fit_era_split_models(
        train_dataloader, split + 1, 2 * split + 1, ensemble_config, desc="FITTING FOR SPLIT2"
    )

    print("NO OF MODELS TO ENSEMBLE:",no_of_models_to_ensemble)
    max_models = min(len(split_1),len(split_2))
    if no_of_models_to_ensemble>max_models:
        no_of_models_to_ensemble=max_models

    print("MAX MODELS AVAILABLE:",max_models)

    if max_models == 0:
        raise ValueError("no models were fitted for at least one label range; nothing to ensemble")

    # Pair the i-th model of each group; built once instead of per test batch.
    zipped_splits = list(zip(split_1, split_2))

    total=0
    correct=0
    for data,target in tqdm(test_dataloader, desc="TESTING"):
        each_model_prob_split1=[]
        each_model_pred_split1=[]
        each_model_prob_split2=[]
        each_model_pred_split2=[]
        random_models = random.sample(zipped_splits, no_of_models_to_ensemble)
        random_models_split_1, random_models_split_2 = zip(*random_models)
        for model in random_models_split_1:
            y_pred,p_pred=model.predict(data,return_winning_probability=True)
            each_model_prob_split1.append(p_pred)
            each_model_pred_split1.append(y_pred)
        for model in random_models_split_2:
            y_pred,p_pred=model.predict(data,return_winning_probability=True)
            each_model_prob_split2.append(p_pred)
            each_model_pred_split2.append(y_pred)
        y_pred_summ=predicition_assembler(each_model_pred_split1,each_model_prob_split1,each_model_pred_split2,each_model_prob_split2,split=split)
        total+=len(y_pred_summ)
        correct+=sum(1 for p, t in zip(y_pred_summ, target) if p == t)
    print(f"Accuracy: {correct/total:.4f}")

  and should_run_async(code)


In [None]:
# Noise-free data, "era" target: labels split at 5, 4 model pairs per test
# batch, 4 TabPFN ensemble configurations per model.
fit_test_tabpfn_for_era(train_dataloader=Noise_0_era_train_loader,test_dataloader=Noise_0_era_test_loader,no_of_models_to_ensemble=4,ensemble_config=4,split=5)

FITTING FOR SPLIT1: 100%|██████████| 13/13 [00:00<00:00, 150.96it/s]
FITTING FOR SPLIT2: 100%|██████████| 13/13 [00:00<00:00, 188.38it/s]


NO OF MODELS TO ENSEMBLE: 4
MAX MODELS AVAILABLE: 4


TESTING: 100%|██████████| 4/4 [00:08<00:00,  2.04s/it]

Accuracy: 0.8205





In [None]:
# Low-noise data, "era" target: labels split at 5, 4 model pairs per test
# batch, 1 TabPFN ensemble configuration per model.
# NOTE(review): the saved output of this cell reports 10 models to ensemble
# and ends in KeyboardInterrupt, so it was not produced by this exact call -
# re-run the cell to refresh it.
fit_test_tabpfn_for_era(train_dataloader=Noise_Low_era_train_loader,test_dataloader=Noise_Low_era_test_loader,no_of_models_to_ensemble=4,ensemble_config=1,split=5)

FITTING FOR SPLIT1: 100%|██████████| 488/488 [00:01<00:00, 304.84it/s]
FITTING FOR SPLIT2: 100%|██████████| 488/488 [00:01<00:00, 298.69it/s]


NO OF MODELS TO ENSEMBLE: 10
MAX MODELS AVAILABLE: 125


TESTING:   5%|▍         | 6/122 [00:07<02:15,  1.17s/it]


KeyboardInterrupt: 

In [None]:
# High-noise data, "era" target: labels split at 6, 10 model pairs per test
# batch, 1 TabPFN ensemble configuration per model.
fit_test_tabpfn_for_era(train_dataloader=Noise_High_era_train_loader,test_dataloader=Noise_High_era_test_loader,no_of_models_to_ensemble=10,ensemble_config=1,split=6)

FITTING FOR SPLIT1: 100%|██████████| 390/390 [00:03<00:00, 127.56it/s]
FITTING FOR SPLIT2: 100%|██████████| 390/390 [00:03<00:00, 121.61it/s]
TESTING: 100%|██████████| 98/98 [02:23<00:00,  1.47s/it]

Accuracy: 0.3313



