1. Download and import the model

Set up a baseline from Hugging Face:
https://huggingface.co/alana89/TabSTAR


In [None]:

# One-time setup, left commented out so a full re-run does not reinstall the package.
# Uncomment to install TabSTAR; `%pip` targets the running kernel's environment.
# %pip install tabstar


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




2. Read the datasets

In [1]:
import pandas as pd

# Read the train/test CSV pair of each dataset from the working directory.
# NOTE(review): the merged preview further down shows an "Unnamed: 0" column;
# if the CSVs were written with their row index, that index is being carried
# along as a feature — confirm against the raw files.
covtype_train, covtype_test = (
    pd.read_csv("covtype_train.csv"),
    pd.read_csv("covtype_test.csv"),
)

heloc_train, heloc_test = (
    pd.read_csv("heloc_train.csv"),
    pd.read_csv("heloc_test.csv"),
)

higgs_train, higgs_test = (
    pd.read_csv("higgs_train.csv"),
    pd.read_csv("higgs_test.csv"),
)

In [2]:
import pandas as pd


def _encode_labels(frame, label_column, mapping):
    """Return a copy of ``frame`` whose ``label_column`` is replaced by ``outcome_class``.

    Args:
        frame: Source DataFrame; it is not modified.
        label_column: Name of the column holding the dataset-specific label.
        mapping: Dict from original label value to the shared integer class id.

    Returns:
        A new DataFrame with ``outcome_class`` added and ``label_column`` dropped.

    Raises:
        ValueError: If ``label_column`` contains a value that is not a key of
            ``mapping`` (``Series.map`` would otherwise turn it into NaN).
    """
    encoded = frame[label_column].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail here with the offending values instead of letting NaN labels
        # surface later as an integer-cast error.
        unexpected = sorted(frame.loc[unmapped, label_column].unique(), key=str)
        raise ValueError(
            f"Unmapped values in column {label_column!r}: {unexpected}"
        )
    return frame.assign(outcome_class=encoded).drop(columns=[label_column])


# Shared label space across the three datasets:
#   HELOC   -> 0 ("Bad"), 1 ("Good")
#   covtype -> 2..8 (Cover_Type 1..7 shifted by one)
#   Higgs   -> 9 ("b"), 10 ("s")
covtype_map = {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8}
heloc_map = {"Bad": 0, "Good": 1}
higgs_map = {"b": 9, "s": 10}

# Each *_copy frame is a new object, so the raw training frames stay untouched.
covtype_train_copy = _encode_labels(covtype_train, "Cover_Type", covtype_map)
heloc_train_copy = _encode_labels(heloc_train, "RiskPerformance", heloc_map)
higgs_train_copy = _encode_labels(higgs_train, "Label", higgs_map)

# Stack the three datasets row-wise into a single training set; columns that a
# dataset does not have are filled with NaN by concat.
merged_train = pd.concat(
    [covtype_train_copy, heloc_train_copy, higgs_train_copy],
    axis=0,
    ignore_index=True,
)
merged_train.head()


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight
0,3351.0,206.0,27.0,726.0,124.0,3813.0,192.0,252.0,180.0,2271.0,...,,,,,,,,,,
1,2732.0,129.0,7.0,212.0,1.0,1082.0,231.0,236.0,137.0,912.0,...,,,,,,,,,,
2,2572.0,24.0,9.0,201.0,25.0,957.0,216.0,222.0,142.0,2191.0,...,,,,,,,,,,
3,2824.0,69.0,13.0,417.0,39.0,3223.0,233.0,214.0,110.0,6478.0,...,,,,,,,,,,
4,2529.0,84.0,5.0,120.0,9.0,1092.0,227.0,231.0,139.0,4983.0,...,,,,,,,,,,


In [3]:
# Sanity check on the label encoding: class 1 is produced only by HELOC "Good"
# rows (covtype is encoded as 2..8 and Higgs "b"/"s" as 9/10), so the number of
# class-1 rows in the merged frame must equal the number of HELOC "Good" rows.
# The Higgs labels are strings, so comparing them with the integer 1 never
# matches and is not part of this check.
count_class1 = int((merged_train["outcome_class"] == 1).sum())
count_class1 == int((heloc_train["RiskPerformance"] == "Good").sum())



True

In [4]:
import numpy as np

# Take copies so the raw test frames are left as loaded.
covtype_test_copy, heloc_test_copy, higgs_test_copy = (
    frame.copy() for frame in (covtype_test, heloc_test, higgs_test)
)

# Stack the three test sets row-wise, in the same dataset order as the training set.
merged_test = pd.concat(
    [covtype_test_copy, heloc_test_copy, higgs_test_copy],
    axis=0,
    ignore_index=True,
)

# Give the test frame exactly the training feature columns, in the same order:
# columns absent from the test data become NaN, extra columns are dropped.
feature_columns = merged_train.columns.drop("outcome_class")
merged_test = merged_test.reindex(columns=feature_columns)

In [5]:
# Target vector: the shared integer class id.
y = merged_train["outcome_class"].astype(int)

# Feature matrix: everything except the target, with missing values (including
# the columns a dataset does not have) replaced by the sentinel -999.
X = merged_train.drop(columns=["outcome_class"]).fillna(value=-999)

# Apply the same sentinel fill to the test features.
merged_test = merged_test.fillna(value=-999)



In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tabstar.tabstar_model import TabSTARClassifier

# Hold out a stratified 20% of the labelled data for a local evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the baseline on the 80% split and persist it to disk.
tabstar = TabSTARClassifier(max_epochs=2)
tabstar.fit(X_train, y_train)
tabstar.save("baseline_4.pkl")
# To reuse a saved model instead of refitting:
# tabstar = TabSTARClassifier.load("baseline_4.pkl")

# Score the held-out split.
predictions_baseline = tabstar.predict(X_test)
report_dict = classification_report(y_test, predictions_baseline, output_dict=True)
report = pd.DataFrame(report_dict).transpose()

# Readable row labels: merged class id (as a string) -> original dataset label.
name_map = {"0": "Bad", "1": "Good"}
name_map.update({str(cover_type + 1): f"CT {cover_type}" for cover_type in range(1, 8)})
name_map.update({"9": "b", "10": "s"})

# Rows that are not class ids (e.g. the averaged rows) keep their original label.
report.index = [name_map.get(str(label), label) for label in report.index]

# Show the result. Note: the column printed as "Accuracy" is the report's
# per-class recall, renamed for display.
report_view = report.rename(columns={"recall": "Accuracy"})
print(report_view.loc[:, ["Accuracy", "precision", "f1-score"]])

KeyboardInterrupt: 

In [None]:
# Run the fitted model on the merged, sentinel-filled test features.
X_test_real = merged_test
predictions = tabstar.predict(X_test_real)

# Build the submission table: 1-based row ids next to the predicted class ids.
# Assumes `predictions` exposes `.astype` (array-like) — TODO confirm.
row_ids = range(1, len(predictions) + 1)
submission = pd.DataFrame(
    {"ID": row_ids, "Prediction": predictions.astype(int)},
    columns=["ID", "Prediction"],
)

# Write the predictions to a CSV file without the DataFrame index.
submission.to_csv("combined_test_submission4.csv", index=False)