In [1]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

In [2]:

def load_ucimldatasets(dataset_ids: list[int] = None) -> list:
    """
    Load datasets from UCI ML Repository based on specified dataset IDs.
    Returns a list of loaded datasets in the same format as load_openml_datasets.
    
    Args:
        dataset_ids: List of UCI ML Repository dataset IDs to fetch.
                    If None, uses default list: [1, 189, 942, 890, 880, 925, 913, 713]
    
    Returns:
        List of dictionaries containing dataset information
    """

    
    # Default dataset IDs if none provided
    if dataset_ids is None:
        dataset_ids = [1, 189, 942, 890, 880, 925, 913, 713]
    
    print(f"Loading {len(dataset_ids)} datasets from UCI ML Repository")
    print(f"Dataset IDs: {dataset_ids}")
    
    loaded_datasets = []
    successful_loads = 0
    failed_loads = []
    
    for did in dataset_ids:
        try:
            print(f"→ Fetching UCI dataset {did}...", end=" ", flush=True)
            
            # Fetch dataset from UCI ML Repository
            dataset = fetch_ucirepo(id=did)
            
            # Extract features and target
            X = dataset.data.features
            y = dataset.data.targets
            
            # Handle the case where y might have multiple columns
            if isinstance(y, pd.DataFrame):
                if y.shape[1] == 1:
                    y = y.iloc[:, 0]  # Convert to Series if single column
                else:
                    # If multiple targets, take the first one
                    print(f"[Multiple targets, using first: {y.columns[0]}]", end=" ")
                    y = y.iloc[:, 0]
            
            # Convert to pandas DataFrame/Series if not already
            if not isinstance(X, pd.DataFrame):
                X = pd.DataFrame(X)
            if not isinstance(y, pd.Series):
                y = pd.Series(y)
            
            # Check if it's a regression task (numeric target)
            if not pd.api.types.is_numeric_dtype(y):
                print(f"[Skipped: Non-numeric target]")
                failed_loads.append((did, "Non-numeric target"))
                continue
            
            # Create categorical indicator (True for non-numeric columns)
            categorical_indicator = []
            for col in X.columns:
                is_categorical = not pd.api.types.is_numeric_dtype(X[col])
                categorical_indicator.append(is_categorical)
            
            # Get attribute names
            attribute_names = list(X.columns)
            
            # Add to loaded datasets
            loaded_datasets.append({
                "X": X,
                "y": y,
                "categorical_indicator": categorical_indicator,
                "attribute_names": attribute_names,
                "dataset_id": did,
                "dataset_name": getattr(dataset.metadata, 'name', f'UCI_Dataset_{did}'),
                "source": "UCI_ML_Repository"
            })
            
            successful_loads += 1
            print(f"✓ Shape: {X.shape}, Target: {y.shape}")
            
        except Exception as e:
            print(f"❌ Error: {e}")
            failed_loads.append((did, str(e)))
            continue
    
    print(f"\nUCI ML Repository Loading Summary:")
    print(f"• Successfully loaded: {successful_loads}/{len(dataset_ids)} datasets")
    print(f"• Failed to load: {len(failed_loads)} datasets")
    
    if failed_loads:
        print("• Failed dataset IDs and reasons:")
        for did, reason in failed_loads:
            print(f"  - Dataset {did}: {reason}")
    
    if loaded_datasets:
        print("• Successfully loaded dataset details:")
        for ds in loaded_datasets:
            print(f"  - ID {ds['dataset_id']}: {ds['dataset_name']} - {ds['X'].shape} features, {len(ds['y'])} samples")
    
    return loaded_datasets

# Example usage
loaded_datasets = load_ucimldatasets([1, 189, 942, 890, 880, 925, 913, 713])
if loaded_datasets:
    print(f"\nLoaded {len(loaded_datasets)} datasets successfully!")
    for ds in loaded_datasets:
        print(f"Dataset {ds['dataset_id']} ({ds['dataset_name']}): {ds['X'].shape} features, {len(ds['y'])} samples")
else:
    print("No datasets loaded successfully.")

Loading 8 datasets from UCI ML Repository
Dataset IDs: [1, 189, 942, 890, 880, 925, 913, 713]
→ Fetching UCI dataset 1... ✓ Shape: (4177, 8), Target: (4177,)
→ Fetching UCI dataset 189... ✓ Shape: (4177, 8), Target: (4177,)
→ Fetching UCI dataset 189... [Multiple targets, using first: motor_UPDRS] ✓ Shape: (5875, 19), Target: (5875,)
→ Fetching UCI dataset 942... [Multiple targets, using first: motor_UPDRS] ✓ Shape: (5875, 19), Target: (5875,)
→ Fetching UCI dataset 942... [Skipped: Non-numeric target]
→ Fetching UCI dataset 890... [Skipped: Non-numeric target]
→ Fetching UCI dataset 890... ✓ Shape: (2139, 23), Target: (2139,)
→ Fetching UCI dataset 880... ✓ Shape: (2139, 23), Target: (2139,)
→ Fetching UCI dataset 880... [Multiple targets, using first: death] ✓ Shape: (9105, 42), Target: (9105,)
→ Fetching UCI dataset 925... [Multiple targets, using first: death] ✓ Shape: (9105, 42), Target: (9105,)
→ Fetching UCI dataset 925... [Multiple targets, using first: aveOralF] ✓ Shape: (1020

In [3]:
import os
import sys
import types

# Ensure the current directory is in Python path
current_dir = '/Users/surbhi/MASTERS/SS25/autoML/final-project/automl-project/automl-tabular-pipeline/src/automl'
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

os.chdir(current_dir)

# Create a simple automl module that will act as a namespace
automl = types.ModuleType('automl')
automl.__path__ = [current_dir]
sys.modules['automl'] = automl

# Import modules directly and add them to the automl namespace
import pre_processor
import constants
import FeatureSelector

automl.pre_processor = pre_processor
automl.constants = constants
automl.FeatureSelector = FeatureSelector

# Now import the modules we need
from meta_features import extract_meta_features
from meta_trainer import algorithms_eval
from constants import algorithms

# loaded_datasets = load_ucimldatasets([1, 189, 942, 890, 880, 925, 913, 713])

# records = algorithms_eval(algorithms=algorithms, datasets=loaded_datasets)

# df = pd.DataFrame(records)
# df.to_csv("meta_features_uci.csv", index=False)

#### Getting meta features from kagglehub datasets

In [1]:
import kagglehub
import os
import glob
import numpy as np
import pandas as pd

def load_kaggle_dataset(path: str) -> list:
    """
    Load datasets from a Kaggle dataset path.
    Returns a list of loaded datasets in the same format as load_ucimldatasets.
    
    Args:
        path: Path to the downloaded Kaggle dataset directory
    
    Returns:
        List of dictionaries containing dataset information
    """
    
    print(f"Loading datasets from Kaggle path: {path}")
    
    loaded_datasets = []
    successful_loads = 0
    failed_loads = []
    processed_files = set()  # Track processed files to avoid duplicates
    
    # Look for CSV files in the path
    csv_files = glob.glob(os.path.join(path, "*.csv"))
    
    if not csv_files:
        print("No CSV files found in the specified path")
        return loaded_datasets
    
    print(f"Found {len(csv_files)} CSV files")
    
    for idx, csv_file in enumerate(csv_files):
        filename = os.path.basename(csv_file)
        
        # Skip if already processed
        if filename in processed_files:
            continue
        processed_files.add(filename)
        
        try:
            print(f"→ Loading {filename}...", end=" ", flush=True)
            
            # Load CSV file
            df = pd.read_csv(csv_file)
            
            if df.empty:
                print(f"[Skipped: Empty dataset]")
                failed_loads.append((filename, "Empty dataset"))
                continue
            
            # Check for missing values and clean the data
            initial_shape = df.shape
            print(f"[Initial shape: {initial_shape}]", end=" ")
            
            # Remove rows with too many missing values (more than 50% missing)
            missing_threshold = 0.5
            df = df.dropna(thresh=int(missing_threshold * len(df.columns)))
            
            # Remove columns with too many missing values (more than 50% missing)
            df = df.dropna(axis=1, thresh=int(missing_threshold * len(df)))
            
            if df.empty:
                print(f"[Skipped: Too many missing values]")
                failed_loads.append((filename, "Too many missing values after cleaning"))
                continue
            
            # For remaining missing values, use different strategies for numeric vs categorical
            for col in df.columns:
                if pd.api.types.is_numeric_dtype(df[col]):
                    # For numeric columns, fill with median
                    if df[col].isnull().any():
                        df[col] = df[col].fillna(df[col].median())
                else:
                    # For categorical columns, fill with mode
                    if df[col].isnull().any():
                        mode_val = df[col].mode()
                        if len(mode_val) > 0:
                            df[col] = df[col].fillna(mode_val[0])
                        else:
                            df[col] = df[col].fillna('unknown')
            
            # Final check: drop any remaining rows with NaN
            df = df.dropna()
            
            if df.empty:
                print(f"[Skipped: No data left after cleaning]")
                failed_loads.append((filename, "No data left after cleaning"))
                continue
            
            print(f"[Cleaned: {initial_shape} -> {df.shape}]", end=" ")
            
            # Assume the last column is the target (you might need to adjust this)
            # or look for common target column names
            target_candidates = ['target', 'y', 'label', 'class', 'output']
            target_col = None
            
            # Check if any of the common target names exist
            for candidate in target_candidates:
                if candidate in df.columns:
                    target_col = candidate
                    break
            
            # If no common target name found, use the last column
            if target_col is None:
                target_col = df.columns[-1]
            
            # Split features and target
            X = df.drop(columns=[target_col])
            y = df[target_col]
            
            # Check if it's a regression task (numeric target)
            if not pd.api.types.is_numeric_dtype(y):
                print(f"[Skipped: Non-numeric target '{target_col}']")
                failed_loads.append((filename, f"Non-numeric target '{target_col}'"))
                continue
            
            # Additional robust NaN check for target
            if y.isnull().any():
                print(f"[Skipped: Target variable still contains NaN]")
                failed_loads.append((filename, "Target variable contains NaN"))
                continue
            
            # Additional check: ensure no infinite values
            if np.isinf(X.select_dtypes(include=[np.number])).any().any() or np.isinf(y).any():
                print(f"[Skipped: Contains infinite values]")
                failed_loads.append((filename, "Contains infinite values"))
                continue
            
            # Final NaN check for features
            if X.isnull().any().any():
                print(f"[Skipped: Features still contain NaN after cleaning]")
                failed_loads.append((filename, "Features still contain NaN after cleaning"))
                continue
            
            # Convert categorical columns to numeric using label encoding
            from sklearn.preprocessing import LabelEncoder
            for col in X.columns:
                if not pd.api.types.is_numeric_dtype(X[col]):
                    le = LabelEncoder()
                    X[col] = le.fit_transform(X[col].astype(str))
            
            # Create categorical indicator (True for originally non-numeric columns)
            categorical_indicator = []
            for col in df.drop(columns=[target_col]).columns:
                is_categorical = not pd.api.types.is_numeric_dtype(df[col])
                categorical_indicator.append(is_categorical)
            
            # Get attribute names
            attribute_names = list(X.columns)
            
            # Create dataset name from filename
            dataset_name = os.path.splitext(filename)[0]
            
            # Add to loaded datasets
            loaded_datasets.append({
                "X": X,
                "y": y,
                "categorical_indicator": categorical_indicator,
                "attribute_names": attribute_names,
                "dataset_id": idx,  # Use index as ID for Kaggle datasets
                "dataset_name": dataset_name,
                "source": "Kaggle",
                "file_path": csv_file
            })
            
            successful_loads += 1
            print(f"✓ Shape: {X.shape}, Target: {y.shape}, Target column: '{target_col}'")
            
        except Exception as e:
            print(f"❌ Error: {e}")
            failed_loads.append((filename, str(e)))
            continue
    
    print(f"\nKaggle Dataset Loading Summary:")
    print(f"• Successfully loaded: {successful_loads}/{len(set(os.path.basename(f) for f in csv_files))} unique datasets")
    print(f"• Failed to load: {len(failed_loads)} datasets")
    
    if failed_loads:
        print("• Failed files and reasons:")
        for filename, reason in failed_loads:
            print(f"  - {filename}: {reason}")
    
    if loaded_datasets:
        print("• Successfully loaded dataset details:")
        for ds in loaded_datasets:
            print(f"  - {ds['dataset_name']}: {ds['X'].shape} features, {len(ds['y'])} samples")
    
    return loaded_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Download latest version
path = kagglehub.dataset_download("sohier/calcofi")

# Load the Kaggle dataset
kaggle_datasets = load_kaggle_dataset(path)

if kaggle_datasets:
    print(f"\nLoaded {len(kaggle_datasets)} Kaggle datasets successfully!")
    records = algorithms_eval(algorithms=algorithms, datasets=kaggle_datasets)

    df = pd.DataFrame(records)
    df.to_csv("meta_features_kaggle_1.csv", index=False)
else:
    print("No Kaggle datasets loaded successfully.")

Loading datasets from Kaggle path: /Users/surbhi/.cache/kagglehub/datasets/sohier/calcofi/versions/2
Found 2 CSV files
→ Loading bottle.csv... 

  df = pd.read_csv(csv_file)


[Initial shape: (864863, 74)] [Cleaned: (864863, 74) -> (405546, 42)] [Cleaned: (864863, 74) -> (405546, 42)] ✓ Shape: (405546, 41), Target: (405546,), Target column: 'R_PRES'
→ Loading cast.csv... ✓ Shape: (405546, 41), Target: (405546,), Target column: 'R_PRES'
→ Loading cast.csv... [Initial shape: (34404, 61)] [Initial shape: (34404, 61)] 

  df = pd.read_csv(csv_file)


[Cleaned: (34404, 61) -> (34404, 44)] ✓ Shape: (34404, 43), Target: (34404,), Target column: 'Wea'

Kaggle Dataset Loading Summary:
• Successfully loaded: 2/2 unique datasets
• Failed to load: 0 datasets
• Successfully loaded dataset details:
  - bottle: (405546, 41) features, 405546 samples
  - cast: (34404, 43) features, 34404 samples

Loaded 2 Kaggle datasets successfully!
→ Processing dataset 1/2 (ID: 0)
   • Dataset shape: (405546, 41), target shape: (405546,)
✓ Shape: (34404, 43), Target: (34404,), Target column: 'Wea'

Kaggle Dataset Loading Summary:
• Successfully loaded: 2/2 unique datasets
• Failed to load: 0 datasets
• Successfully loaded dataset details:
  - bottle: (405546, 41) features, 405546 samples
  - cast: (34404, 43) features, 34404 samples

Loaded 2 Kaggle datasets successfully!
→ Processing dataset 1/2 (ID: 0)
   • Dataset shape: (405546, 41), target shape: (405546,)
   • Train shape:         Cst_Cnt  Btl_Cnt  Sta_ID  Depth_ID  Depthm  T_degC  Salnty  O2ml_L  \
69



   • Training LGBMRegressor... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8033
[LightGBM] [Info] Number of data points in the train set: 283882, number of used features: 36
[LightGBM] [Info] Start training from score 199.663638
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8033
[LightGBM] [Info] Number of data points in the train set: 283882, number of used features: 36
[LightGBM] [Info] Start training from score 199.663638
R²=0.9991
   • Training XGBRegressor... R²=0.9991
   • Training XGBRegressor... R²=0.9980
   • Training RandomForestRegressor... R²=0.9980
   • Trainin



   • Training LGBMRegressor... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7413
[LightGBM] [Info] Number of data points in the train set: 24082, number of used features: 41
[LightGBM] [Info] Start training from score 1.255336
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7413
[LightGBM] [Info] Number of data points in the train set: 24082, number of used features: 41
[LightGBM] [Info] Start training from score 1.255336
R²=0.1836
   • Training XGBRegressor... R²=0.1836
   • Training XGBRegressor... R²=0.1920
   • Training RandomForestRegressor... R²=0.1920
   • Training Rand

In [7]:
# Download latest version
path2 = kagglehub.dataset_download("sbudincsevity/szeged-weather")

# Load the Kaggle dataset
kaggle_datasets2 = load_kaggle_dataset(path2)

if kaggle_datasets2:
    print(f"\nLoaded {len(kaggle_datasets2)} Kaggle datasets successfully!")
    records = algorithms_eval(algorithms=algorithms, datasets=kaggle_datasets2)

    df = pd.DataFrame(records)
    df.to_csv("meta_features_kaggle_2.csv", index=False)
else:
    print("No Kaggle datasets loaded successfully.")

KaggleApiHTTPError: 403 Client Error.

You don't have permission to access resource at URL: https://www.kaggle.com/datasets/sbudincsevity/szeged-weather. The server reported the following issues: Permission 'datasets.get' was denied
Please make sure you are authenticated if you are trying to access a private resource or a resource requiring consent.

In [8]:
# Download latest version
path3 = kagglehub.dataset_download("smid80/weatherww2")

# Load the Kaggle dataset
kaggle_datasets3 = load_kaggle_dataset(path3)

if kaggle_datasets3:
    print(f"\nLoaded {len(kaggle_datasets3)} Kaggle datasets successfully!")
    records = algorithms_eval(algorithms=algorithms, datasets=kaggle_datasets3)

    df = pd.DataFrame(records)
    df.to_csv("meta_features_kaggle_3.csv", index=False)
else:
    print("No Kaggle datasets loaded successfully.")

Downloading from https://www.kaggle.com/api/v1/datasets/download/smid80/weatherww2?dataset_version_number=1...


100%|██████████| 1.65M/1.65M [00:00<00:00, 2.46MB/s]

Extracting files...
Loading datasets from Kaggle path: /Users/surbhi/.cache/kagglehub/datasets/smid80/weatherww2/versions/1
Found 2 CSV files
→ Loading Weather Station Locations.csv... Loading datasets from Kaggle path: /Users/surbhi/.cache/kagglehub/datasets/smid80/weatherww2/versions/1
Found 2 CSV files
→ Loading Weather Station Locations.csv... [Initial shape: (161, 8)] [Cleaned: (161, 8) -> (161, 8)] ✓ Shape: (161, 7), Target: (161,), Target column: 'Longitude'
→ Loading Summary of Weather.csv... [Initial shape: (161, 8)] [Cleaned: (161, 8) -> (161, 8)] ✓ Shape: (161, 7), Target: (161,), Target column: 'Longitude'
→ Loading Summary of Weather.csv... 


  df = pd.read_csv(csv_file)
  df = pd.read_csv(csv_file)


[Initial shape: (119040, 31)] [Cleaned: (119040, 31) -> (116293, 15)] [Skipped: Non-numeric target 'SNF']

Kaggle Dataset Loading Summary:
• Successfully loaded: 1/2 unique datasets
• Failed to load: 1 datasets
• Failed files and reasons:
  - Summary of Weather.csv: Non-numeric target 'SNF'
• Successfully loaded dataset details:
  - Weather Station Locations: (161, 7) features, 161 samples

Loaded 1 Kaggle datasets successfully!
→ Processing dataset 1/1 (ID: 0)
   • Dataset shape: (161, 7), target shape: (161,)
   • Train shape:       WBAN  NAME  STATE/COUNTRY ID  LAT  LON  ELEV   Latitude
67   32801    16                28   85  103    98  25.433333
114  12101    12                40   61   31   235  20.466667
11   81404    70                 2   34  129    12 -10.600000
65   16202   129                27  154   46     8  64.133333
85   33019    30                31  127   18     9  39.250000, Test shape:       WBAN  NAME  STATE/COUNTRY ID  LAT  LON  ELEV   Latitude
105  11601   146  



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 257
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 7
[LightGBM] [Info] Start training from score 13.076786
R²=0.2773
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 257
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 7
[LightGBM] [Info] Start training from score 13.076786
R²=0.2773
   • Training XGBRegressor... R²=0.7656
   • Training RandomForestRegressor... R²=0.7656
   • Training RandomForestRegressor... R²=0.6604
   • Training DecisionTreeRegressor.



R²=0.9624


In [9]:
# Download latest version
path4 = kagglehub.dataset_download("mirichoi0218/insurance")

# Load the Kaggle dataset
kaggle_datasets4 = load_kaggle_dataset(path4)

if kaggle_datasets4:
    print(f"\nLoaded {len(kaggle_datasets4)} Kaggle datasets successfully!")
    records = algorithms_eval(algorithms=algorithms, datasets=kaggle_datasets4)

    df = pd.DataFrame(records)
    df.to_csv("meta_features_kaggle_4.csv", index=False)
else:
    print("No Kaggle datasets loaded successfully.")

Downloading from https://www.kaggle.com/api/v1/datasets/download/mirichoi0218/insurance?dataset_version_number=1...


100%|██████████| 16.0k/16.0k [00:00<00:00, 10.3MB/s]

Extracting files...
Loading datasets from Kaggle path: /Users/surbhi/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1
Found 1 CSV files
→ Loading insurance.csv... Loading datasets from Kaggle path: /Users/surbhi/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1
Found 1 CSV files
→ Loading insurance.csv... [Initial shape: (1338, 7)] [Cleaned: (1338, 7) -> (1338, 7)] ✓ Shape: (1338, 6), Target: (1338,), Target column: 'charges'

Kaggle Dataset Loading Summary:
• Successfully loaded: 1/1 unique datasets
• Failed to load: 0 datasets
• Successfully loaded dataset details:
  - insurance: (1338, 6) features, 1338 samples

Loaded 1 Kaggle datasets successfully!
→ Processing dataset 1/1 (ID: 0)
   • Dataset shape: (1338, 6), target shape: (1338,)
   • Train shape:      age  sex     bmi  children  smoker  region
332   61    0  31.160         0       0       1
355   46    1  27.600         0       0       3
138   54    0  31.900         3       0       2
381   55    1  30.685




   • Training LGBMRegressor... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 316
[LightGBM] [Info] Number of data points in the train set: 936, number of used features: 6
[LightGBM] [Info] Start training from score 13379.157302
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 316
[LightGBM] [Info] Number of data points in the train set: 936, number of used features: 6
[LightGBM] [Info] Start training from score 13379.157302
R²=0.8559
   • Training XGBRegressor... R²=0.8559
   • Training XGBRegressor... R²=0.8385
   • Training RandomForestRegressor... R²=0.8385
   • Training Rand



R²=0.8721


In [4]:
# Download latest version
path5 = kagglehub.dataset_download("harlfoxem/housesalesprediction")

# Load the Kaggle dataset
kaggle_datasets5 = load_kaggle_dataset(path5)

if kaggle_datasets5:
    print(f"\nLoaded {len(kaggle_datasets5)} Kaggle datasets successfully!")
    records = algorithms_eval(algorithms=algorithms, datasets=kaggle_datasets5)

    df = pd.DataFrame(records)
    df.to_csv("meta_features_kaggle_5.csv", index=False)
else:
    print("No Kaggle datasets loaded successfully.")

Loading datasets from Kaggle path: /Users/surbhi/.cache/kagglehub/datasets/harlfoxem/housesalesprediction/versions/1
Found 1 CSV files
→ Loading kc_house_data.csv... [Initial shape: (21613, 21)] [Cleaned: (21613, 21) -> (21613, 21)] ✓ Shape: (21613, 20), Target: (21613,), Target column: 'sqft_lot15'

Kaggle Dataset Loading Summary:
• Successfully loaded: 1/1 unique datasets
• Failed to load: 0 datasets
• Successfully loaded dataset details:
  - kc_house_data: (21613, 20) features, 21613 samples

Loaded 1 Kaggle datasets successfully!
→ Processing dataset 1/1 (ID: 0)
   • Dataset shape: (21613, 20), target shape: (21613,)
   • Train shape:                id  date     price  bedrooms  bathrooms  sqft_living  sqft_lot  \
167    1836980160   317  807100.0         4       2.50         2680      4499   
12412  9221400335   152  570000.0         4       1.75         2340      5080   
7691   6669020490   102  320000.0         4       2.25         2190      9020   
12460  2025079045    52  6490



   • Training LGBMRegressor... [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2830
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 20
[LightGBM] [Info] Start training from score 12823.633089
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2830
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 20
[LightGBM] [Info] Start training from score 12823.633089
R²=0.5720
   • Training XGBRegressor... R²=0.5720
   • Training XGBRegressor... R²=0.5358
   • Training RandomForestRegressor... R²=0.5358
   • Train



[Error: Number of samples 15129 in the input data is greater than the maximum number of samples 10000 officially supported by TabPFN. Set `ignore_pretraining_limits=True` to override this error!]
R²=-0.0000
