In [9]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

/kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
/kaggle/input/tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv
/kaggle/input/tc-smiles/Tc_SMILES.csv
/kaggle/input/smiles-extra-data/data_dnst1.xlsx
/kaggle/input/smiles-extra-data/data_tg3.xlsx
/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv
/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train.csv
/kaggle/input/neurips-open-polymer-prediction-2025/test.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv


In [10]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
rdkit is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [15]:
# =============================================================================
# DATA LOADING, CLEANING, AND INTEGRATION
#
# The foundation of any successful QSPR model is a high-quality, well-integrated dataset.
# Our strategy involves combining the primary competition data with multiple external
# datasets to maximize the number of training examples. The process prioritizes data
# integrity through two key steps:
# 1. SMILES Validation: Using a robust function to handle and filter non-standard notations.
# 2. Prioritized Merging: Combining data while prioritizing the competition's ground truth.
# =============================================================================

import gc
import pandas as pd
from rdkit import Chem 

# --- Helper Function for Chemical Data Cleaning ---

def clean_and_validate_smiles(smiles):
    """
    Validate a SMILES string and return its canonical form, or None if unusable.

    External datasets may mark polymer attachment points with non-standard
    R-group tokens (e.g. [R], [R1]). Strings containing any of the tokens
    listed below are rejected before RDKit is asked to parse them, so only
    parsable molecules reach the feature generation stage.

    Args:
        smiles: Candidate SMILES. Anything that is not a non-empty ``str``
            (None, NaN, numbers, '') is rejected.

    Returns:
        The result of ``Chem.MolToSmiles(mol, canonical=True)`` when RDKit can
        parse the input, otherwise None.
    """
    if not isinstance(smiles, str) or len(smiles) == 0:
        return None

    # Explicitly filter known non-standard R-group notations
    bad_patterns = ('[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]', "[R']", '[R"]')
    if any(pattern in smiles for pattern in bad_patterns):
        return None

    try:
        mol = Chem.MolFromSmiles(smiles, sanitize=True)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best effort: any parser/sanitisation error means "not usable".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long cleaning run can still be stopped.
        return None
        
# --- Data Ingestion and Standardization ---
# This ingestion logic is carried over from the earlier version of the notebook;
# it is only reorganized here into a clearer structure.

print("Starting data ingestion and standardization...")

# Read the competition's train and test tables.
train = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test_df = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')

# Canonicalise every SMILES, then discard the rows that failed validation.
# Row labels are left as they were (no index reset).
train = (
    train
    .assign(SMILES=train['SMILES'].apply(clean_and_validate_smiles))
    .dropna(subset=['SMILES'])
)
# NOTE(review): test rows whose SMILES fails validation are removed here too,
# so they will have no row in the submission built from test_df later on.
test_df = (
    test_df
    .assign(SMILES=test_df['SMILES'].apply(clean_and_validate_smiles))
    .dropna(subset=['SMILES'])
)

# Function to integrate external data
def add_extra_data_clean(df_train, df_extra, target, targets_list):
    """
    Merge one external dataset into the training frame for a single target.

    The external values are coerced to numeric, their SMILES are passed through
    ``clean_and_validate_smiles``, and duplicate SMILES are averaged. The result
    is outer-merged on 'SMILES': values already present in ``df_train`` win, the
    external value only fills gaps, and previously unseen SMILES become new rows.

    Args:
        df_train (pd.DataFrame): Current training frame with a 'SMILES' column.
        df_extra (pd.DataFrame): External data with 'SMILES' and ``target``
            columns. It is NOT modified; all cleaning happens on a copy.
        target (str): Name of the property column to integrate.
        targets_list: Unused; kept so existing callers keep working.

    Returns:
        pd.DataFrame: The merged frame, or ``df_train`` itself when no usable
        external rows remain after cleaning.
    """
    # Work on a private two-column copy so the caller's frame is left untouched.
    extra = df_extra[['SMILES', target]].copy()

    # Ensure the target column is numeric before grouping
    extra[target] = pd.to_numeric(extra[target], errors='coerce')
    extra = extra.dropna(subset=[target])

    extra['SMILES'] = extra['SMILES'].apply(clean_and_validate_smiles)
    extra = extra.dropna(subset=['SMILES'])
    if extra.empty:
        return df_train

    # Several measurements of the same canonical SMILES collapse to their mean.
    extra = extra.groupby('SMILES', as_index=False)[target].mean()

    if target not in df_train.columns:
        # No existing values to prioritise: the merge creates the column directly
        # (no '_new' suffix is produced in this case).
        return pd.merge(df_train, extra, on='SMILES', how='outer')

    # Merge to fill NAs and add new SMILES
    new_col = target + '_new'
    merged = pd.merge(df_train, extra, on='SMILES', how='outer', suffixes=('', '_new'))
    merged[target] = merged[target].fillna(merged[new_col])
    return merged.drop(columns=[new_col])

# Fold every external dataset into a working copy of the competition data.
print("Loading and integrating external datasets...")
train_extended = train.copy()
TARGETS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Each entry is (path, target, processing_function); the processing function
# renames/converts the source columns so the frame exposes 'SMILES' and `target`.
datasets_to_load = [
    ('/kaggle/input/tc-smiles/Tc_SMILES.csv', 'Tc',
     lambda df: df.rename(columns={'TC_mean': 'Tc'})),
    ('/kaggle/input/tg-smiles-pid-polymer-class/TgSS_enriched_cleaned.csv', 'Tg',
     lambda df: df),
    ('/kaggle/input/smiles-extra-data/JCIM_sup_bigsmiles.csv', 'Tg',
     lambda df: df.rename(columns={'Tg (C)': 'Tg'})),
    # 'Tg [K]' is shifted by 273.15 after the rename.
    ('/kaggle/input/smiles-extra-data/data_tg3.xlsx', 'Tg',
     lambda df: df.rename(columns={'Tg [K]': 'Tg'}).assign(Tg=lambda x: x['Tg'] - 273.15)),
    ('/kaggle/input/smiles-extra-data/data_dnst1.xlsx', 'Density',
     lambda df: df.rename(columns={'density(g/cm3)': 'Density'})),
    ('/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv', 'FFV',
     lambda df: df),
]

for path, target, processor in datasets_to_load:
    try:
        # Excel sources need read_excel; everything else is treated as CSV.
        if path.endswith('.xlsx'):
            ext_df = pd.read_excel(path)
        else:
            ext_df = pd.read_csv(path)
        ext_df = processor(ext_df)
        train_extended = add_extra_data_clean(train_extended, ext_df, target, TARGETS)
    except Exception as e:
        # A failing source is reported and skipped; the remaining ones still load.
        print(f"Failed to integrate {os.path.basename(path)}: {e}")
    else:
        print(f"Successfully integrated: {os.path.basename(path)}")

# One row per SMILES: keep the first occurrence.
train_extended = train_extended.drop_duplicates(subset=['SMILES'], keep='first')

print("\nData integration summary:")
print(f"Total unique polymers for training: {train_extended['SMILES'].nunique()}")
for target in TARGETS:
    print(f"  - {target}: {train_extended[target].notna().sum():,} available samples")

# Clean up memory
gc.collect()

Starting data ingestion and standardization...
Loading and integrating external datasets...
Successfully integrated: Tc_SMILES.csv
Successfully integrated: TgSS_enriched_cleaned.csv
Successfully integrated: JCIM_sup_bigsmiles.csv
Successfully integrated: data_tg3.xlsx
Successfully integrated: data_dnst1.xlsx
Successfully integrated: dataset4.csv

Data integration summary:
Total unique polymers for training: 11327
  - Tg: 8,244 available samples
  - FFV: 7,892 available samples
  - Tc: 866 available samples
  - Density: 1,247 available samples
  - Rg: 614 available samples


1935

In [18]:
# =============================================================================
# FEATURE ENGINEERING
#
# The goal is to convert a SMILES string into a rich, numerical representation
# that a machine learning model can use. Our strategy is three-fold:
# 1. Descriptors: Calculate ~200 physicochemical properties (e.g., MolWt, LogP, TPSA)
#    that describe the molecule's bulk properties.
# 2. Fingerprints: Generate binary vectors (Morgan and MACCS keys) that encode the
#    presence or absence of specific substructural features.
# 3. Graph Features: Model the molecule as a graph and calculate topological
#    indices (e.g., diameter, cycles), which describe its connectivity and shape.
# =============================================================================

from rdkit.Chem import Descriptors, MACCSkeys, rdmolops
# Morgan fingerprints are built through the rdFingerprintGenerator API.
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
# ------------------------------------
import networkx as nx
import pandas as pd
import numpy as np

def generate_features(df_input):
    """
    Generates a comprehensive feature set from a DataFrame containing SMILES strings.

    For every row the function combines RDKit descriptors
    (``Descriptors.CalcMolDescriptors``), MACCS key bits (``maccs_<i>``),
    128-bit radius-2 Morgan fingerprint bits (``morgan_<i>``) and three graph
    features (``graph_diameter``, ``avg_shortest_path``, ``num_cycles``).

    Rows whose SMILES is missing, not a string, or not parsable by RDKit
    contribute an empty record, so after the final ``fillna(0)`` they are
    all-zero rows. Graph features that fail to compute are also stored as 0.

    Args:
        df_input (pd.DataFrame): Input DataFrame which must contain a 'SMILES' column.

    Returns:
        pd.DataFrame: A DataFrame containing all calculated features, indexed
                      identically to the input DataFrame.
    """

    all_features_list = []

    # Initialize the Morgan fingerprint generator once
    morgan_gen = GetMorganGenerator(radius=2, fpSize=128)

    for smiles in df_input['SMILES']:
        # Missing values (None/NaN) and other non-strings cannot be parsed;
        # treat them exactly like an invalid SMILES instead of raising.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

        # An empty record keeps the row position; its cells become NaN when the
        # frame is built and are zero-filled at the end.
        if mol is None:
            all_features_list.append({})
            continue

        # --- Feature Calculation ---
        # 1. Descriptors
        descriptors = Descriptors.CalcMolDescriptors(mol)

        # 2. Fingerprints
        maccs_fp = {f'maccs_{i}': bit for i, bit in enumerate(MACCSkeys.GenMACCSKeys(mol))}
        morgan_fp = {f'morgan_{i}': bit for i, bit in enumerate(morgan_gen.GetFingerprint(mol))}

        # 3. Graph-based features
        graph_features = {}
        try:
            adj = rdmolops.GetAdjacencyMatrix(mol)
            G = nx.from_numpy_array(adj)
            if nx.is_connected(G):
                graph_features['graph_diameter'] = nx.diameter(G)
                graph_features['avg_shortest_path'] = nx.average_shortest_path_length(G)
            else:
                # Path-length measures are undefined for disconnected graphs.
                graph_features['graph_diameter'] = 0
                graph_features['avg_shortest_path'] = 0
            graph_features['num_cycles'] = len(list(nx.cycle_basis(G)))
        except Exception:
            # Best effort: a failure here must not lose the other features.
            # Exception (not a bare except) keeps Ctrl-C working in this long loop.
            graph_features['graph_diameter'] = np.nan
            graph_features['avg_shortest_path'] = np.nan
            graph_features['num_cycles'] = np.nan

        # Combine all features for the current molecule
        combined_features = {**descriptors, **maccs_fp, **morgan_fp, **graph_features}
        all_features_list.append(combined_features)

    # Create the final DataFrame and fill any missing values that may have occurred.
    features_df = pd.DataFrame(all_features_list, index=df_input.index).fillna(0)

    return features_df

def augment_smiles_dataset(smiles_list, labels, num_augments=1):
    """
    Expand a SMILES/label dataset with randomized SMILES spellings.

    Each parsable SMILES is emitted unchanged, followed by ``num_augments``
    strings produced by ``Chem.MolToSmiles(mol, doRandom=True)``; every emitted
    string carries the label of its source entry. Entries RDKit cannot parse
    are dropped together with their label.

    Returns:
        tuple: (list of SMILES strings, numpy array of the matching labels).
    """
    out_smiles, out_labels = [], []

    for source_smiles, source_label in zip(smiles_list, labels):
        parsed = Chem.MolFromSmiles(source_smiles)
        if parsed is not None:
            # Randomized atom orderings of the same molecule.
            variants = [Chem.MolToSmiles(parsed, doRandom=True) for _ in range(num_augments)]
            out_smiles.extend([source_smiles, *variants])
            out_labels.extend([source_label] * (1 + len(variants)))

    return out_smiles, np.array(out_labels)

def separate_subtables(train_df, labels=None):
    """
    Split the training frame into one two-column table per target.

    Args:
        train_df (pd.DataFrame): Frame with a 'SMILES' column and one column
            per label.
        labels (list[str] | None): Target columns to extract. Defaults to the
            five targets 'Tg', 'FFV', 'Tc', 'Density', 'Rg'.

    Returns:
        dict[str, pd.DataFrame]: For each label, the ['SMILES', label] columns
        restricted to rows where that label is not missing. Row labels of
        ``train_df`` are preserved, so the tables can be used to index other
        frames that share its index.
    """
    if labels is None:
        labels = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    # A single .loc call selects rows and columns together (no chained indexing).
    return {
        label: train_df.loc[train_df[label].notna(), ['SMILES', label]]
        for label in labels
    }

In [19]:
# =============================================================================
# FEATURE SELECTION CONFIGURATION
#
# Not all molecular features are equally predictive for every target property. For instance,
# features related to molecular volume and weight are critical for 'Density', while
# features describing chain flexibility (e.g., 'NumRotatableBonds') are more relevant
# for 'Tg'.
#
# This dictionary defines a manually curated list of features for each of the five
# target models. This expert-driven feature selection reduces model complexity,
# decreases the risk of overfitting on irrelevant features, and can lead to
# better performance and interpretability.
# =============================================================================

# Descriptors that every target model receives, whatever its own list says.
required_descriptors = {
    'MolWt', 'MolLogP', 'TPSA', 'NumRotatableBonds', 'HeavyAtomCount',
    'graph_diameter', 'num_cycles', 'avg_shortest_path'
}

# Target -> feature names for that target's model. Each hand-picked list is
# merged with `required_descriptors`, de-duplicated and sorted, so the column
# order is the same on every run.
filters = {
    target_name: sorted(required_descriptors.union(picked_features))
    for target_name, picked_features in [
        ('Tg', [
            'BalabanJ','BertzCT','Chi1','Chi3n','Chi4n','EState_VSA4','EState_VSA8',
            'FpDensityMorgan3','HallKierAlpha','Kappa3','MaxAbsEStateIndex','MolLogP',
            'NumAmideBonds','NumHeteroatoms','NumHeterocycles','NumRotatableBonds',
            'PEOE_VSA14','Phi','RingCount','SMR_VSA1','SPS','SlogP_VSA1','SlogP_VSA5',
            'SlogP_VSA8','TPSA','VSA_EState1','VSA_EState4','VSA_EState6','VSA_EState7',
            'VSA_EState8','fr_C_O_noCOO','fr_NH1','fr_benzene','fr_bicyclic','fr_ether',
            'fr_unbrch_alkane'
        ]),
        ('FFV', [
            'AvgIpc','BalabanJ','BertzCT','Chi0','Chi0n','Chi0v','Chi1','Chi1n','Chi1v',
            'Chi2n','Chi2v','Chi3n','Chi3v','Chi4n','EState_VSA10','EState_VSA5',
            'EState_VSA7','EState_VSA8','EState_VSA9','ExactMolWt','FpDensityMorgan1',
            'FpDensityMorgan2','FpDensityMorgan3','FractionCSP3','HallKierAlpha',
            'HeavyAtomMolWt','Kappa1','Kappa2','Kappa3','MaxAbsEStateIndex',
            'MaxEStateIndex','MinEStateIndex','MolLogP','MolMR','MolWt','NHOHCount',
            'NOCount','NumAromaticHeterocycles','NumHAcceptors','NumHDonors',
            'NumHeterocycles','NumRotatableBonds','PEOE_VSA14','RingCount','SMR_VSA1',
            'SMR_VSA10','SMR_VSA3','SMR_VSA5','SMR_VSA6','SMR_VSA7','SMR_VSA9','SPS',
            'SlogP_VSA1','SlogP_VSA10','SlogP_VSA11','SlogP_VSA12','SlogP_VSA2',
            'SlogP_VSA3','SlogP_VSA4','SlogP_VSA5','SlogP_VSA6','SlogP_VSA7',
            'SlogP_VSA8','TPSA','VSA_EState1','VSA_EState10','VSA_EState2',
            'VSA_EState3','VSA_EState4','VSA_EState5','VSA_EState6','VSA_EState7',
            'VSA_EState8','VSA_EState9','fr_Ar_N','fr_C_O','fr_NH0','fr_NH1',
            'fr_aniline','fr_ether','fr_halogen','fr_thiophene'
        ]),
        ('Tc', [
            'BalabanJ','BertzCT','Chi0','EState_VSA5','ExactMolWt','FpDensityMorgan1',
            'FpDensityMorgan2','FpDensityMorgan3','HeavyAtomMolWt','MinEStateIndex',
            'MolWt','NumAtomStereoCenters','NumRotatableBonds','NumValenceElectrons',
            'SMR_VSA10','SMR_VSA7','SPS','SlogP_VSA6','SlogP_VSA8','VSA_EState1',
            'VSA_EState7','fr_NH1','fr_ester','fr_halogen'
        ]),
        ('Density', [
            'BalabanJ','Chi3n','Chi3v','Chi4n','EState_VSA1','ExactMolWt',
            'FractionCSP3','HallKierAlpha','Kappa2','MinEStateIndex','MolMR','MolWt',
            'NumAliphaticCarbocycles','NumHAcceptors','NumHeteroatoms',
            'NumRotatableBonds','SMR_VSA10','SMR_VSA5','SlogP_VSA12','SlogP_VSA5',
            'TPSA','VSA_EState10','VSA_EState7','VSA_EState8'
        ]),
        ('Rg', [
            'AvgIpc','Chi0n','Chi1v','Chi2n','Chi3v','ExactMolWt','FpDensityMorgan1',
            'FpDensityMorgan2','FpDensityMorgan3','HallKierAlpha','HeavyAtomMolWt',
            'Kappa3','MaxAbsEStateIndex','MolWt','NOCount','NumRotatableBonds',
            'NumUnspecifiedAtomStereoCenters','NumValenceElectrons','PEOE_VSA14',
            'PEOE_VSA6','SMR_VSA1','SMR_VSA5','SPS','SlogP_VSA1','SlogP_VSA2',
            'SlogP_VSA7','SlogP_VSA8','VSA_EState1','VSA_EState8','fr_alkyl_halide',
            'fr_halogen'
        ]),
    ]
}

print("Feature selection filters defined.")

Feature selection filters defined.


In [20]:
# =============================================================================
# MODELING CONFIGURATION AND wMAE WEIGHTS
#
# This cell centralizes all parameters for the modeling stage.
# =============================================================================
from sklearn.metrics import mean_absolute_error
import numpy as np

def calculate_wmae_weights(df, targets):
    """
    Compute one wMAE weight per target column of ``df``.

    For target i with n_i non-missing values and observed range
    r_i = max - min, the weight is

        (1 / r_i) * K * (1 / sqrt(n_i)) / sum_j(1 / sqrt(n_j))

    where K is the number of targets.

    Returns:
        dict: target name -> weight.
    """
    target_values = df[targets]
    sample_counts = target_values.notna().sum()
    value_ranges = target_values.max() - target_values.min()

    # Targets with fewer samples get a larger share of the total weight.
    scarcity = 1 / np.sqrt(sample_counts)
    scarcity_share = len(targets) * scarcity / np.sum(scarcity)

    return ((1 / value_ranges) * scarcity_share).to_dict()

# Compute the per-target wMAE weights from the sample counts and value ranges
# found in train_extended (competition data plus the integrated external data).
# NOTE(review): the message below calls these "official", but they are derived
# from the extended training set — confirm they match the competition's own
# weighting before comparing the OOF score with the leaderboard.
wmae_weights = calculate_wmae_weights(train_extended, TARGETS)
print("Official wMAE Weights:", wmae_weights)


# Pre-tuned XGBoost hyperparameters, one dict per target. Each tuple below is
# (n_estimators, learning_rate, max_depth, reg_lambda); the settings shared by
# every model are appended after them:
#   random_state=42    -> fixed seed
#   n_jobs=-1          -> use all available CPU cores
#   tree_method='hist' -> fast histogram-based gradient boosting
model_params = {
    target_name: dict(
        zip(('n_estimators', 'learning_rate', 'max_depth', 'reg_lambda'), tuned_values),
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
    )
    for target_name, tuned_values in [
        ('Tg', (2173, 0.0672, 6, 5.545)),
        ('FFV', (2202, 0.0722, 4, 2.887)),
        ('Tc', (1488, 0.0104, 5, 9.970)),
        ('Density', (1958, 0.1095, 5, 3.074)),
        ('Rg', (520, 0.0732, 5, 0.971)),
    ]
}


print("\nModeling and wMAE configurations are set.")

Official wMAE Weights: {'Tg': 0.0005876219273313449, 'FFV': 0.8189855294394338, 'Tc': 0.8811493366469583, 'Density': 0.8387398500722527, 'Rg': 0.0647523046595221}

Modeling and wMAE configurations are set.


In [21]:
# =============================================================================
# PRE-COMPUTATION OF MOLECULAR FEATURES
# =============================================================================
import time
print("Starting feature pre-computation for all unique molecules...")
start_time = time.time()

# Generate features for the entire training and test sets ONCE.
train_features_df = generate_features(train_extended)
test_features_df = generate_features(test_df)

# Align columns to ensure consistency.
# The missing-column lists follow the frames' own column order, so the final
# layout is the same on every run (iterating a set difference is not ordered).
train_cols = set(train_features_df.columns)
test_cols = set(test_features_df.columns)
missing_in_test = [col for col in train_features_df.columns if col not in test_cols]
missing_in_train = [col for col in test_features_df.columns if col not in train_cols]

# One reindex per frame adds all absent columns (filled with 0) in a single
# step instead of inserting them one at a time, and gives the test frame
# exactly the training column order.
train_features_df = train_features_df.reindex(
    columns=[*train_features_df.columns, *missing_in_train], fill_value=0
)
test_features_df = test_features_df.reindex(columns=train_features_df.columns, fill_value=0)

end_time = time.time()
print(f"Feature generation complete for {len(train_features_df)} train and {len(test_features_df)} test molecules.")
print(f"Total features: {len(train_features_df.columns)}. Time taken: {end_time - start_time:.2f} seconds.")

Starting feature pre-computation for all unique molecules...
Feature generation complete for 11327 train and 3 test molecules.
Total features: 515. Time taken: 311.54 seconds.


In [22]:
# =============================================================================
# MODEL TRAINING WITH K-FOLD CROSS-VALIDATION
# =============================================================================
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

# Prepare data subsets and initialize storage for results
subtables = separate_subtables(train_extended)
oof_scores = {}
final_test_predictions = pd.DataFrame({'id': test_df.index})

# --- Main Training Loop ---
# One independent model family per target: each target is trained only on the
# rows where it is observed, using its own curated feature list.
for label in TARGETS:
    print(f"\n--- Processing Target: {label} ---")

    # 1. Select data for the current target
    target_df = subtables[label]
    X = train_features_df.loc[target_df.index]
    y = target_df[label]

    # 2. Apply the specific feature filter for this target
    X = X[filters[label]]
    X_test = test_features_df[filters[label]]

    # 3. K-Fold Cross-Validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds_target = np.zeros(len(X))
    test_preds_target = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Initialize the model with pre-defined parameters.
        # eval_metric and early_stopping_rounds are estimator parameters;
        # passing them to fit() is deprecated and rejected by newer XGBoost.
        model = XGBRegressor(
            **model_params[label],
            eval_metric='mae',
            early_stopping_rounds=50,  # Stop if MAE doesn't improve for 50 rounds
        )
        # NOTE(review): the fold that drives early stopping is also the fold
        # that is scored, so the OOF MAE below is likely somewhat optimistic.
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  verbose=0)

        # Store predictions
        oof_preds_target[val_idx] = model.predict(X_val)
        test_preds_target.append(model.predict(X_test))

    # 4. Evaluate and store results for this target
    mae = mean_absolute_error(y, oof_preds_target)
    oof_scores[label] = mae
    print(f"OOF MAE for {label}: {mae:.5f}")

    # Average predictions across all 5 folds for the final test prediction
    final_test_predictions[label] = np.mean(test_preds_target, axis=0)

# --- Final Validation Score Calculation ---
# Weighted sum of the per-target OOF MAEs.
wmae_score = 0
for label in TARGETS:
    wmae_score += wmae_weights[label] * oof_scores[label]

print(f"\n--- Validation Complete ---")
print(f"Final Calculated OOF wMAE Score: {wmae_score:.6f}")

# --- Create Submission File ---
submission_df = final_test_predictions.copy()
submission_df['id'] = test_df['id'].values # Ensure correct IDs are used
submission_df.to_csv('submission.csv', index=False)

print("\n✅ submission.csv created successfully.")
print(submission_df.head())


--- Processing Target: Tg ---




OOF MAE for Tg: 28.88676

--- Processing Target: FFV ---




OOF MAE for FFV: 0.00615

--- Processing Target: Tc ---




OOF MAE for Tc: 0.03457

--- Processing Target: Density ---




OOF MAE for Density: 0.05709

--- Processing Target: Rg ---




OOF MAE for Rg: 1.75856

--- Validation Complete ---
Final Calculated OOF wMAE Score: 0.214226

✅ submission.csv created successfully.
           id          Tg       FFV        Tc   Density         Rg
0  1109053969  156.643921  0.373301  0.185462  1.151829  19.528582
1  1422188626  165.612869  0.376664  0.256528  1.126175  20.328976
2  2032016830  135.780090  0.350843  0.235498  1.132888  19.900789


In [24]:
# =============================================================================
# SAVE PRODUCTION ARTIFACTS (FINAL ROBUST VERSION)
# =============================================================================
import joblib
import os
from sklearn.feature_selection import VarianceThreshold

os.makedirs('production_models', exist_ok=True)

for label in TARGETS:
    print(f"--- Finalizing and Saving Model for: {label} ---")

    # Rows where this target is observed, restricted to its curated feature list.
    target_df = subtables[label]
    y = target_df[label]
    X = train_features_df.loc[target_df.index, filters[label]]

    # Drop near-constant columns. Only the surviving column NAMES are persisted:
    # a plain Python list carries no dependency on a particular library version.
    # NOTE(review): this variance filter is not applied in the cross-validation
    # loop, which trains on every column in filters[label].
    selector = VarianceThreshold(threshold=0.01).fit(X)
    kept_columns = X.columns[selector.get_support()].tolist()
    joblib.dump(kept_columns, f'production_models/{label}_kept_columns.joblib')
    print(f"   Saved list of {len(kept_columns)} kept columns.")

    # Fit the final model on all available rows using the kept columns only.
    X_selected = X[kept_columns]
    final_model = XGBRegressor(**model_params[label]).fit(X_selected, y)

    joblib.dump(final_model, f'production_models/{label}_model.joblib')
    print(f"   Saved model.")

print("\n✅ All production artifacts have been saved successfully.")

--- Finalizing and Saving Model for: Tg ---
   Saved list of 41 kept columns.
   Saved model.
--- Finalizing and Saving Model for: FFV ---
   Saved list of 86 kept columns.
   Saved model.
--- Finalizing and Saving Model for: Tc ---
   Saved list of 30 kept columns.
   Saved model.
--- Finalizing and Saving Model for: Density ---
   Saved list of 29 kept columns.
   Saved model.
--- Finalizing and Saving Model for: Rg ---
   Saved list of 37 kept columns.
   Saved model.

✅ All production artifacts have been saved successfully.
