# **SelectKBest features**

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Submission template; its 'stimulus' column is copied into every submission.
test_form = pd.read_csv(
    'TASK1_test_set_Submission_form.csv',
    encoding='latin1',
)

# Training features, training targets and test features, each read from CSV.
# NOTE: the name `x_df_lasso` is kept because later cells reference it.
x_df_lasso = pd.read_csv('x_train_selectkbest.csv')
y_df = pd.read_csv('y_df.csv')
X_test = pd.read_csv('X_test_selectedkbest.csv')

# Output column names, in the order prediction columns are written out.
target_cols = [
    'Green', 'Cucumber', 'Herbal', 'Mint', 'Woody', 'Pine', 'Floral',
    'Powdery', 'Fruity', 'Citrus', 'Tropical', 'Berry', 'Peach', 'Sweet',
    'Caramellic', 'Vanilla', 'BrownSpice', 'Smoky', 'Burnt', 'Roasted',
    'Grainy', 'Meaty', 'Nutty', 'Fatty', 'Coconut', 'Waxy', 'Dairy',
    'Buttery', 'Cheesy', 'Sour', 'Fermented', 'Sulfurous', 'Garlic.Onion',
    'Earthy', 'Mushroom', 'Musty', 'Ammonia', 'Fishy', 'Fecal',
    'Rotten.Decay', 'Rubber', 'Phenolic', 'Animal', 'Medicinal',
    'Cooling', 'Sharp', 'Chlorine', 'Alcoholic', 'Plastic', 'Ozone',
    'Metallic',
]

In [None]:
import torch
import torch.nn as nn

class PearsonCosineLoss(nn.Module):
    """Weighted blend of (1 - Pearson correlation) and (1 - cosine similarity).

    Both terms are computed per row along ``dim=1`` and the weighted sum is
    averaged over the batch.

    Args:
        alpha: Weight of the Pearson term; the cosine term gets ``1 - alpha``.
        eps: Small positive floor that keeps both denominators away from zero.
    """

    def __init__(self, alpha=0.5, eps=1e-8):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=eps)

    def forward(self, y_pred, y_true):
        """Return the scalar loss for same-shaped batched tensors.

        Rows run along dim 0 and the values being correlated along dim 1.
        """
        # Pearson correlation (1 - correlation for minimization)
        pred_centered = y_pred - y_pred.mean(dim=1, keepdim=True)
        true_centered = y_true - y_true.mean(dim=1, keepdim=True)
        covariance = (pred_centered * true_centered).sum(dim=1)
        variance_product = (
            pred_centered.pow(2).sum(dim=1) * true_centered.pow(2).sum(dim=1)
        )
        # Clamp *before* the square root: sqrt has an unbounded derivative at
        # zero, so a row whose targets or predictions are constant (zero
        # variance) would otherwise back-propagate NaN into the parameters.
        denominator = variance_product.clamp_min(self.eps ** 2).sqrt()
        pearson_loss = 1 - covariance / denominator

        # Cosine similarity (1 - similarity for minimization)
        cosine_loss = 1 - self.cosine_similarity(y_pred, y_true)

        blended = self.alpha * pearson_loss + (1 - self.alpha) * cosine_loss
        return blended.mean()
class HyperbolicModel(nn.Module):
    """Fully connected net: input -> 128 -> 64 -> output, tanh on hidden layers."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        wide, narrow = 128, 64
        self.fc1 = nn.Linear(input_dim, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Apply the two tanh hidden layers, then the linear output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.tanh(layer(hidden))
        # The output layer has no activation.
        return self.fc3(hidden)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def calculate_metrics(y_true, y_pred):
    """Return MSE, MAE, R2, mean per-column Pearson and mean per-row cosine.

    Both arguments are indexed as 2-D NumPy arrays (``[:, i]``).
    """
    metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }

    # Pearson is computed column by column; a column that is constant on
    # either side has no defined correlation and is scored as 0.
    column_corrs = []
    for col in range(y_true.shape[1]):
        truth, guess = y_true[:, col], y_pred[:, col]
        if np.std(truth) > 0 and np.std(guess) > 0:
            column_corrs.append(np.corrcoef(truth, guess)[0, 1])
        else:
            column_corrs.append(0)
    metrics['Avg Pearson Correlation'] = np.mean(column_corrs)

    # Cosine similarity is computed row by row; 1e-8 guards all-zero rows.
    norm_products = np.linalg.norm(y_true, axis=1) * np.linalg.norm(y_pred, axis=1)
    row_cosines = np.sum(y_true * y_pred, axis=1) / (norm_products + 1e-8)
    metrics['Avg Cosine Similarity'] = np.mean(row_cosines)
    return metrics

# Training routine
def _fit_minibatch(model, X_t, y_t, loss_fn, optimizer, epochs, batch_size, log_every=None):
    """Train ``model`` in place with freshly shuffled mini-batches each epoch.

    When ``log_every`` is set, the sample-weighted mean loss of every
    ``log_every``-th epoch is printed.
    """
    n = X_t.shape[0]
    for epoch in range(epochs):
        model.train()
        order = np.random.permutation(n)
        running_loss = 0.0
        for start in range(0, n, batch_size):
            batch_idx = order[start:start + batch_size]
            optimizer.zero_grad()
            loss = loss_fn(model(X_t[batch_idx]), y_t[batch_idx])
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * len(batch_idx)
        if log_every and (epoch + 1) % log_every == 0:
            # Report the epoch mean, not whichever batch happened to be last.
            print(f"Epoch {epoch+1}/{epochs}: Train Loss = {running_loss / max(n, 1):.4f}")


def train_and_evaluate(X, y, test_ratio=0.2, epochs=100, lr=1e-3, batch_size=32, alpha=0.5):
    """Validate a HyperbolicModel on a hold-out split, then refit on all data.

    A first model is trained on the training part of a fixed-seed split and
    its metrics on both parts are printed. A second, fresh model is then
    trained on all of ``X``/``y`` with the *same* mini-batch schedule, so the
    printed validation metrics describe the training recipe of the returned
    model.

    Args:
        X: 2-D feature array (``X.shape[1]`` features).
        y: 2-D target array (``y.shape[1]`` targets).
        test_ratio: Fraction of rows held out for validation.
        epochs: Number of passes over the data for each of the two fits.
        lr: Adam learning rate.
        batch_size: Mini-batch size.
        alpha: Pearson weight passed to ``PearsonCosineLoss``.

    Returns:
        ``(model, device)``: the model fitted on all data (in eval mode) and
        the torch device it lives on.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def to_tensor(array):
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    def predict_clipped(net, inputs):
        with torch.no_grad():
            raw = net(inputs).cpu().numpy()
        return np.clip(raw, 0.0, 5.0)  # Clip predictions to valid range

    # Split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_ratio, random_state=42)
    X_train_t, y_train_t, X_val_t = to_tensor(X_train), to_tensor(y_train), to_tensor(X_val)

    # Model fitted on the training split only
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    loss_fn = PearsonCosineLoss(alpha=alpha)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    _fit_minibatch(model, X_train_t, y_train_t, loss_fn, optimizer, epochs, batch_size, log_every=20)

    model.eval()
    y_train_pred = predict_clipped(model, X_train_t)
    y_val_pred = predict_clipped(model, X_val_t)

    # Print metrics on split data
    train_metrics = calculate_metrics(y_train, y_train_pred)
    val_metrics = calculate_metrics(y_val, y_val_pred)
    print("\nTrain Metrics:", train_metrics)
    print("Validation Metrics:", val_metrics)

    # Retrain a fresh model on full data with the identical schedule
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    X_full_t, y_full_t = to_tensor(X), to_tensor(y)
    _fit_minibatch(model, X_full_t, y_full_t, loss_fn, optimizer, epochs, batch_size)

    model.eval()
    y_full_pred = predict_clipped(model, X_full_t)
    full_metrics = calculate_metrics(y, y_full_pred)
    print("\nMetrics on Full Data:", full_metrics)
    return model, device
# Run training and print metrics
model, device = train_and_evaluate(
    x_df_lasso.to_numpy(dtype=np.float32),
    y_df.to_numpy(dtype=np.float32),
    test_ratio=0.2, epochs=100, lr=1e-3, batch_size=32, alpha=0.5,
)

# Test features: coerce each column to numeric (unparseable cells become NaN),
# replace NaN with 0, then convert to a float32 array.
X_test_numeric = X_test.apply(pd.to_numeric, errors='coerce')
X_test_filled = X_test_numeric.fillna(0)
X_test_arr = X_test_filled.to_numpy(dtype=np.float32)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_arr).to(device)
    test_preds = model(X_test_tensor).cpu().numpy()
test_preds = np.clip(test_preds, 0.0, 5.0)

# Save submission: 'stimulus' plus one column per name in target_cols,
# taken from the prediction columns in the same order.
test_submission = test_form[['stimulus']].copy()
for col_pos, col_name in enumerate(target_cols):
    test_submission[col_name] = test_preds[:, col_pos]

test_submission.to_csv('hyperbolic_task1_selectKBest.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Epoch 20/100: Train Loss = 0.4861
Epoch 40/100: Train Loss = 0.4786
Epoch 60/100: Train Loss = 0.4570
Epoch 80/100: Train Loss = 0.4140
Epoch 100/100: Train Loss = 0.3910
\nTrain Metrics: {'MSE': 0.08505743741989136, 'MAE': 0.2044556587934494, 'R2': -0.1326073706150055, 'Avg Pearson Correlation': np.float64(0.3333684225257838), 'Avg Cosine Similarity': np.float32(0.67982143)}
Validation Metrics: {'MSE': 0.09934374690055847, 'MAE': 0.21815361082553864, 'R2': -0.3802337646484375, 'Avg Pearson Correlation': np.float64(0.22906903505735934), 'Avg Cosine Similarity': np.float32(0.635747)}
\nMetrics on Full Data: {'MSE': 0.11345918476581573, 'MAE': 0.24520465731620789, 'R2': -0.5061202645301819, 'Avg Pearson Correlation': np.float64(0.3074414896060002), 'Avg Cosine Similarity': np.float32(0.67076266)}
Test predictions saved.
Test submission shape: (31, 52)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Custom metrics ---
def pearson_correlation_score(y_true, y_pred):
    """Return the mean per-column Pearson correlation of two 2-D arrays.

    A column whose correlation is undefined (fewer than two rows, or either
    side constant) contributes 0. Such columns are detected up front, so
    ``np.corrcoef`` is never asked to divide by a zero standard deviation
    and no RuntimeWarning is emitted.

    Args:
        y_true: 2-D array-like of targets, shape (n_samples, n_targets).
        y_pred: 2-D array-like of predictions with the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    corrs = []
    for i in range(y_true.shape[1]):
        true_col, pred_col = y_true[:, i], y_pred[:, i]
        if true_col.size < 2 or np.ptp(true_col) == 0 or np.ptp(pred_col) == 0:
            corrs.append(0.0)
            continue
        corr = np.corrcoef(true_col, pred_col)[0, 1]
        # NaN can still appear if the inputs themselves contain NaN.
        corrs.append(0.0 if np.isnan(corr) else corr)
    return np.mean(corrs)

def cosine_similarity_score(y_true, y_pred):
    """Return the mean cosine similarity between matching rows."""

    def unit(vec):
        # 1e-8 keeps an all-zero row from dividing by zero.
        return vec / (np.linalg.norm(vec) + 1e-8)

    per_row = [
        np.dot(unit(y_true[row]), unit(y_pred[row]))
        for row in range(y_true.shape[0])
    ]
    return np.mean(per_row)

def calculate_metrics(y_true, y_pred):
    """Evaluate every metric on (y_true, y_pred) and return them keyed by label."""
    scorers = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
        ("Avg Pearson Correlation", pearson_correlation_score),
        ("Avg Cosine Similarity", cosine_similarity_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# --------- Example workflow ---------
# Names expected from earlier cells:
#   x_df_lasso : DataFrame of training features
#   y_df       : DataFrame of targets (n_samples, n_targets)
#   X_test     : DataFrame of test features
#   test_form  : DataFrame with a 'stimulus' column
#   target_cols: list of 51 string column names

# 1. Convert to numpy; non-numeric cells become NaN and are then filled with 0
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# Test features get the same coercion as the training features. X_test itself
# stays a DataFrame (no rebinding), so this cell and the cells that call
# X_test.apply(...) can be re-run in any order.
X_test_arr = X_test.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# 2. 80/20 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train MultiOutput RandomForest
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf.fit(X_train, y_train)

# 4. Print metrics on splits (predictions clipped to [0, 5])
y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)
y_train_pred = np.clip(y_train_pred, 0.0, 5.0)
y_val_pred = np.clip(y_val_pred, 0.0, 5.0)
print("Training metrics:")
print(calculate_metrics(y_train, y_train_pred))
print("\nValidation metrics:")
print(calculate_metrics(y_val, y_val_pred))

# 5. Retrain on entire dataset, print metrics
rf_full = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf_full.fit(X, y)
y_full_pred = rf_full.predict(X)
y_full_pred = np.clip(y_full_pred, 0.0, 5.0)
print("\nFull dataset metrics:")
print(calculate_metrics(y, y_full_pred))

# 6. Predict on test set, save file
test_preds = rf_full.predict(X_test_arr)
test_preds = np.clip(test_preds, 0.0, 5.0)
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]
# NOTE(review): the file name says "task2_combined", but this cell reads the
# TASK1 submission form and the SelectKBest features - confirm the name.
test_submission.to_csv('rf_corr_cosine_metrics_test_task2_combined.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Training metrics:
{'MSE': 0.016460161045869375, 'MAE': 0.07081173386693435, 'R2': 0.7772488143974703, 'Avg Pearson Correlation': np.float64(0.8897664957978499), 'Avg Cosine Similarity': np.float64(0.9310904127522671)}

Validation metrics:
{'MSE': 0.05799656965050777, 'MAE': 0.14020161963860026, 'R2': 0.12946453091028354, 'Avg Pearson Correlation': np.float64(0.49833673833341974), 'Avg Cosine Similarity': np.float64(0.7463615173016137)}

Full dataset metrics:
{'MSE': 0.019063999760902898, 'MAE': 0.07744751784573044, 'R2': 0.7425729396812802, 'Avg Pearson Correlation': np.float64(0.8651278064250011), 'Avg Cosine Similarity': np.float64(0.9179334384776027)}
Test predictions saved.
Test submission shape: (31, 52)


# **PCA-Lasso features**

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Submission template; its 'stimulus' column is copied into every submission.
test_form = pd.read_csv(
    'TASK1_test_set_Submission_form.csv',
    encoding='latin1',
)

# Training features, training targets and test features, each read from CSV.
x_df_lasso = pd.read_csv('x_df_lasso_pca.csv')
y_df = pd.read_csv('y_df.csv')
X_test = pd.read_csv('X_test_pcalasso.csv')

# Output column names, in the order prediction columns are written out.
target_cols = [
    'Green', 'Cucumber', 'Herbal', 'Mint', 'Woody', 'Pine', 'Floral',
    'Powdery', 'Fruity', 'Citrus', 'Tropical', 'Berry', 'Peach', 'Sweet',
    'Caramellic', 'Vanilla', 'BrownSpice', 'Smoky', 'Burnt', 'Roasted',
    'Grainy', 'Meaty', 'Nutty', 'Fatty', 'Coconut', 'Waxy', 'Dairy',
    'Buttery', 'Cheesy', 'Sour', 'Fermented', 'Sulfurous', 'Garlic.Onion',
    'Earthy', 'Mushroom', 'Musty', 'Ammonia', 'Fishy', 'Fecal',
    'Rotten.Decay', 'Rubber', 'Phenolic', 'Animal', 'Medicinal',
    'Cooling', 'Sharp', 'Chlorine', 'Alcoholic', 'Plastic', 'Ozone',
    'Metallic',
]



In [None]:
import torch
import torch.nn as nn

class PearsonCosineLoss(nn.Module):
    """Weighted blend of (1 - Pearson correlation) and (1 - cosine similarity).

    Both terms are computed per row along ``dim=1`` and the weighted sum is
    averaged over the batch.

    Args:
        alpha: Weight of the Pearson term; the cosine term gets ``1 - alpha``.
        eps: Small positive floor that keeps both denominators away from zero.
    """

    def __init__(self, alpha=0.5, eps=1e-8):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=eps)

    def forward(self, y_pred, y_true):
        """Return the scalar loss for same-shaped batched tensors.

        Rows run along dim 0 and the values being correlated along dim 1.
        """
        # Pearson correlation (1 - correlation for minimization)
        pred_centered = y_pred - y_pred.mean(dim=1, keepdim=True)
        true_centered = y_true - y_true.mean(dim=1, keepdim=True)
        covariance = (pred_centered * true_centered).sum(dim=1)
        variance_product = (
            pred_centered.pow(2).sum(dim=1) * true_centered.pow(2).sum(dim=1)
        )
        # Clamp *before* the square root: sqrt has an unbounded derivative at
        # zero, so a row whose targets or predictions are constant (zero
        # variance) would otherwise back-propagate NaN into the parameters.
        denominator = variance_product.clamp_min(self.eps ** 2).sqrt()
        pearson_loss = 1 - covariance / denominator

        # Cosine similarity (1 - similarity for minimization)
        cosine_loss = 1 - self.cosine_similarity(y_pred, y_true)

        blended = self.alpha * pearson_loss + (1 - self.alpha) * cosine_loss
        return blended.mean()
class HyperbolicModel(nn.Module):
    """Fully connected net: input -> 128 -> 64 -> output, tanh on hidden layers."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        wide, narrow = 128, 64
        self.fc1 = nn.Linear(input_dim, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Apply the two tanh hidden layers, then the linear output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.tanh(layer(hidden))
        # The output layer has no activation.
        return self.fc3(hidden)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def calculate_metrics(y_true, y_pred):
    """Return MSE, MAE, R2, mean per-column Pearson and mean per-row cosine.

    Both arguments are indexed as 2-D NumPy arrays (``[:, i]``).
    """
    metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }

    # Pearson is computed column by column; a column that is constant on
    # either side has no defined correlation and is scored as 0.
    column_corrs = []
    for col in range(y_true.shape[1]):
        truth, guess = y_true[:, col], y_pred[:, col]
        if np.std(truth) > 0 and np.std(guess) > 0:
            column_corrs.append(np.corrcoef(truth, guess)[0, 1])
        else:
            column_corrs.append(0)
    metrics['Avg Pearson Correlation'] = np.mean(column_corrs)

    # Cosine similarity is computed row by row; 1e-8 guards all-zero rows.
    norm_products = np.linalg.norm(y_true, axis=1) * np.linalg.norm(y_pred, axis=1)
    row_cosines = np.sum(y_true * y_pred, axis=1) / (norm_products + 1e-8)
    metrics['Avg Cosine Similarity'] = np.mean(row_cosines)
    return metrics

# Training routine
def _fit_minibatch(model, X_t, y_t, loss_fn, optimizer, epochs, batch_size, log_every=None):
    """Train ``model`` in place with freshly shuffled mini-batches each epoch.

    When ``log_every`` is set, the sample-weighted mean loss of every
    ``log_every``-th epoch is printed.
    """
    n = X_t.shape[0]
    for epoch in range(epochs):
        model.train()
        order = np.random.permutation(n)
        running_loss = 0.0
        for start in range(0, n, batch_size):
            batch_idx = order[start:start + batch_size]
            optimizer.zero_grad()
            loss = loss_fn(model(X_t[batch_idx]), y_t[batch_idx])
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * len(batch_idx)
        if log_every and (epoch + 1) % log_every == 0:
            # Report the epoch mean, not whichever batch happened to be last.
            print(f"Epoch {epoch+1}/{epochs}: Train Loss = {running_loss / max(n, 1):.4f}")


def train_and_evaluate(X, y, test_ratio=0.2, epochs=100, lr=1e-3, batch_size=32, alpha=0.5):
    """Validate a HyperbolicModel on a hold-out split, then refit on all data.

    A first model is trained on the training part of a fixed-seed split and
    its metrics on both parts are printed. A second, fresh model is then
    trained on all of ``X``/``y`` with the *same* mini-batch schedule, so the
    printed validation metrics describe the training recipe of the returned
    model.

    Args:
        X: 2-D feature array (``X.shape[1]`` features).
        y: 2-D target array (``y.shape[1]`` targets).
        test_ratio: Fraction of rows held out for validation.
        epochs: Number of passes over the data for each of the two fits.
        lr: Adam learning rate.
        batch_size: Mini-batch size.
        alpha: Pearson weight passed to ``PearsonCosineLoss``.

    Returns:
        ``(model, device)``: the model fitted on all data (in eval mode) and
        the torch device it lives on.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def to_tensor(array):
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    def predict_clipped(net, inputs):
        with torch.no_grad():
            raw = net(inputs).cpu().numpy()
        return np.clip(raw, 0.0, 5.0)  # Clip predictions to valid range

    # Split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_ratio, random_state=42)
    X_train_t, y_train_t, X_val_t = to_tensor(X_train), to_tensor(y_train), to_tensor(X_val)

    # Model fitted on the training split only
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    loss_fn = PearsonCosineLoss(alpha=alpha)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    _fit_minibatch(model, X_train_t, y_train_t, loss_fn, optimizer, epochs, batch_size, log_every=20)

    model.eval()
    y_train_pred = predict_clipped(model, X_train_t)
    y_val_pred = predict_clipped(model, X_val_t)

    # Print metrics on split data
    train_metrics = calculate_metrics(y_train, y_train_pred)
    val_metrics = calculate_metrics(y_val, y_val_pred)
    print("\nTrain Metrics:", train_metrics)
    print("Validation Metrics:", val_metrics)

    # Retrain a fresh model on full data with the identical schedule
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    X_full_t, y_full_t = to_tensor(X), to_tensor(y)
    _fit_minibatch(model, X_full_t, y_full_t, loss_fn, optimizer, epochs, batch_size)

    model.eval()
    y_full_pred = predict_clipped(model, X_full_t)
    full_metrics = calculate_metrics(y, y_full_pred)
    print("\nMetrics on Full Data:", full_metrics)
    return model, device
# Run training and print metrics
model, device = train_and_evaluate(
    x_df_lasso.to_numpy(dtype=np.float32),
    y_df.to_numpy(dtype=np.float32),
    test_ratio=0.2, epochs=100, lr=1e-3, batch_size=32, alpha=0.5,
)

# Test features: coerce each column to numeric (unparseable cells become NaN),
# replace NaN with 0, then convert to a float32 array.
X_test_numeric = X_test.apply(pd.to_numeric, errors='coerce')
X_test_filled = X_test_numeric.fillna(0)
X_test_arr = X_test_filled.to_numpy(dtype=np.float32)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_arr).to(device)
    test_preds = model(X_test_tensor).cpu().numpy()
test_preds = np.clip(test_preds, 0.0, 5.0)

# Save submission: 'stimulus' plus one column per name in target_cols,
# taken from the prediction columns in the same order.
test_submission = test_form[['stimulus']].copy()
for col_pos, col_name in enumerate(target_cols):
    test_submission[col_name] = test_preds[:, col_pos]

test_submission.to_csv('hyperbolic_task1_pcalasso.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Epoch 20/100: Train Loss = 0.3192
Epoch 40/100: Train Loss = 0.2657
Epoch 60/100: Train Loss = 0.1955
Epoch 80/100: Train Loss = 0.2055
Epoch 100/100: Train Loss = 0.1724
\nTrain Metrics: {'MSE': 0.04865102097392082, 'MAE': 0.11435043066740036, 'R2': 0.359571635723114, 'Avg Pearson Correlation': np.float64(0.6559933136022059), 'Avg Cosine Similarity': np.float32(0.8611705)}
Validation Metrics: {'MSE': 0.07503879070281982, 'MAE': 0.1535327136516571, 'R2': -0.06507529318332672, 'Avg Pearson Correlation': np.float64(0.33903986808351716), 'Avg Cosine Similarity': np.float32(0.6828246)}
\nMetrics on Full Data: {'MSE': 0.056439291685819626, 'MAE': 0.1415093094110489, 'R2': 0.22781814634799957, 'Avg Pearson Correlation': np.float64(0.5094723148818334), 'Avg Cosine Similarity': np.float32(0.79340154)}
Test predictions saved.
Test submission shape: (31, 52)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Custom metrics ---
def pearson_correlation_score(y_true, y_pred):
    """Return the mean per-column Pearson correlation of two 2-D arrays.

    A column whose correlation is undefined (fewer than two rows, or either
    side constant) contributes 0. Such columns are detected up front, so
    ``np.corrcoef`` is never asked to divide by a zero standard deviation
    and no RuntimeWarning is emitted.

    Args:
        y_true: 2-D array-like of targets, shape (n_samples, n_targets).
        y_pred: 2-D array-like of predictions with the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    corrs = []
    for i in range(y_true.shape[1]):
        true_col, pred_col = y_true[:, i], y_pred[:, i]
        if true_col.size < 2 or np.ptp(true_col) == 0 or np.ptp(pred_col) == 0:
            corrs.append(0.0)
            continue
        corr = np.corrcoef(true_col, pred_col)[0, 1]
        # NaN can still appear if the inputs themselves contain NaN.
        corrs.append(0.0 if np.isnan(corr) else corr)
    return np.mean(corrs)

def cosine_similarity_score(y_true, y_pred):
    """Return the mean cosine similarity between matching rows."""

    def unit(vec):
        # 1e-8 keeps an all-zero row from dividing by zero.
        return vec / (np.linalg.norm(vec) + 1e-8)

    per_row = [
        np.dot(unit(y_true[row]), unit(y_pred[row]))
        for row in range(y_true.shape[0])
    ]
    return np.mean(per_row)

def calculate_metrics(y_true, y_pred):
    """Evaluate every metric on (y_true, y_pred) and return them keyed by label."""
    scorers = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
        ("Avg Pearson Correlation", pearson_correlation_score),
        ("Avg Cosine Similarity", cosine_similarity_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# --------- Example workflow ---------
# Names expected from earlier cells:
#   x_df_lasso : DataFrame of training features
#   y_df       : DataFrame of targets (n_samples, n_targets)
#   X_test     : DataFrame of test features
#   test_form  : DataFrame with a 'stimulus' column
#   target_cols: list of 51 string column names

# 1. Convert to numpy; non-numeric cells become NaN and are then filled with 0
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# Test features get the same coercion as the training features. X_test itself
# stays a DataFrame (no rebinding), so this cell and the cells that call
# X_test.apply(...) can be re-run in any order.
X_test_arr = X_test.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# 2. 80/20 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train MultiOutput RandomForest
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf.fit(X_train, y_train)

# 4. Print metrics on splits (predictions clipped to [0, 5])
y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)
y_train_pred = np.clip(y_train_pred, 0.0, 5.0)
y_val_pred = np.clip(y_val_pred, 0.0, 5.0)
print("Training metrics:")
print(calculate_metrics(y_train, y_train_pred))
print("\nValidation metrics:")
print(calculate_metrics(y_val, y_val_pred))

# 5. Retrain on entire dataset, print metrics
rf_full = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf_full.fit(X, y)
y_full_pred = rf_full.predict(X)
y_full_pred = np.clip(y_full_pred, 0.0, 5.0)
print("\nFull dataset metrics:")
print(calculate_metrics(y, y_full_pred))

# 6. Predict on test set, save file
test_preds = rf_full.predict(X_test_arr)
test_preds = np.clip(test_preds, 0.0, 5.0)
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]
test_submission.to_csv('rf_corr_cosine_metrics_test_pcalasso_task1.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Training metrics:
{'MSE': 0.00961315256578003, 'MAE': 0.056235240379853293, 'R2': 0.8726747980654954, 'Avg Pearson Correlation': np.float64(0.953686032946323), 'Avg Cosine Similarity': np.float64(0.9616390605602336)}

Validation metrics:
{'MSE': 0.06406072680997, 'MAE': 0.15094556960081654, 'R2': 0.047752748179677944, 'Avg Pearson Correlation': np.float64(0.43850742826886024), 'Avg Cosine Similarity': np.float64(0.725503530674159)}

Full dataset metrics:
{'MSE': 0.00885623069946202, 'MAE': 0.054235231075094106, 'R2': 0.881433704821212, 'Avg Pearson Correlation': np.float64(0.9554680404239337), 'Avg Cosine Similarity': np.float64(0.9646680611278172)}
Test predictions saved.
Test submission shape: (31, 52)
