In [None]:
import pandas as pd
# Load the training features, test features, training targets and the
# submission template used by every model cell below.
x_df_lasso= pd.read_csv("x_train_concat_task2.csv")
X_test= pd.read_csv("x_test_concat_task2.csv")
y_df= pd.read_csv("y_train.csv")
test_form= pd.read_csv("TASK2_Test_set_Submission_form.csv")
# Names of the 51 descriptor columns written to each submission file.
# Later cells store prediction column i under target_cols[i].
# NOTE(review): this assumes y_df's columns are in this same order — confirm.



target_cols = [
    'Green', 'Cucumber', 'Herbal', 'Mint', 'Woody', 'Pine', 'Floral',
    'Powdery', 'Fruity', 'Citrus', 'Tropical', 'Berry', 'Peach', 'Sweet',
    'Caramellic', 'Vanilla', 'BrownSpice', 'Smoky', 'Burnt', 'Roasted',
    'Grainy', 'Meaty', 'Nutty', 'Fatty', 'Coconut', 'Waxy', 'Dairy',
    'Buttery', 'Cheesy', 'Sour', 'Fermented', 'Sulfurous', 'Garlic.Onion',
    'Earthy', 'Mushroom', 'Musty', 'Ammonia', 'Fishy', 'Fecal',
    'Rotten.Decay', 'Rubber', 'Phenolic', 'Animal', 'Medicinal',
    'Cooling', 'Sharp', 'Chlorine', 'Alcoholic', 'Plastic', 'Ozone', 'Metallic'
]

In [None]:
# Sanity check: (rows, columns) of the training features, the test features
# and the training targets, in that order.
print(x_df_lasso.shape)

print(X_test.shape)
print(y_df.shape)



(393, 101)
(130, 101)
(393, 51)


# **Hyperbolic model**

In [None]:
import torch
import torch.nn as nn

class PearsonCosineLoss(nn.Module):
    """Blend of (1 - Pearson correlation) and (1 - cosine similarity).

    Both terms are computed per row (along ``dim=1``) and the blended
    per-row loss is averaged over the batch.

    Args:
        alpha: Weight of the Pearson term; the cosine term gets ``1 - alpha``.
        eps: Numerical floor. It is passed to ``nn.CosineSimilarity`` and its
            square bounds the Pearson variance product from below.
    """

    def __init__(self, alpha=0.5, eps=1e-8):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=eps)

    def forward(self, y_pred, y_true):
        """Return the scalar loss for two ``(batch, features)`` tensors."""
        # Pearson correlation (1 - correlation for minimization)
        y_pred_centered = y_pred - y_pred.mean(dim=1, keepdim=True)
        y_true_centered = y_true - y_true.mean(dim=1, keepdim=True)
        numerator = (y_pred_centered * y_true_centered).sum(dim=1)
        variance_product = (
            y_pred_centered.pow(2).sum(dim=1) * y_true_centered.pow(2).sum(dim=1)
        )
        # Clamp BEFORE the square root. The derivative of sqrt is unbounded at
        # zero, so a constant prediction or target row would otherwise produce
        # NaN gradients; adding eps after the sqrt does not prevent that.
        floor = max(self.eps ** 2, torch.finfo(variance_product.dtype).tiny)
        denominator = variance_product.clamp_min(floor).sqrt()
        pearson_loss = 1 - numerator / denominator

        # Cosine similarity (1 - similarity for minimization)
        cosine_loss = 1 - self.cosine_similarity(y_pred, y_true)

        loss = self.alpha * pearson_loss + (1 - self.alpha) * cosine_loss
        return loss.mean()
class HyperbolicModel(nn.Module):
    """Fully connected network: input -> 128 -> 64 -> output.

    The two hidden layers use a tanh activation; the output layer is linear.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        first_width, second_width = 128, 64
        self.fc1 = nn.Linear(input_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, output_dim)``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.tanh(layer(hidden))
        return self.fc3(hidden)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def calculate_metrics(y_true, y_pred):
    """Summarise prediction quality for 2-D arrays of targets and predictions.

    Pearson correlation is computed per column (a column whose true or
    predicted values have zero standard deviation counts as 0). Cosine
    similarity is computed per row. Both are then averaged.

    Returns:
        dict with keys 'MSE', 'MAE', 'R2', 'Avg Pearson Correlation' and
        'Avg Cosine Similarity'.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    column_correlations = []
    for col in range(y_true.shape[1]):
        true_col, pred_col = y_true[:, col], y_pred[:, col]
        if np.std(true_col) > 0 and np.std(pred_col) > 0:
            column_correlations.append(np.corrcoef(true_col, pred_col)[0, 1])
        else:
            column_correlations.append(0)
    avg_pearson = np.mean(column_correlations)

    # Row-wise cosine similarity; the small constant guards all-zero rows.
    row_dots = np.sum(y_true * y_pred, axis=1)
    row_norms = np.linalg.norm(y_true, axis=1) * np.linalg.norm(y_pred, axis=1)
    avg_cosine = np.mean(row_dots / (row_norms + 1e-8))

    return {
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'Avg Pearson Correlation': avg_pearson,
        'Avg Cosine Similarity': avg_cosine
    }

# Training routine
def _fit_minibatch(model, loss_fn, optimizer, X_t, y_t, epochs, batch_size, log_every=None):
    """Train ``model`` in place with shuffled mini-batches.

    Args:
        model: Module to optimise.
        loss_fn: Callable ``loss_fn(y_pred, y_true)`` returning a scalar tensor.
        optimizer: Optimiser already bound to ``model``'s parameters.
        X_t, y_t: Feature / target tensors with matching first dimension.
        epochs: Number of passes over the data.
        batch_size: Maximum number of rows per optimisation step.
        log_every: If set, print the sample-weighted mean loss every
            ``log_every`` epochs.
    """
    n = X_t.shape[0]
    steps_per_epoch = int(np.ceil(n / batch_size))
    model.train()
    for epoch in range(epochs):
        order = np.random.permutation(n)
        epoch_loss = 0.0
        for step in range(steps_per_epoch):
            batch_idx = order[step * batch_size:(step + 1) * batch_size]
            optimizer.zero_grad()
            loss = loss_fn(model(X_t[batch_idx]), y_t[batch_idx])
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * len(batch_idx)
        if log_every and (epoch + 1) % log_every == 0:
            # Report the mean over the whole epoch, not just the last batch.
            print(f"Epoch {epoch+1}/{epochs}: Train Loss = {epoch_loss / max(n, 1):.4f}")


def train_and_evaluate(X, y, test_ratio=0.2, epochs=100, lr=1e-3, batch_size=32, alpha=0.5):
    """Validate a HyperbolicModel on a hold-out split, then refit on all data.

    Steps:
      1. Split ``X``/``y`` (``random_state=42``), train on the training part
         and print metrics for the training and validation parts.
      2. Build a fresh model and train it on the full data with the SAME
         mini-batch procedure, so the validation metrics describe the
         training recipe of the model that is returned.

    Args:
        X: 2-D array of features.
        y: 2-D array of targets.
        test_ratio: Fraction of rows held out for validation.
        epochs: Training epochs for each of the two fits.
        lr: Adam learning rate.
        batch_size: Mini-batch size.
        alpha: Pearson weight passed to ``PearsonCosineLoss``.

    Returns:
        ``(model, device)``: the model fitted on the full data, in eval mode,
        and the torch device it lives on.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def to_tensor(array):
        return torch.as_tensor(array, dtype=torch.float32).to(device)

    # Split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_ratio, random_state=42)
    X_train_t, y_train_t = to_tensor(X_train), to_tensor(y_train)
    X_val_t = to_tensor(X_val)

    loss_fn = PearsonCosineLoss(alpha=alpha)

    # Fit on the training split
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    _fit_minibatch(model, loss_fn, optimizer, X_train_t, y_train_t, epochs, batch_size, log_every=20)

    model.eval()
    with torch.no_grad():
        y_train_pred = model(X_train_t).cpu().numpy()
        y_val_pred = model(X_val_t).cpu().numpy()

    # Print metrics on split data
    train_metrics = calculate_metrics(y_train, y_train_pred)
    val_metrics = calculate_metrics(y_val, y_val_pred)
    print("\nTrain Metrics:", train_metrics)
    print("Validation Metrics:", val_metrics)

    # Retrain a fresh model on the full data with the same procedure
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    X_full_t, y_full_t = to_tensor(X), to_tensor(y)
    _fit_minibatch(model, loss_fn, optimizer, X_full_t, y_full_t, epochs, batch_size)

    model.eval()
    with torch.no_grad():
        y_full_pred = model(X_full_t).cpu().numpy()
    full_metrics = calculate_metrics(y, y_full_pred)
    print("\nMetrics on Full Data:", full_metrics)
    return model, device
# Run training and print metrics
# Train on the raw training frames converted to float32 arrays.
# NOTE(review): unlike the test features below, the training frames are not
# passed through pd.to_numeric/fillna here — confirm they contain no
# missing or non-numeric values.
model, device = train_and_evaluate(
    x_df_lasso.to_numpy(dtype=np.float32),
    y_df.to_numpy(dtype=np.float32),
    test_ratio=0.2,
    epochs=100,
    lr=1e-3,
    batch_size=32,
    alpha=0.5
)

# Convert all columns to numeric (with NaNs where conversion failed)
X_test_numeric = X_test.apply(pd.to_numeric, errors='coerce')

# Fill NaN values (e.g., with 0 or other imputation strategy)
X_test_filled = X_test_numeric.fillna(0)

# Convert to numpy array of floats
X_test_arr = X_test_filled.to_numpy(dtype=np.float32)
# Inference only: no gradients are needed for the test predictions.
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_arr).to(device)
    test_preds = model(X_test_tensor).cpu().numpy()

# Save submission: keep the 'stimulus' column of the template and write
# prediction column i under target_cols[i].
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]

test_submission.to_csv('hyperbolic_task2_pca.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Epoch 20/100: Train Loss = 0.2170
Epoch 40/100: Train Loss = 0.2034
Epoch 60/100: Train Loss = 0.1382
Epoch 80/100: Train Loss = 0.1228
Epoch 100/100: Train Loss = 0.1107
\nTrain Metrics: {'MSE': 0.0583437941968441, 'MAE': 0.13845588266849518, 'R2': 0.428681880235672, 'Avg Pearson Correlation': np.float64(0.7757019969660994), 'Avg Cosine Similarity': np.float32(0.91562706)}
Validation Metrics: {'MSE': 0.08716513216495514, 'MAE': 0.1750948280096054, 'R2': 0.14064952731132507, 'Avg Pearson Correlation': np.float64(0.49657111434224094), 'Avg Cosine Similarity': np.float32(0.7389037)}
\nMetrics on Full Data: {'MSE': 0.07775706797838211, 'MAE': 0.16313889622688293, 'R2': 0.2487296462059021, 'Avg Pearson Correlation': np.float64(0.5662829883781576), 'Avg Cosine Similarity': np.float32(0.82520145)}
Test predictions saved.
Test submission shape: (130, 52)


# **Correlation regressor**

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


# Pearson correlation loss (maximize correlation by minimizing negative correlation)
def pearson_correlation_loss(y_true, y_pred):
    """Negative mean row-wise Pearson correlation between targets and predictions.

    Each row is centred along axis 1, the correlation is computed per row
    (1e-8 is added to the denominator), and the negated batch mean is returned.
    """
    true_dev = y_true - tf.reduce_mean(y_true, axis=1, keepdims=True)
    pred_dev = y_pred - tf.reduce_mean(y_pred, axis=1, keepdims=True)

    covariance = tf.reduce_sum(true_dev * pred_dev, axis=1)
    true_norm = tf.sqrt(tf.reduce_sum(tf.square(true_dev), axis=1))
    pred_norm = tf.sqrt(tf.reduce_sum(tf.square(pred_dev), axis=1))

    row_correlation = covariance / (true_norm * pred_norm + 1e-8)
    return -tf.reduce_mean(row_correlation)


def create_olfactory_model(input_dim, output_dim):
    """Build and compile the dense regression network.

    Architecture: input -> Dense(128, relu) -> Dense(64, relu) ->
    Dense(output_dim, linear). Compiled with the Adam optimiser,
    ``pearson_correlation_loss`` as the loss and MSE as a tracked metric.

    Args:
        input_dim: Number of input features.
        output_dim: Number of regression targets.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input with an explicit Input object. Passing
        # `input_shape=` to the first Dense layer is what triggers the Keras
        # warning visible in this cell's captured output.
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(output_dim, activation='linear')
    ])
    model.compile(optimizer='adam', loss=pearson_correlation_loss, metrics=['mse'])
    return model


class CorrelationRegressor(BaseEstimator, RegressorMixin):
    """Scikit-learn style wrapper around the Keras model from ``create_olfactory_model``.

    Args:
        input_dim: Number of input features; inferred from ``X`` in ``fit``
            when left as ``None``.
        output_dim: Number of targets; inferred from ``y`` in ``fit`` when
            left as ``None``.
        epochs: Training epochs passed to Keras ``fit``.
        batch_size: Batch size passed to Keras ``fit``.
        verbose: Verbosity passed to Keras ``fit``.
    """

    def __init__(self, input_dim=None, output_dim=None, epochs=100, batch_size=32, verbose=0):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.model_ = None

    def fit(self, X, y):
        """Build a fresh model and train it on ``X``/``y``; returns ``self``."""
        # Resolve the dimensions into locals instead of overwriting the
        # constructor parameters. That way a later refit on differently
        # shaped data re-infers them rather than reusing stale values.
        input_dim = self.input_dim if self.input_dim is not None else X.shape[1]
        if self.output_dim is not None:
            output_dim = self.output_dim
        else:
            output_dim = y.shape[1] if len(y.shape) > 1 else 1

        self.model_ = create_olfactory_model(input_dim, output_dim)
        self.model_.fit(
            X, y,
            epochs=self.epochs,
            batch_size=self.batch_size,
            verbose=self.verbose
        )
        return self

    def predict(self, X):
        """Return the fitted model's predictions for ``X``.

        Raises:
            ValueError: If ``fit`` has not been called yet.
        """
        if self.model_ is None:
            raise ValueError("Model not fitted yet")
        return self.model_.predict(X)


def pearson_correlation_score(y_true, y_pred):
    """Mean Pearson correlation over the columns of two 2-D arrays.

    A column whose correlation is NaN contributes 0 to the mean.
    """
    n_targets = y_true.shape[1]
    raw = [np.corrcoef(y_true[:, j], y_pred[:, j])[0, 1] for j in range(n_targets)]
    return np.mean([0 if np.isnan(r) else r for r in raw])


# scikit-learn scorer built from pearson_correlation_score (higher is better).
# It is not referenced elsewhere in the visible part of this notebook.
correlation_scorer = make_scorer(pearson_correlation_score, greater_is_better=True)


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_metrics(y_true, y_pred):
    """Compute MSE, MAE, R2 plus column-averaged Pearson and cosine similarity.

    Both the Pearson correlation and the cosine similarity are computed per
    column and then averaged. A column scores 0 for Pearson when either side
    has zero standard deviation, and 0 for cosine when either side has zero
    norm.

    NOTE(review): the "Hyperbolic model" cell's calculate_metrics computes
    cosine similarity per row, so 'Avg Cosine Similarity' values from the two
    cells are not directly comparable.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    def _column_pearson(true_col, pred_col):
        if np.std(true_col) > 0 and np.std(pred_col) > 0:
            return np.corrcoef(true_col, pred_col)[0, 1]
        return 0

    def _column_cosine(true_col, pred_col):
        if np.linalg.norm(true_col) > 0 and np.linalg.norm(pred_col) > 0:
            return cosine_similarity(
                true_col.reshape(1, -1), pred_col.reshape(1, -1))[0, 0]
        return 0

    pearson_per_column = []
    cosine_per_column = []
    for j in range(y_true.shape[1]):
        true_col, pred_col = y_true[:, j], y_pred[:, j]
        pearson_per_column.append(_column_pearson(true_col, pred_col))
        cosine_per_column.append(_column_cosine(true_col, pred_col))

    return {
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'Avg Pearson Correlation': np.mean(pearson_per_column),
        'Avg Cosine Similarity': np.mean(cosine_per_column)
    }



# Convert DataFrames to numpy arrays
# (values that cannot be parsed as numbers become NaN and are then set to 0)
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)


# 1. Split data 80% train, 20% validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42)

# 2. Initialize regressor
regressor = CorrelationRegressor(
    input_dim=X_train_split.shape[1],
    output_dim=y_train_split.shape[1],
    epochs=100,
    batch_size=32,
    verbose=1
)

# 3. Train on 80% split
regressor.fit(X_train_split, y_train_split)

# 4. Evaluate on train split and validation split
y_train_pred = regressor.predict(X_train_split)
y_val_pred = regressor.predict(X_val)

print("Training set metrics:")
print(calculate_metrics(y_train_split, y_train_pred))
print("\nValidation set metrics:")
print(calculate_metrics(y_val, y_val_pred))


# 5. Train on full dataset (train + val)
# A separate estimator is used, so the model evaluated above is left untouched.
regressor_full = CorrelationRegressor(
    input_dim=X.shape[1],
    output_dim=y.shape[1],
    epochs=100,
    batch_size=32,
    verbose=1
)
regressor_full.fit(X, y)

# 6. Evaluate on entire training dataset
# (in-sample: these are the same rows the model was just fitted on)
y_full_pred = regressor_full.predict(X)
print("\nFull training dataset metrics:")
print(calculate_metrics(y, y_full_pred))


# 7. Predict on test set
# NOTE(review): the test features are not coerced with pd.to_numeric/fillna
# the way the training features are above — confirm this is intended.
X_test_arr = X_test.to_numpy() if isinstance(X_test, pd.DataFrame) else X_test
predictions = regressor_full.predict(X_test_arr)

# 8. Save submission file
# Prediction column i is written under target_cols[i].
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = predictions[:, i]

test_submission.to_csv('corr_test_task2_combined.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 55ms/step - loss: -0.0712 - mse: 29.1187
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: -0.3211 - mse: 44.4138
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: -0.3735 - mse: 63.5932
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: -0.4284 - mse: 70.6566 
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: -0.4387 - mse: 67.9878 
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: -0.4683 - mse: 64.0970 
Epoch 7/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: -0.4949 - mse: 52.4813 
Epoch 8/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: -0.4823 - mse: 47.0436 
Epoch 9/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: -0.0131 - mse: 40.2363
Epoch 2/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: -0.2895 - mse: 67.7059 
Epoch 3/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: -0.3774 - mse: 91.5868 
Epoch 4/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: -0.4427 - mse: 92.4694 
Epoch 5/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: -0.4641 - mse: 86.1756 
Epoch 6/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: -0.4649 - mse: 78.7181 
Epoch 7/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: -0.4894 - mse: 63.3202 
Epoch 8/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: -0.4834 - mse: 49.9786 
Epoch 9/100
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

# **RF1**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd

# Custom Pearson correlation scorer
def pearson_correlation_score(y_true, y_pred):
    """Average the per-column Pearson correlation of two 2-D arrays.

    Columns whose correlation evaluates to NaN are scored as 0.
    """
    n_columns = y_true.shape[1]
    scores = np.zeros(n_columns)
    for col in range(n_columns):
        r = np.corrcoef(y_true[:, col], y_pred[:, col])[0, 1]
        if not np.isnan(r):
            scores[col] = r
    return np.mean(scores)

# scikit-learn scorer built from pearson_correlation_score (higher is better).
# It is not referenced elsewhere in the visible part of this notebook.
correlation_scorer = make_scorer(pearson_correlation_score, greater_is_better=True)

# Evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_metrics(y_true, y_pred):
    """Return MSE, MAE, R2 and column-averaged Pearson / cosine similarity.

    Pearson and cosine are each evaluated one column at a time. A column
    counts as 0 for Pearson if either side has zero standard deviation, and
    as 0 for cosine if either side has zero norm.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    pearson_values, cosine_values = [], []
    for idx in range(y_true.shape[1]):
        truth = y_true[:, idx]
        guess = y_pred[:, idx]

        # Pearson correlation (undefined for a constant column -> 0)
        has_spread = np.std(truth) > 0 and np.std(guess) > 0
        pearson_values.append(np.corrcoef(truth, guess)[0, 1] if has_spread else 0)

        # Cosine similarity (undefined for an all-zero column -> 0)
        has_length = np.linalg.norm(truth) > 0 and np.linalg.norm(guess) > 0
        if has_length:
            similarity = cosine_similarity(
                truth.reshape(1, -1), guess.reshape(1, -1))[0, 0]
        else:
            similarity = 0
        cosine_values.append(similarity)

    metrics = {'MSE': mse, 'MAE': mae, 'R2': r2}
    metrics['Avg Pearson Correlation'] = np.mean(pearson_values)
    metrics['Avg Cosine Similarity'] = np.mean(cosine_values)
    return metrics


# Prepare data: coerce every column to numeric; unparseable/missing values become 0
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# Train-validation split
X_train_split, X_val, y_train_split, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train model (one random forest per target column)
base_model = RandomForestRegressor(n_estimators=100, random_state=42)
model = MultiOutputRegressor(base_model)
model.fit(X_train_split, y_train_split)

# Predictions
y_train_pred = model.predict(X_train_split)
y_val_pred = model.predict(X_val)

# Evaluation
print("Training set metrics:")
print(calculate_metrics(y_train_split, y_train_pred))

print("\nValidation set metrics:")
print(calculate_metrics(y_val, y_val_pred))

# Train on full data; the metrics below are in-sample
final_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
final_model.fit(X, y)
y_pred=final_model.predict(X)
print("\nFull training dataset metrics:")
print(calculate_metrics(y, y_pred))
# Predict on test set
X_test_arr = X_test.to_numpy() if isinstance(X_test, pd.DataFrame) else X_test
predictions = final_model.predict(X_test_arr)

# Prepare submission: prediction column i is written under target_cols[i]
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = predictions[:, i]

# Keep the file name in one place so the message always names the file
# that was actually written.
output_file = 'rf_test_task2_combined.csv'
test_submission.to_csv(output_file, index=False)
print(f"Test predictions saved to {output_file}")
print(f"Test submission shape: {test_submission.shape}")


Training set metrics:
{'MSE': 0.010518383492117523, 'MAE': 0.06420235958896899, 'R2': 0.8921443693959343, 'Avg Pearson Correlation': np.float64(0.9673701840709672), 'Avg Cosine Similarity': np.float64(0.9712884166031108)}

Validation set metrics:
{'MSE': 0.06542474246573686, 'MAE': 0.16211139329666316, 'R2': 0.2592146740249138, 'Avg Pearson Correlation': np.float64(0.5446494684539364), 'Avg Cosine Similarity': np.float64(0.7452429569882943)}
{'MSE': 0.010049656856470149, 'MAE': 0.06250374935680934, 'R2': 0.8954517864510795, 'Avg Pearson Correlation': np.float64(0.9659844905167095), 'Avg Cosine Similarity': np.float64(0.9717169381991124)}
Test predictions saved to rf_test_lasso.csv
Test submission shape: (130, 52)


# **RF2**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Custom metrics ---
def pearson_correlation_score(y_true, y_pred):
    """Mean of the column-wise Pearson correlations; NaN correlations count as 0."""
    def _safe_corr(col):
        r = np.corrcoef(y_true[:, col], y_pred[:, col])[0, 1]
        return 0 if np.isnan(r) else r

    return np.mean([_safe_corr(col) for col in range(y_true.shape[1])])

def cosine_similarity_score(y_true, y_pred):
    """Mean row-wise cosine similarity between two 2-D arrays.

    Each row is divided by (its Euclidean norm + 1e-8) before the dot product,
    so an all-zero row yields a similarity of 0 rather than a division error.
    """
    def _unit(vec):
        return vec / (np.linalg.norm(vec) + 1e-8)

    per_row = [
        np.dot(_unit(y_true[row]), _unit(y_pred[row]))
        for row in range(y_true.shape[0])
    ]
    return np.mean(per_row)

def calculate_metrics(y_true, y_pred):
    """Bundle MSE, MAE, R2, mean column-wise Pearson and mean row-wise cosine into a dict."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    avg_pearson = pearson_correlation_score(y_true, y_pred)
    avg_cosine = cosine_similarity_score(y_true, y_pred)

    metrics = {}
    metrics["MSE"] = mse
    metrics["MAE"] = mae
    metrics["R2"] = r2
    metrics["Avg Pearson Correlation"] = avg_pearson
    metrics["Avg Cosine Similarity"] = avg_cosine
    return metrics

# --------- Workflow ---------
# Names this cell expects from earlier cells:
# x_df_lasso, y_df: DataFrames of training features and targets
# X_test: test features (a DataFrame as loaded, or an array if this cell already ran)
# test_form: DataFrame with a 'stimulus' column (presumably one row per test sample — verify)
# target_cols: list of 51 string column names

# 1. Convert to numpy if needed
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# X_test is rebound to an array, as before, so later cells see the same type.
# The conversion is now idempotent: previously a second run of this cell
# failed because an ndarray has no `.to_numpy`.
X_test = (
    X_test.to_numpy(dtype=np.float32)
    if isinstance(X_test, pd.DataFrame)
    else np.asarray(X_test, dtype=np.float32)
)

# 2. 80/20 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train MultiOutput RandomForest
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf.fit(X_train, y_train)

# 4. Print metrics on splits
y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)
print("Training metrics:")
print(calculate_metrics(y_train, y_train_pred))
print("\nValidation metrics:")
print(calculate_metrics(y_val, y_val_pred))

# 5. Retrain on entire dataset, print (in-sample) metrics
rf_full = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf_full.fit(X, y)
y_full_pred = rf_full.predict(X)
print("\nFull dataset metrics:")
print(calculate_metrics(y, y_full_pred))

# 6. Predict on test set, save file (prediction column i goes under target_cols[i])
test_preds = rf_full.predict(X_test)
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]
test_submission.to_csv('rf_corr_cosine_metrics_test_task2_combined.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Training metrics:
{'MSE': 0.010518383492117523, 'MAE': 0.06420235958896899, 'R2': 0.8921443693959343, 'Avg Pearson Correlation': np.float64(0.9673701840709672), 'Avg Cosine Similarity': np.float64(0.9756392310670461)}

Validation metrics:
{'MSE': 0.06542474246573686, 'MAE': 0.16211139329666316, 'R2': 0.2592146740249138, 'Avg Pearson Correlation': np.float64(0.5446494684539364), 'Avg Cosine Similarity': np.float64(0.8035231979903125)}

Full dataset metrics:
{'MSE': 0.010049656856470149, 'MAE': 0.06250374935680934, 'R2': 0.8954517864510795, 'Avg Pearson Correlation': np.float64(0.9659844905167095), 'Avg Cosine Similarity': np.float64(0.9765042337977773)}
Test predictions saved.
Test submission shape: (130, 52)


# **XG1**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb

# --------------- Metrics ---------------

def pearson_correlation_score(y_true, y_pred):
    """Column-wise Pearson correlation averaged over all columns (NaN -> 0)."""
    columns = range(y_true.shape[1])
    raw_values = [np.corrcoef(y_true[:, c], y_pred[:, c])[0, 1] for c in columns]
    cleaned = [value if not np.isnan(value) else 0 for value in raw_values]
    return np.mean(cleaned)

# scikit-learn scorer built from pearson_correlation_score (higher is better).
# It is not referenced elsewhere in the visible part of this notebook.
correlation_scorer = make_scorer(pearson_correlation_score, greater_is_better=True)

def calculate_metrics(y_true, y_pred):
    """Return MSE, MAE, R2 and the mean column-wise Pearson correlation as a dict."""
    report = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }
    report['Avg Pearson Correlation'] = pearson_correlation_score(y_true, y_pred)
    return report

# --------------- Custom Objectives for XGBoost ---------------
# They receive (preds, dtrain) and must return (grad, hess)

def pearson_correlation_obj(preds, dtrain):
    """Custom objective: gradient of the negative Pearson correlation.

    Args:
        preds: Current predictions; reshaped to the shape of the labels.
        dtrain: Object exposing ``get_label()`` that returns the targets.

    Returns:
        ``(grad, hess)``. ``grad`` is the derivative of minus the correlation
        with respect to each prediction; ``hess`` is a constant 1e-4
        placeholder, not a true second derivative.

    NOTE(review): with such a small constant hessian, XGBoost's
    ``min_child_weight`` constraint may block splits — confirm before use.
    """
    targets = dtrain.get_label()
    preds = preds.reshape(targets.shape)

    pred_dev = preds - np.mean(preds)
    target_dev = targets - np.mean(targets)

    covariance = np.sum(pred_dev * target_dev)
    # Sums of squares, offset by a small constant to avoid division by zero.
    pred_ss = np.sum(pred_dev ** 2) + 1e-8
    target_ss = np.sum(target_dev ** 2) + 1e-8

    # d(-r)/d(pred) = cov * pred_dev / (pred_ss^1.5 * sqrt(target_ss))
    #                 - target_dev / (sqrt(pred_ss) * sqrt(target_ss))
    towards_targets = target_dev / (np.sqrt(pred_ss) * np.sqrt(target_ss))
    along_preds = (covariance * pred_dev) / (pred_ss ** 1.5 * np.sqrt(target_ss))
    grad = along_preds - towards_targets

    hess = np.ones_like(grad) * 1e-4
    return grad, hess

def cosine_similarity_obj(preds, dtrain):
    """Custom objective: gradient of the negative cosine similarity.

    Args:
        preds: Current predictions; reshaped to the shape of the labels.
        dtrain: Object exposing ``get_label()`` that returns the targets.

    Returns:
        ``(grad, hess)``. ``grad`` is the derivative of minus the cosine
        similarity with respect to each prediction; ``hess`` is a constant
        1e-4 placeholder.
    """
    targets = dtrain.get_label()
    preds = preds.reshape(targets.shape)

    # Norms are offset by a small constant to avoid division by zero.
    pred_norm = np.linalg.norm(preds) + 1e-8
    target_norm = np.linalg.norm(targets) + 1e-8
    norm_product = pred_norm * target_norm

    similarity = np.dot(preds, targets) / norm_product
    towards_targets = targets / norm_product
    along_preds = (similarity * preds) / (pred_norm ** 2)
    grad = along_preds - towards_targets

    hess = np.ones_like(grad) * 1e-4
    return grad, hess

# --------------- Model Wrappers ---------------

class XGBoostCorrelationRegressor:
    """XGBoost regressor for single output with Pearson correlation loss."""

    def __init__(self, **kwargs):
        # XGBRegressor calls a custom objective as (y_true, y_pred), whereas
        # pearson_correlation_obj uses the native (preds, dtrain) convention.
        # Passing it unadapted would fail on `dtrain.get_label()`.
        self.model = xgb.XGBRegressor(
            objective=self._as_sklearn_objective(pearson_correlation_obj), **kwargs)

    @staticmethod
    def _as_sklearn_objective(native_obj):
        """Adapt a native ``(preds, dtrain)`` objective to ``(y_true, y_pred)``."""
        from types import SimpleNamespace

        def objective(y_true, y_pred):
            # Minimal stand-in exposing only the get_label() accessor the
            # native objective reads.
            label_view = SimpleNamespace(get_label=lambda: y_true)
            return native_obj(y_pred, label_view)

        return objective

    def fit(self, X, y):
        """Fit on ``X`` with ``y`` flattened to 1-D; returns ``self``."""
        # y must be 1D for single output
        y = y.reshape(-1)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return predictions as a column vector of shape ``(n_samples, 1)``."""
        return self.model.predict(X).reshape(-1, 1)

class XGBoostCosineSimilarityRegressor:
    """XGBoost regressor for single output with Cosine similarity loss."""

    def __init__(self, **kwargs):
        # XGBRegressor calls a custom objective as (y_true, y_pred), whereas
        # cosine_similarity_obj uses the native (preds, dtrain) convention.
        # Passing it unadapted would fail on `dtrain.get_label()`.
        self.model = xgb.XGBRegressor(
            objective=self._as_sklearn_objective(cosine_similarity_obj), **kwargs)

    @staticmethod
    def _as_sklearn_objective(native_obj):
        """Adapt a native ``(preds, dtrain)`` objective to ``(y_true, y_pred)``."""
        from types import SimpleNamespace

        def objective(y_true, y_pred):
            # Minimal stand-in exposing only the get_label() accessor the
            # native objective reads.
            label_view = SimpleNamespace(get_label=lambda: y_true)
            return native_obj(y_pred, label_view)

        return objective

    def fit(self, X, y):
        """Fit on ``X`` with ``y`` flattened to 1-D; returns ``self``."""
        y = y.reshape(-1)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return predictions as a column vector of shape ``(n_samples, 1)``."""
        return self.model.predict(X).reshape(-1, 1)

class MultiOutputXGBoostRegressor:
    """Multi-target regressor: a squared-error XGBRegressor wrapped in MultiOutputRegressor.

    Extra keyword arguments are forwarded to ``xgb.XGBRegressor``.
    """

    def __init__(self, **kwargs):
        self.model = MultiOutputRegressor(
            xgb.XGBRegressor(objective='reg:squarederror', **kwargs)
        )

    def fit(self, X, y):
        """Fit the wrapped estimator and return ``self`` to allow chaining."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped estimator."""
        predictions = self.model.predict(X)
        return predictions

# --------------- Training & evaluation function ---------------

def train_evaluate_save(model, X, y, X_test, test_form, target_cols, model_name="model"):
    """Validate ``model`` on an 80/20 split, refit on all data and write test predictions.

    The same ``model`` object is fitted twice: first on the training part of
    the split (train/validation metrics are printed), then on the full data
    (in-sample metrics are printed). Test predictions are written to
    ``'{model_name}_test_task2_combined.csv'`` next to the ``stimulus``
    column of ``test_form``.

    If the prediction array has only one column, that column is copied under
    every name in ``target_cols``.
    """
    print(f"--- Training and evaluating {model_name} ---")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit on the training part, then score both parts of the split.
    model.fit(X_train, y_train)
    split_predictions = [
        ("Train Metrics:", y_train, model.predict(X_train)),
        ("Validation Metrics:", y_val, model.predict(X_val)),
    ]
    for label, targets, predicted in split_predictions:
        print(label, calculate_metrics(targets, predicted))

    # Refit the same estimator on everything and report in-sample metrics.
    model.fit(X, y)
    print("Full Data Metrics:", calculate_metrics(y, model.predict(X)))

    test_preds = model.predict(X_test)

    # Assemble the submission frame column by column.
    test_submission = test_form[['stimulus']].copy()
    for position, descriptor in enumerate(target_cols):
        source_column = position if test_preds.shape[1] > 1 else 0
        test_submission[descriptor] = test_preds[:, source_column]

    output_file = f'{model_name}_test_task2_combined.csv'
    test_submission.to_csv(output_file, index=False)
    print(f"Test predictions saved to {output_file}")
    print(f"Test submission shape: {test_submission.shape}\n")

# --------------- Usage ---------------
# Names this cell relies on:

# X: numpy array of shape (n_samples, n_features)
# y: numpy array of shape (n_samples, 51) - targets
# X_test: test features defined by an earlier cell (not rebuilt here)
# test_form: pandas DataFrame with 'stimulus' column for test samples
# target_cols: list of 51 odor descriptor column names (assumed to match y's column order — TODO confirm)

# Build the training arrays, then train, evaluate and save the MSE baseline.
#
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# Despite the variable name, this model uses the standard squared-error
# objective; the custom Pearson/cosine objectives above are not used here.
model_pearson = MultiOutputXGBoostRegressor()  # Multitarget with standard MSE
train_evaluate_save(model_pearson, X, y, X_test, test_form, target_cols, model_name="MultiOutputXGBoost_MSE")



--- Training and evaluating MultiOutputXGBoost_MSE ---
Train Metrics: {'MSE': 0.0005327186081558466, 'MAE': 0.0024665924720466137, 'R2': 0.9956809878349304, 'Avg Pearson Correlation': np.float64(0.9978358783317698)}
Validation Metrics: {'MSE': 0.07560475170612335, 'MAE': 0.1649562120437622, 'R2': 0.17050018906593323, 'Avg Pearson Correlation': np.float64(0.5156266204771057)}
Full Data Metrics: {'MSE': 0.0006901447777636349, 'MAE': 0.003441475797444582, 'R2': 0.9933826923370361, 'Avg Pearson Correlation': np.float64(0.9966812609679261)}
Test predictions saved to MultiOutputXGBoost_MSE_test_task2_combined.csv
Test submission shape: (130, 52)



# **XG2**

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity

# ------- Custom metrics -------

def pearson_correlation_score(y_true, y_pred):
    """Mean Pearson correlation across columns, scoring NaN correlations as 0."""
    scores = []
    for idx in range(y_true.shape[1]):
        r = np.corrcoef(y_true[:, idx], y_pred[:, idx])[0, 1]
        if np.isnan(r):
            scores.append(0)
        else:
            scores.append(r)
    return np.mean(scores)

def cosine_similarity_score(y_true, y_pred):
    """Mean column-wise cosine similarity between two 2-D arrays.

    A column scores 0 when either the true or the predicted column has zero norm.
    """
    def _column_cosine(idx):
        truth, guess = y_true[:, idx], y_pred[:, idx]
        if np.linalg.norm(truth) > 0 and np.linalg.norm(guess) > 0:
            return cosine_similarity(truth.reshape(1, -1), guess.reshape(1, -1))[0, 0]
        return 0

    return np.mean([_column_cosine(idx) for idx in range(y_true.shape[1])])

def calculate_metrics(y_true, y_pred):
    """Collect MSE, MAE, R2 and the averaged Pearson / cosine scores in a dict."""
    metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }
    metrics['Avg Pearson Correlation'] = pearson_correlation_score(y_true, y_pred)
    metrics['Avg Cosine Similarity'] = cosine_similarity_score(y_true, y_pred)
    return metrics

# ------- Custom objectives for XGBoost -------

def pearson_correlation_obj(preds, dtrain):
    """Custom objective: quotient-rule gradient of the negative Pearson correlation.

    Args:
        preds: Current predictions; reshaped to the shape of the labels.
        dtrain: Object exposing ``get_label()`` that returns the targets.

    Returns:
        ``(gradient, hessian)``; the hessian is a constant 0.1 placeholder.
    """
    targets = dtrain.get_label()
    preds = preds.reshape(targets.shape)

    target_dev = targets - np.mean(targets)
    pred_dev = preds - np.mean(preds)

    # correlation = top / bottom
    top = np.sum(target_dev * pred_dev)
    target_scale = np.sqrt(np.sum(target_dev**2)) + 1e-8
    pred_scale = np.sqrt(np.sum(pred_dev**2)) + 1e-8
    bottom = target_scale * pred_scale

    # Derivatives of top and bottom with respect to each prediction.
    d_top = target_dev
    d_bottom = target_scale * (pred_dev / pred_scale)

    gradient = -(d_top * bottom - top * d_bottom) / (bottom ** 2)
    hessian = np.ones_like(gradient) * 0.1
    return gradient, hessian

def cosine_similarity_obj(preds, dtrain):
    """Custom objective: gradient of the negative cosine similarity.

    With ``t_hat = labels / |labels|`` and ``p_hat = preds / |preds|`` the
    similarity is ``cos = sum(t_hat * p_hat)`` and its derivative with respect
    to the predictions is ``(t_hat - cos * p_hat) / |preds|``. The previous
    implementation returned only ``-t_hat / |preds|``, dropping the
    ``cos * p_hat`` term that comes from differentiating the prediction norm,
    so the gradient never vanished even when predictions were already
    perfectly aligned with the labels.

    Parameters
    ----------
    preds : np.ndarray
        Current predictions; reshaped to the shape of the labels.
    dtrain : object exposing ``get_label()``
        Source of the target values.

    Returns
    -------
    (gradient, hessian) : tuple of np.ndarray
        ``gradient`` is d(-cos)/d(pred) per element. ``hessian`` is a constant
        0.1 for every element rather than the true second derivative.
    """
    labels = dtrain.get_label()
    preds = preds.reshape(labels.shape)
    # Epsilon keeps the divisions finite when a vector is all zeros.
    pred_norm = np.linalg.norm(preds) + 1e-8
    label_norm = np.linalg.norm(labels) + 1e-8
    y_true_normalized = labels / label_norm
    y_pred_normalized = preds / pred_norm

    cos_sim = np.sum(y_true_normalized * y_pred_normalized)

    # Negated because the booster minimizes while we want to maximize cos.
    gradient = -(y_true_normalized - cos_sim * y_pred_normalized) / pred_norm
    hessian = np.ones_like(gradient) * 0.1
    return gradient, hessian

# ------- Training function per model -------

def train_custom_xgb_model(X, y, X_val, y_val, X_test, test_form, target_cols, custom_obj, model_name):
    """Train one XGBoost booster per target with a custom objective and save test predictions.

    For every target column the function:
      1. trains on (X, y[:, i]) with early stopping monitored on (X_val, y_val[:, i]);
      2. predicts the validation split using only the trees up to the best iteration;
      3. refits on the training and validation rows combined for that same
         number of rounds and predicts the test set.

    Parameters
    ----------
    X, X_val : 2-D array-like
        Training / validation features.
    y, y_val : np.ndarray of shape (n_samples, n_targets)
        Training / validation targets; column ``i`` is paired with ``target_cols[i]``.
    X_test : 2-D array-like
        Test features; must expose ``.shape``.
    test_form : pd.DataFrame
        Submission template; its 'stimulus' column is copied into the output.
    target_cols : sequence of str
        Names for the output columns, in the same order as the columns of ``y``.
    custom_obj : callable(preds, dtrain) -> (gradient, hessian)
        Objective passed to ``xgb.train`` via ``obj``.
    model_name : str
        Used in log messages and as the prefix of the output CSV file name.

    Returns
    -------
    None
        Aggregated validation metrics are printed and the predictions are
        written to ``f"{model_name}_test_task2_combined.csv"``.
    """
    print(f"Training {model_name} ...")

    params = {
        # Gradients come from custom_obj, not from this objective.
        # NOTE(review): presumably this still selects the default eval metric
        # that early stopping monitors -- confirm against the xgboost version in use.
        'objective': 'reg:squarederror',
        'max_depth': 6,
        'learning_rate': 0.1,
        'verbosity': 0,
        'seed': 42,
    }
    max_rounds = 100
    patience = 10

    # Loop-invariant inputs, built once instead of once per target.
    # The refit below uses every labelled row (train + validation); before,
    # the "full data" refit silently reused the training split only.
    X_full = np.vstack([X, X_val])
    y_full = np.concatenate([y, y_val], axis=0)
    dtest = xgb.DMatrix(np.asarray(X_test))

    val_preds = []
    val_trues = []

    test_preds = np.zeros((X_test.shape[0], y.shape[1]))

    for i, col in enumerate(target_cols):
        print(f"Training target: {col}")
        dtrain = xgb.DMatrix(X, label=y[:, i])
        dval = xgb.DMatrix(X_val, label=y_val[:, i])

        # Train with early stopping evaluated on the validation split.
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=max_rounds,
            obj=custom_obj,
            evals=[(dval, 'validation')],
            early_stopping_rounds=patience,
            verbose_eval=False
        )

        # best_iteration is zero-based, so the best model has best_iteration + 1
        # trees. The old `best_iteration or 100` was off by one and turned a
        # best iteration of 0 into 100 rounds.
        best_rounds = model.best_iteration + 1

        # The native Booster.predict uses every tree unless told otherwise,
        # so restrict it to the best iteration explicitly.
        val_pred = model.predict(dval, iteration_range=(0, best_rounds))
        val_preds.append(val_pred)
        val_trues.append(y_val[:, i])

        # Refit on all labelled data for the test prediction.
        dfull = xgb.DMatrix(X_full, label=y_full[:, i])
        model_full = xgb.train(
            params,
            dfull,
            num_boost_round=best_rounds,
            obj=custom_obj,
            verbose_eval=False
        )
        test_preds[:, i] = model_full.predict(dtest)

    # Aggregate validation results into (n_val_samples, n_targets) arrays.
    val_preds_arr = np.column_stack(val_preds)
    val_trues_arr = np.column_stack(val_trues)

    print(f"{model_name} - Validation metrics (aggregated):")
    print(calculate_metrics(val_trues_arr, val_preds_arr))

    # Save test predictions next to the 'stimulus' identifiers.
    submission = test_form[['stimulus']].copy()
    for i, col in enumerate(target_cols):
        submission[col] = test_preds[:, i]
    submission_file = f"{model_name}_test_task2_combined.csv"
    submission.to_csv(submission_file, index=False)
    print(f"Saved test predictions to {submission_file}")

# ------- Prepare data -------

def _to_float32_matrix(table):
    """Coerce every column to numeric, replace unparseable/missing values with 0, return float32."""
    return (
        pd.DataFrame(table)
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
        .to_numpy(dtype=np.float32)
    )


X = _to_float32_matrix(x_df_lasso)
y = _to_float32_matrix(y_df)
# The test features now go through the same coercion as the training
# features; previously they were passed through raw, so any missing or
# non-numeric test value was treated differently from training.
X_test_arr = _to_float32_matrix(X_test)
assert X.shape[1] == X_test_arr.shape[1], (
    f"train/test feature count mismatch: {X.shape[1]} vs {X_test_arr.shape[1]}"
)
# target_cols (list of 51 target column names) is used as defined earlier.
test_form_df = test_form  # contains 'stimulus'


# Split train data for validation
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------- Run models -------

# 1. Pearson correlation objective model
train_custom_xgb_model(
    X_train_split, y_train_split, X_val, y_val, X_test_arr, test_form_df, target_cols,
    pearson_correlation_obj, "XGBoost_PearsonCorrelation"
)

# 2. Cosine similarity objective model
train_custom_xgb_model(
    X_train_split, y_train_split, X_val, y_val, X_test_arr, test_form_df, target_cols,
    cosine_similarity_obj, "XGBoost_CosineSimilarity"
)


Training XGBoost_PearsonCorrelation ...
Training target: Green
Training target: Cucumber
Training target: Herbal
Training target: Mint
Training target: Woody
Training target: Pine
Training target: Floral
Training target: Powdery
Training target: Fruity
Training target: Citrus
Training target: Tropical
Training target: Berry
Training target: Peach
Training target: Sweet
Training target: Caramellic
Training target: Vanilla
Training target: BrownSpice
Training target: Smoky
Training target: Burnt
Training target: Roasted
Training target: Grainy
Training target: Meaty
Training target: Nutty
Training target: Fatty
Training target: Coconut
Training target: Waxy
Training target: Dairy
Training target: Buttery
Training target: Cheesy
Training target: Sour
Training target: Fermented
Training target: Sulfurous
Training target: Garlic.Onion
Training target: Earthy
Training target: Mushroom
Training target: Musty
Training target: Ammonia
Training target: Fishy
Training target: Fecal
Training targe