# **embeddings**

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Submission template; its 'stimulus' column seeds the output files below.
test_form = pd.read_csv(
    'TASK2_Test_set_Submission_form.csv',
    encoding='latin1',
)

# Training features, training targets and test features, read from CSV.
x_df_lasso = pd.read_csv('X_train_pca_99.csv')
y_df = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test_pca_99.csv')

# Descriptor names, in the order the prediction columns are written out.
target_cols = [
    'Green', 'Cucumber', 'Herbal', 'Mint', 'Woody',
    'Pine', 'Floral', 'Powdery', 'Fruity', 'Citrus',
    'Tropical', 'Berry', 'Peach', 'Sweet', 'Caramellic',
    'Vanilla', 'BrownSpice', 'Smoky', 'Burnt', 'Roasted',
    'Grainy', 'Meaty', 'Nutty', 'Fatty', 'Coconut',
    'Waxy', 'Dairy', 'Buttery', 'Cheesy', 'Sour',
    'Fermented', 'Sulfurous', 'Garlic.Onion', 'Earthy', 'Mushroom',
    'Musty', 'Ammonia', 'Fishy', 'Fecal', 'Rotten.Decay',
    'Rubber', 'Phenolic', 'Animal', 'Medicinal', 'Cooling',
    'Sharp', 'Chlorine', 'Alcoholic', 'Plastic', 'Ozone',
    'Metallic',
]

In [None]:
import torch
import torch.nn as nn

class PearsonCosineLoss(nn.Module):
    """Blend of ``1 - Pearson correlation`` and ``1 - cosine similarity``.

    Both terms are computed per row (reductions run along ``dim=1``) and the
    blended per-row loss is averaged into a scalar.

    Args:
        alpha: Weight of the Pearson term; the cosine term gets ``1 - alpha``.
        eps: Lower bound of the Pearson denominator; also forwarded to
            ``nn.CosineSimilarity`` as its ``eps``.
    """

    def __init__(self, alpha=0.5, eps=1e-8):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=eps)

    def forward(self, y_pred, y_true):
        """Return the scalar blended loss of ``y_pred`` against ``y_true``."""
        # Pearson Correlation (1 - correlation for minimization)
        y_pred_centered = y_pred - y_pred.mean(dim=1, keepdim=True)
        y_true_centered = y_true - y_true.mean(dim=1, keepdim=True)
        numerator = (y_pred_centered * y_true_centered).sum(dim=1)
        variance_product = (
            y_pred_centered.pow(2).sum(dim=1) * y_true_centered.pow(2).sum(dim=1)
        )
        # Clamp *before* the square root: sqrt has an infinite derivative at
        # 0, so a constant row (zero variance) would otherwise make the
        # backward pass evaluate 0/0 = NaN and corrupt every gradient.
        denominator = variance_product.clamp_min(self.eps ** 2).sqrt()
        pearson_corr = numerator / denominator
        pearson_loss = 1 - pearson_corr

        # Cosine Similarity (1 - similarity for minimization)
        cosine_loss = 1 - self.cosine_similarity(y_pred, y_true)
        loss = self.alpha * pearson_loss + (1 - self.alpha) * cosine_loss
        return loss.mean()
class HyperbolicModel(nn.Module):
    """Fully connected network with tanh activations: input -> 128 -> 64 -> output.

    Args:
        input_dim: Number of input features.
        output_dim: Number of regression outputs.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Apply the two tanh hidden layers followed by a linear output layer."""
        x = self.tanh(self.fc1(x))
        x = self.tanh(self.fc2(x))
        # No activation on the last layer: outputs are unbounded here.
        x = self.fc3(x)
        return x

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def calculate_metrics(y_true, y_pred):
    """Summarise regression quality for 2-D arrays of targets and predictions.

    Returns a dict with MSE, MAE and R2 (from scikit-learn), the mean
    per-column Pearson correlation (a column with zero standard deviation on
    either side counts as 0) and the mean per-row cosine similarity.
    """
    column_corrs = []
    for col in range(y_true.shape[1]):
        truth_col = y_true[:, col]
        pred_col = y_pred[:, col]
        if np.std(truth_col) > 0 and np.std(pred_col) > 0:
            column_corrs.append(np.corrcoef(truth_col, pred_col)[0, 1])
        else:
            # Correlation is undefined for a constant column.
            column_corrs.append(0)

    row_dots = np.sum(y_true * y_pred, axis=1)
    row_norm_product = np.linalg.norm(y_true, axis=1) * np.linalg.norm(y_pred, axis=1)
    # 1e-8 guards against division by zero for all-zero rows.
    row_cosines = row_dots / (row_norm_product + 1e-8)

    return {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
        'Avg Pearson Correlation': np.mean(column_corrs),
        'Avg Cosine Similarity': np.mean(row_cosines),
    }

# Training routine
def _train_minibatch(model, loss_fn, optimizer, X_t, y_t, epochs, batch_size, log_every=None):
    """Train ``model`` in place using freshly shuffled mini-batches each epoch.

    When ``log_every`` is set, the loss of the last batch is printed every
    ``log_every`` epochs.
    """
    n = X_t.shape[0]
    for epoch in range(epochs):
        model.train()
        order = np.random.permutation(n)
        loss = None  # stays None only if there is no data at all
        for start in range(0, n, batch_size):
            batch_idx = order[start:start + batch_size]
            optimizer.zero_grad()
            loss = loss_fn(model(X_t[batch_idx]), y_t[batch_idx])
            loss.backward()
            optimizer.step()
        if log_every and loss is not None and (epoch + 1) % log_every == 0:
            print(f"Epoch {epoch+1}/{epochs}: Train Loss = {loss.item():.4f}")


def _predict_clipped(model, X_t, low=0.0, high=5.0):
    """Return eval-mode predictions as a NumPy array clipped to ``[low, high]``."""
    model.eval()
    with torch.no_grad():
        preds = model(X_t).cpu().numpy()
    return np.clip(preds, low, high)


def train_and_evaluate(X, y, test_ratio=0.2, epochs=100, lr=1e-3, batch_size=32, alpha=0.5):
    """Validate a ``HyperbolicModel`` on a hold-out split, then refit on all data.

    Args:
        X: 2-D feature array.
        y: 2-D target array with one row per row of ``X``.
        test_ratio: Fraction of rows held out for validation.
        epochs: Number of passes over the data for each fit.
        lr: Adam learning rate.
        batch_size: Mini-batch size for both fits.
        alpha: Pearson weight passed to ``PearsonCosineLoss``.

    Returns:
        ``(model, device)`` where ``model`` is the network refit on all of
        ``X``/``y`` (left in eval mode) and ``device`` is where it lives.

    Train, validation and full-data metrics are printed; predictions are
    clipped to [0, 5] before scoring.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    loss_fn = PearsonCosineLoss(alpha=alpha)

    # Split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_ratio, random_state=42)
    X_train_t = torch.FloatTensor(X_train).to(device)
    y_train_t = torch.FloatTensor(y_train).to(device)
    X_val_t = torch.FloatTensor(X_val).to(device)

    # Model fitted on the training split only
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    _train_minibatch(model, loss_fn, optimizer, X_train_t, y_train_t,
                     epochs, batch_size, log_every=20)

    # Print metrics on split data
    train_metrics = calculate_metrics(y_train, _predict_clipped(model, X_train_t))
    val_metrics = calculate_metrics(y_val, _predict_clipped(model, X_val_t))
    print("\nTrain Metrics:", train_metrics)
    print("Validation Metrics:", val_metrics)

    # Retrain on full data with the *same* mini-batch schedule that was just
    # validated; a single full-batch step per epoch would give the final model
    # far fewer parameter updates than the one the metrics above describe.
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    X_full_t = torch.FloatTensor(X).to(device)
    y_full_t = torch.FloatTensor(y).to(device)
    _train_minibatch(model, loss_fn, optimizer, X_full_t, y_full_t, epochs, batch_size)

    full_metrics = calculate_metrics(y, _predict_clipped(model, X_full_t))
    print("\nMetrics on Full Data:", full_metrics)
    return model, device
# Run training and print metrics
# Coerce the training tables the same way as the test table below, so that
# non-numeric cells and NaNs are zero-filled on both sides.
X_train_arr = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y_train_arr = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
model, device = train_and_evaluate(
    X_train_arr,
    y_train_arr,
    test_ratio=0.2,
    epochs=100,
    lr=1e-3,
    batch_size=32,
    alpha=0.5
)

# Convert all columns to numeric (with NaNs where conversion failed).
# pd.DataFrame(...) also accepts an ndarray, so this keeps working if another
# cell has already rebound X_test to a NumPy array.
X_test_numeric = pd.DataFrame(X_test).apply(pd.to_numeric, errors='coerce')

# Fill NaN values (e.g., with 0 or other imputation strategy)
X_test_filled = X_test_numeric.fillna(0)

# Convert to numpy array of floats
X_test_arr = X_test_filled.to_numpy(dtype=np.float32)
model.eval()
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_arr).to(device)
    test_preds = model(X_test_tensor).cpu().numpy()
test_preds = np.clip(test_preds, 0.0, 5.0)

# Prediction columns are matched to target_cols by position, so the widths
# must agree or the submission would be silently mislabeled/truncated.
if test_preds.shape[1] != len(target_cols):
    raise ValueError(
        f"Model produced {test_preds.shape[1]} outputs but target_cols has "
        f"{len(target_cols)} names"
    )

# Save submission
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]

test_submission.to_csv('hyperbolic_task1_embedding.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Epoch 20/100: Train Loss = 0.1925
Epoch 40/100: Train Loss = 0.1136
Epoch 60/100: Train Loss = 0.0915
Epoch 80/100: Train Loss = 0.1551
Epoch 100/100: Train Loss = 0.1325
\nTrain Metrics: {'MSE': 0.0564488023519516, 'MAE': 0.13370798528194427, 'R2': 0.4468505084514618, 'Avg Pearson Correlation': np.float64(0.7138984317559982), 'Avg Cosine Similarity': np.float32(0.89378947)}
Validation Metrics: {'MSE': 0.0885881707072258, 'MAE': 0.17378298938274384, 'R2': 0.18818046152591705, 'Avg Pearson Correlation': np.float64(0.4832927958571777), 'Avg Cosine Similarity': np.float32(0.7661117)}
\nMetrics on Full Data: {'MSE': 0.07794491201639175, 'MAE': 0.15547186136245728, 'R2': 0.2616089880466461, 'Avg Pearson Correlation': np.float64(0.5925635027788144), 'Avg Cosine Similarity': np.float32(0.8463975)}
Test predictions saved.
Test submission shape: (130, 52)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Custom metrics ---
def pearson_correlation_score(y_true, y_pred):
    """Average the per-column Pearson correlation of two 2-D arrays.

    A column whose correlation evaluates to NaN contributes 0 to the average.
    """
    def column_corr(col):
        r = np.corrcoef(y_true[:, col], y_pred[:, col])[0, 1]
        return 0 if np.isnan(r) else r

    return np.mean([column_corr(col) for col in range(y_true.shape[1])])

def cosine_similarity_score(y_true, y_pred):
    """Average the row-wise cosine similarity between two 2-D arrays."""
    def unit(vec):
        # 1e-8 keeps an all-zero row from dividing by zero.
        return vec / (np.linalg.norm(vec) + 1e-8)

    row_cosines = [
        np.dot(unit(y_true[row]), unit(y_pred[row]))
        for row in range(y_true.shape[0])
    ]
    return np.mean(row_cosines)

def calculate_metrics(y_true, y_pred):
    """Evaluate every tracked metric and return them as a name -> value dict."""
    metric_fns = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
        ("Avg Pearson Correlation", pearson_correlation_score),
        ("Avg Cosine Similarity", cosine_similarity_score),
    )
    return {label: fn(y_true, y_pred) for label, fn in metric_fns}

# --------- Random-forest workflow ---------
# Inputs expected from earlier cells:
# x_df_lasso: DataFrame of training features; y_df: DataFrame of targets
# X_test: test features (DataFrame, or the array left by an earlier run of this cell)
# test_form: DataFrame with a 'stimulus' column (presumably row-aligned with X_test)
# target_cols: list of target column names

# 1. Convert to numpy, coercing non-numeric cells to NaN and zero-filling them
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# Same coercion for the test features so train and test are treated alike.
# Wrapping in pd.DataFrame keeps the cell re-runnable once X_test is an array.
X_test = pd.DataFrame(X_test).apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# 2. 80/20 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train MultiOutput RandomForest
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf.fit(X_train, y_train)

# 4. Print metrics on splits
y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)
y_train_pred = np.clip(y_train_pred, 0.0, 5.0)
y_val_pred = np.clip(y_val_pred, 0.0, 5.0)
print("Training metrics:")
print(calculate_metrics(y_train, y_train_pred))
print("\nValidation metrics:")
print(calculate_metrics(y_val, y_val_pred))

# 5. Retrain on entire dataset, print metrics
rf_full = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf_full.fit(X, y)
y_full_pred = rf_full.predict(X)
y_full_pred = np.clip(y_full_pred, 0.0, 5.0)
print("\nFull dataset metrics:")
print(calculate_metrics(y, y_full_pred))

# 6. Predict on test set, save file
test_preds = rf_full.predict(X_test)
# Clip to the same [0, 5] range used for every evaluated prediction above;
# a lower bound of 1.0 would push all low ratings up to 1.
test_preds = np.clip(test_preds, 0.0, 5.0)
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]
test_submission.to_csv('rf_corr_cosine_metrics_test_task1_embedding.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Training metrics:
{'MSE': 0.012924530089000828, 'MAE': 0.06970125282180799, 'R2': 0.8775518347496295, 'Avg Pearson Correlation': np.float64(0.9627352767812898), 'Avg Cosine Similarity': np.float64(0.9709924863735443)}

Validation metrics:
{'MSE': 0.0904898011345628, 'MAE': 0.18506177534377008, 'R2': 0.1767408317108979, 'Avg Pearson Correlation': np.float64(0.44802260227645946), 'Avg Cosine Similarity': np.float64(0.7506912447513886)}

Full dataset metrics:
{'MSE': 0.012487009699261904, 'MAE': 0.06828805190169289, 'R2': 0.8810881970432264, 'Avg Pearson Correlation': np.float64(0.961510866322858), 'Avg Cosine Similarity': np.float64(0.9718065694108035)}
Test predictions saved.
Test submission shape: (130, 52)


# **Embedding-95+selectkbest**

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Submission template; its 'stimulus' column seeds the output files below.
test_form = pd.read_csv(
    'TASK2_Test_set_Submission_form.csv',
    encoding='latin1',
)

# Training features, training targets and test features, read from CSV.
x_df_lasso = pd.read_csv('combined_train.csv')
y_df = pd.read_csv('y_train.csv')
X_test = pd.read_csv('combined_test.csv')

# Descriptor names, in the order the prediction columns are written out.
target_cols = [
    'Green', 'Cucumber', 'Herbal', 'Mint', 'Woody',
    'Pine', 'Floral', 'Powdery', 'Fruity', 'Citrus',
    'Tropical', 'Berry', 'Peach', 'Sweet', 'Caramellic',
    'Vanilla', 'BrownSpice', 'Smoky', 'Burnt', 'Roasted',
    'Grainy', 'Meaty', 'Nutty', 'Fatty', 'Coconut',
    'Waxy', 'Dairy', 'Buttery', 'Cheesy', 'Sour',
    'Fermented', 'Sulfurous', 'Garlic.Onion', 'Earthy', 'Mushroom',
    'Musty', 'Ammonia', 'Fishy', 'Fecal', 'Rotten.Decay',
    'Rubber', 'Phenolic', 'Animal', 'Medicinal', 'Cooling',
    'Sharp', 'Chlorine', 'Alcoholic', 'Plastic', 'Ozone',
    'Metallic',
]

In [None]:
import torch
import torch.nn as nn

class PearsonCosineLoss(nn.Module):
    """Blend of ``1 - Pearson correlation`` and ``1 - cosine similarity``.

    Both terms are computed per row (reductions run along ``dim=1``) and the
    blended per-row loss is averaged into a scalar.

    Args:
        alpha: Weight of the Pearson term; the cosine term gets ``1 - alpha``.
        eps: Lower bound of the Pearson denominator; also forwarded to
            ``nn.CosineSimilarity`` as its ``eps``.
    """

    def __init__(self, alpha=0.5, eps=1e-8):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=eps)

    def forward(self, y_pred, y_true):
        """Return the scalar blended loss of ``y_pred`` against ``y_true``."""
        # Pearson Correlation (1 - correlation for minimization)
        y_pred_centered = y_pred - y_pred.mean(dim=1, keepdim=True)
        y_true_centered = y_true - y_true.mean(dim=1, keepdim=True)
        numerator = (y_pred_centered * y_true_centered).sum(dim=1)
        variance_product = (
            y_pred_centered.pow(2).sum(dim=1) * y_true_centered.pow(2).sum(dim=1)
        )
        # Clamp *before* the square root: sqrt has an infinite derivative at
        # 0, so a constant row (zero variance) would otherwise make the
        # backward pass evaluate 0/0 = NaN and corrupt every gradient.
        denominator = variance_product.clamp_min(self.eps ** 2).sqrt()
        pearson_corr = numerator / denominator
        pearson_loss = 1 - pearson_corr

        # Cosine Similarity (1 - similarity for minimization)
        cosine_loss = 1 - self.cosine_similarity(y_pred, y_true)
        loss = self.alpha * pearson_loss + (1 - self.alpha) * cosine_loss
        return loss.mean()
class HyperbolicModel(nn.Module):
    """Fully connected network with tanh activations: input -> 128 -> 64 -> output."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Layers are created in forward order.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Run two tanh hidden layers, then a linear output layer."""
        hidden = self.tanh(self.fc1(x))
        hidden = self.tanh(self.fc2(hidden))
        return self.fc3(hidden)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def calculate_metrics(y_true, y_pred):
    """Summarise regression quality for 2-D arrays of targets and predictions.

    Returns a dict with MSE, MAE and R2 (from scikit-learn), the mean
    per-column Pearson correlation (a column with zero standard deviation on
    either side counts as 0) and the mean per-row cosine similarity.
    """
    column_corrs = []
    for col in range(y_true.shape[1]):
        truth_col = y_true[:, col]
        pred_col = y_pred[:, col]
        if np.std(truth_col) > 0 and np.std(pred_col) > 0:
            column_corrs.append(np.corrcoef(truth_col, pred_col)[0, 1])
        else:
            # Correlation is undefined for a constant column.
            column_corrs.append(0)

    row_dots = np.sum(y_true * y_pred, axis=1)
    row_norm_product = np.linalg.norm(y_true, axis=1) * np.linalg.norm(y_pred, axis=1)
    # 1e-8 guards against division by zero for all-zero rows.
    row_cosines = row_dots / (row_norm_product + 1e-8)

    return {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
        'Avg Pearson Correlation': np.mean(column_corrs),
        'Avg Cosine Similarity': np.mean(row_cosines),
    }

# Training routine
def _train_minibatch(model, loss_fn, optimizer, X_t, y_t, epochs, batch_size, log_every=None):
    """Train ``model`` in place using freshly shuffled mini-batches each epoch.

    When ``log_every`` is set, the loss of the last batch is printed every
    ``log_every`` epochs.
    """
    n = X_t.shape[0]
    for epoch in range(epochs):
        model.train()
        order = np.random.permutation(n)
        loss = None  # stays None only if there is no data at all
        for start in range(0, n, batch_size):
            batch_idx = order[start:start + batch_size]
            optimizer.zero_grad()
            loss = loss_fn(model(X_t[batch_idx]), y_t[batch_idx])
            loss.backward()
            optimizer.step()
        if log_every and loss is not None and (epoch + 1) % log_every == 0:
            print(f"Epoch {epoch+1}/{epochs}: Train Loss = {loss.item():.4f}")


def _predict_clipped(model, X_t, low=0.0, high=5.0):
    """Return eval-mode predictions as a NumPy array clipped to ``[low, high]``."""
    model.eval()
    with torch.no_grad():
        preds = model(X_t).cpu().numpy()
    return np.clip(preds, low, high)


def train_and_evaluate(X, y, test_ratio=0.2, epochs=100, lr=1e-3, batch_size=32, alpha=0.5):
    """Validate a ``HyperbolicModel`` on a hold-out split, then refit on all data.

    Args:
        X: 2-D feature array.
        y: 2-D target array with one row per row of ``X``.
        test_ratio: Fraction of rows held out for validation.
        epochs: Number of passes over the data for each fit.
        lr: Adam learning rate.
        batch_size: Mini-batch size for both fits.
        alpha: Pearson weight passed to ``PearsonCosineLoss``.

    Returns:
        ``(model, device)`` where ``model`` is the network refit on all of
        ``X``/``y`` (left in eval mode) and ``device`` is where it lives.

    Train, validation and full-data metrics are printed; predictions are
    clipped to [0, 5] before scoring.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    loss_fn = PearsonCosineLoss(alpha=alpha)

    # Split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_ratio, random_state=42)
    X_train_t = torch.FloatTensor(X_train).to(device)
    y_train_t = torch.FloatTensor(y_train).to(device)
    X_val_t = torch.FloatTensor(X_val).to(device)

    # Model fitted on the training split only
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    _train_minibatch(model, loss_fn, optimizer, X_train_t, y_train_t,
                     epochs, batch_size, log_every=20)

    # Print metrics on split data
    train_metrics = calculate_metrics(y_train, _predict_clipped(model, X_train_t))
    val_metrics = calculate_metrics(y_val, _predict_clipped(model, X_val_t))
    print("\nTrain Metrics:", train_metrics)
    print("Validation Metrics:", val_metrics)

    # Retrain on full data with the *same* mini-batch schedule that was just
    # validated; a single full-batch step per epoch would give the final model
    # far fewer parameter updates than the one the metrics above describe.
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    X_full_t = torch.FloatTensor(X).to(device)
    y_full_t = torch.FloatTensor(y).to(device)
    _train_minibatch(model, loss_fn, optimizer, X_full_t, y_full_t, epochs, batch_size)

    full_metrics = calculate_metrics(y, _predict_clipped(model, X_full_t))
    print("\nMetrics on Full Data:", full_metrics)
    return model, device
# Run training and print metrics
# Coerce the training tables the same way as the test table below, so that
# non-numeric cells and NaNs are zero-filled on both sides.
X_train_arr = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y_train_arr = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
model, device = train_and_evaluate(
    X_train_arr,
    y_train_arr,
    test_ratio=0.2,
    epochs=100,
    lr=1e-3,
    batch_size=32,
    alpha=0.5
)

# Convert all columns to numeric (with NaNs where conversion failed).
# pd.DataFrame(...) also accepts an ndarray, so this keeps working if another
# cell has already rebound X_test to a NumPy array.
X_test_numeric = pd.DataFrame(X_test).apply(pd.to_numeric, errors='coerce')

# Fill NaN values (e.g., with 0 or other imputation strategy)
X_test_filled = X_test_numeric.fillna(0)

# Convert to numpy array of floats
X_test_arr = X_test_filled.to_numpy(dtype=np.float32)
model.eval()
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_arr).to(device)
    test_preds = model(X_test_tensor).cpu().numpy()
test_preds = np.clip(test_preds, 0.0, 5.0)

# Prediction columns are matched to target_cols by position, so the widths
# must agree or the submission would be silently mislabeled/truncated.
if test_preds.shape[1] != len(target_cols):
    raise ValueError(
        f"Model produced {test_preds.shape[1]} outputs but target_cols has "
        f"{len(target_cols)} names"
    )

# Save submission
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]

test_submission.to_csv('hyperbolic_task1_embeddingselected.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Epoch 20/100: Train Loss = 0.3005
Epoch 40/100: Train Loss = 0.2398
Epoch 60/100: Train Loss = 0.2184
Epoch 80/100: Train Loss = 0.3096
Epoch 100/100: Train Loss = 0.1938
\nTrain Metrics: {'MSE': 0.050878871232271194, 'MAE': 0.14275841414928436, 'R2': 0.4570937156677246, 'Avg Pearson Correlation': np.float64(0.6982612100152307), 'Avg Cosine Similarity': np.float32(0.8739893)}
Validation Metrics: {'MSE': 0.08204477280378342, 'MAE': 0.18219299614429474, 'R2': 0.18200212717056274, 'Avg Pearson Correlation': np.float64(0.5159627029937517), 'Avg Cosine Similarity': np.float32(0.77512324)}
\nMetrics on Full Data: {'MSE': 0.07584412395954132, 'MAE': 0.16589325666427612, 'R2': 0.27048173546791077, 'Avg Pearson Correlation': np.float64(0.5305303592981897), 'Avg Cosine Similarity': np.float32(0.7997642)}
Test predictions saved.
Test submission shape: (130, 52)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Custom metrics ---
def pearson_correlation_score(y_true, y_pred):
    """Average the per-column Pearson correlation of two 2-D arrays.

    A column whose correlation evaluates to NaN contributes 0 to the average.
    """
    def column_corr(col):
        r = np.corrcoef(y_true[:, col], y_pred[:, col])[0, 1]
        return 0 if np.isnan(r) else r

    return np.mean([column_corr(col) for col in range(y_true.shape[1])])

def cosine_similarity_score(y_true, y_pred):
    """Average the row-wise cosine similarity between two 2-D arrays."""
    def unit(vec):
        # 1e-8 keeps an all-zero row from dividing by zero.
        return vec / (np.linalg.norm(vec) + 1e-8)

    row_cosines = [
        np.dot(unit(y_true[row]), unit(y_pred[row]))
        for row in range(y_true.shape[0])
    ]
    return np.mean(row_cosines)

def calculate_metrics(y_true, y_pred):
    """Evaluate every tracked metric and return them as a name -> value dict."""
    metric_fns = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
        ("Avg Pearson Correlation", pearson_correlation_score),
        ("Avg Cosine Similarity", cosine_similarity_score),
    )
    return {label: fn(y_true, y_pred) for label, fn in metric_fns}

# --------- Random-forest workflow ---------
# Inputs expected from earlier cells:
# x_df_lasso: DataFrame of training features; y_df: DataFrame of targets
# X_test: test features (DataFrame, or the array left by an earlier run of this cell)
# test_form: DataFrame with a 'stimulus' column (presumably row-aligned with X_test)
# target_cols: list of target column names

# 1. Convert to numpy, coercing non-numeric cells to NaN and zero-filling them
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# Same coercion for the test features so train and test are treated alike.
# Wrapping in pd.DataFrame keeps the cell re-runnable once X_test is an array.
X_test = pd.DataFrame(X_test).apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# 2. 80/20 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train MultiOutput RandomForest
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf.fit(X_train, y_train)

# 4. Print metrics on splits
y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)
y_train_pred = np.clip(y_train_pred, 0.0, 5.0)
y_val_pred = np.clip(y_val_pred, 0.0, 5.0)
print("Training metrics:")
print(calculate_metrics(y_train, y_train_pred))
print("\nValidation metrics:")
print(calculate_metrics(y_val, y_val_pred))

# 5. Retrain on entire dataset, print metrics
rf_full = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf_full.fit(X, y)
y_full_pred = rf_full.predict(X)
y_full_pred = np.clip(y_full_pred, 0.0, 5.0)
print("\nFull dataset metrics:")
print(calculate_metrics(y, y_full_pred))

# 6. Predict on test set, save file
test_preds = rf_full.predict(X_test)
test_preds = np.clip(test_preds, 0.0, 5.0)
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]
test_submission.to_csv('rf_corr_cosine_metrics_test_task1_embeddingselected.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Training metrics:
{'MSE': 0.009851440192688517, 'MAE': 0.0609627863115571, 'R2': 0.8994140372182589, 'Avg Pearson Correlation': np.float64(0.9673644522502512), 'Avg Cosine Similarity': np.float64(0.9773205629724832)}

Validation metrics:
{'MSE': 0.05892787491337023, 'MAE': 0.1564852763367268, 'R2': 0.3706585373342168, 'Avg Pearson Correlation': np.float64(0.624083710898084), 'Avg Cosine Similarity': np.float64(0.8408294778589058)}

Full dataset metrics:
{'MSE': 0.009251093740760594, 'MAE': 0.0593817850123281, 'R2': 0.9041519242865723, 'Avg Pearson Correlation': np.float64(0.9668635977824981), 'Avg Cosine Similarity': np.float64(0.9783503654391273)}
Test predictions saved.
Test submission shape: (130, 52)


# **selectkbest**

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Submission template; its 'stimulus' column seeds the output files below.
test_form = pd.read_csv(
    'TASK2_Test_set_Submission_form.csv',
    encoding='latin1',
)

# Training features, training targets and test features, read from CSV.
x_df_lasso = pd.read_csv('X_train_lasso_selected.csv')
y_df = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test_lasso_selected.csv')

# Descriptor names, in the order the prediction columns are written out.
target_cols = [
    'Green', 'Cucumber', 'Herbal', 'Mint', 'Woody',
    'Pine', 'Floral', 'Powdery', 'Fruity', 'Citrus',
    'Tropical', 'Berry', 'Peach', 'Sweet', 'Caramellic',
    'Vanilla', 'BrownSpice', 'Smoky', 'Burnt', 'Roasted',
    'Grainy', 'Meaty', 'Nutty', 'Fatty', 'Coconut',
    'Waxy', 'Dairy', 'Buttery', 'Cheesy', 'Sour',
    'Fermented', 'Sulfurous', 'Garlic.Onion', 'Earthy', 'Mushroom',
    'Musty', 'Ammonia', 'Fishy', 'Fecal', 'Rotten.Decay',
    'Rubber', 'Phenolic', 'Animal', 'Medicinal', 'Cooling',
    'Sharp', 'Chlorine', 'Alcoholic', 'Plastic', 'Ozone',
    'Metallic',
]

In [None]:
import torch
import torch.nn as nn

class PearsonCosineLoss(nn.Module):
    """Blend of ``1 - Pearson correlation`` and ``1 - cosine similarity``.

    Both terms are computed per row (reductions run along ``dim=1``) and the
    blended per-row loss is averaged into a scalar.

    Args:
        alpha: Weight of the Pearson term; the cosine term gets ``1 - alpha``.
        eps: Lower bound of the Pearson denominator; also forwarded to
            ``nn.CosineSimilarity`` as its ``eps``.
    """

    def __init__(self, alpha=0.5, eps=1e-8):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.cosine_similarity = nn.CosineSimilarity(dim=1, eps=eps)

    def forward(self, y_pred, y_true):
        """Return the scalar blended loss of ``y_pred`` against ``y_true``."""
        # Pearson Correlation (1 - correlation for minimization)
        y_pred_centered = y_pred - y_pred.mean(dim=1, keepdim=True)
        y_true_centered = y_true - y_true.mean(dim=1, keepdim=True)
        numerator = (y_pred_centered * y_true_centered).sum(dim=1)
        variance_product = (
            y_pred_centered.pow(2).sum(dim=1) * y_true_centered.pow(2).sum(dim=1)
        )
        # Clamp *before* the square root: sqrt has an infinite derivative at
        # 0, so a constant row (zero variance) would otherwise make the
        # backward pass evaluate 0/0 = NaN and corrupt every gradient.
        denominator = variance_product.clamp_min(self.eps ** 2).sqrt()
        pearson_corr = numerator / denominator
        pearson_loss = 1 - pearson_corr

        # Cosine Similarity (1 - similarity for minimization)
        cosine_loss = 1 - self.cosine_similarity(y_pred, y_true)
        loss = self.alpha * pearson_loss + (1 - self.alpha) * cosine_loss
        return loss.mean()
class HyperbolicModel(nn.Module):
    """Fully connected network with tanh activations: input -> 128 -> 64 -> output."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Layers are created in forward order.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Run two tanh hidden layers, then a linear output layer."""
        hidden = self.tanh(self.fc1(x))
        hidden = self.tanh(self.fc2(hidden))
        return self.fc3(hidden)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def calculate_metrics(y_true, y_pred):
    """Summarise regression quality for 2-D arrays of targets and predictions.

    Returns a dict with MSE, MAE and R2 (from scikit-learn), the mean
    per-column Pearson correlation (a column with zero standard deviation on
    either side counts as 0) and the mean per-row cosine similarity.
    """
    column_corrs = []
    for col in range(y_true.shape[1]):
        truth_col = y_true[:, col]
        pred_col = y_pred[:, col]
        if np.std(truth_col) > 0 and np.std(pred_col) > 0:
            column_corrs.append(np.corrcoef(truth_col, pred_col)[0, 1])
        else:
            # Correlation is undefined for a constant column.
            column_corrs.append(0)

    row_dots = np.sum(y_true * y_pred, axis=1)
    row_norm_product = np.linalg.norm(y_true, axis=1) * np.linalg.norm(y_pred, axis=1)
    # 1e-8 guards against division by zero for all-zero rows.
    row_cosines = row_dots / (row_norm_product + 1e-8)

    return {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
        'Avg Pearson Correlation': np.mean(column_corrs),
        'Avg Cosine Similarity': np.mean(row_cosines),
    }

# Training routine
def _train_minibatch(model, loss_fn, optimizer, X_t, y_t, epochs, batch_size, log_every=None):
    """Train ``model`` in place using freshly shuffled mini-batches each epoch.

    When ``log_every`` is set, the loss of the last batch is printed every
    ``log_every`` epochs.
    """
    n = X_t.shape[0]
    for epoch in range(epochs):
        model.train()
        order = np.random.permutation(n)
        loss = None  # stays None only if there is no data at all
        for start in range(0, n, batch_size):
            batch_idx = order[start:start + batch_size]
            optimizer.zero_grad()
            loss = loss_fn(model(X_t[batch_idx]), y_t[batch_idx])
            loss.backward()
            optimizer.step()
        if log_every and loss is not None and (epoch + 1) % log_every == 0:
            print(f"Epoch {epoch+1}/{epochs}: Train Loss = {loss.item():.4f}")


def _predict_clipped(model, X_t, low=0.0, high=5.0):
    """Return eval-mode predictions as a NumPy array clipped to ``[low, high]``."""
    model.eval()
    with torch.no_grad():
        preds = model(X_t).cpu().numpy()
    return np.clip(preds, low, high)


def train_and_evaluate(X, y, test_ratio=0.2, epochs=100, lr=1e-3, batch_size=32, alpha=0.5):
    """Validate a ``HyperbolicModel`` on a hold-out split, then refit on all data.

    Args:
        X: 2-D feature array.
        y: 2-D target array with one row per row of ``X``.
        test_ratio: Fraction of rows held out for validation.
        epochs: Number of passes over the data for each fit.
        lr: Adam learning rate.
        batch_size: Mini-batch size for both fits.
        alpha: Pearson weight passed to ``PearsonCosineLoss``.

    Returns:
        ``(model, device)`` where ``model`` is the network refit on all of
        ``X``/``y`` (left in eval mode) and ``device`` is where it lives.

    Train, validation and full-data metrics are printed; predictions are
    clipped to [0, 5] before scoring.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    loss_fn = PearsonCosineLoss(alpha=alpha)

    # Split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_ratio, random_state=42)
    X_train_t = torch.FloatTensor(X_train).to(device)
    y_train_t = torch.FloatTensor(y_train).to(device)
    X_val_t = torch.FloatTensor(X_val).to(device)

    # Model fitted on the training split only
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    _train_minibatch(model, loss_fn, optimizer, X_train_t, y_train_t,
                     epochs, batch_size, log_every=20)

    # Print metrics on split data
    train_metrics = calculate_metrics(y_train, _predict_clipped(model, X_train_t))
    val_metrics = calculate_metrics(y_val, _predict_clipped(model, X_val_t))
    print("\nTrain Metrics:", train_metrics)
    print("Validation Metrics:", val_metrics)

    # Retrain on full data with the *same* mini-batch schedule that was just
    # validated; a single full-batch step per epoch would give the final model
    # far fewer parameter updates than the one the metrics above describe.
    model = HyperbolicModel(X.shape[1], y.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    X_full_t = torch.FloatTensor(X).to(device)
    y_full_t = torch.FloatTensor(y).to(device)
    _train_minibatch(model, loss_fn, optimizer, X_full_t, y_full_t, epochs, batch_size)

    # Full-data predictions are clipped like the split predictions so all
    # three metric reports are computed on the same footing.
    full_metrics = calculate_metrics(y, _predict_clipped(model, X_full_t))
    print("\nMetrics on Full Data:", full_metrics)
    return model, device
# Run training and print metrics
# Coerce the training tables the same way as the test table below, so that
# non-numeric cells and NaNs are zero-filled on both sides.
X_train_arr = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y_train_arr = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
model, device = train_and_evaluate(
    X_train_arr,
    y_train_arr,
    test_ratio=0.2,
    epochs=100,
    lr=1e-3,
    batch_size=32,
    alpha=0.5
)

# Convert all columns to numeric (with NaNs where conversion failed).
# pd.DataFrame(...) also accepts an ndarray, so this keeps working if another
# cell has already rebound X_test to a NumPy array.
X_test_numeric = pd.DataFrame(X_test).apply(pd.to_numeric, errors='coerce')

# Fill NaN values (e.g., with 0 or other imputation strategy)
X_test_filled = X_test_numeric.fillna(0)

# Convert to numpy array of floats
X_test_arr = X_test_filled.to_numpy(dtype=np.float32)
model.eval()
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test_arr).to(device)
    test_preds = model(X_test_tensor).cpu().numpy()
test_preds = np.clip(test_preds, 0.0, 5.0)

# Prediction columns are matched to target_cols by position, so the widths
# must agree or the submission would be silently mislabeled/truncated.
if test_preds.shape[1] != len(target_cols):
    raise ValueError(
        f"Model produced {test_preds.shape[1]} outputs but target_cols has "
        f"{len(target_cols)} names"
    )

# Save submission
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]

test_submission.to_csv('hyperbolic_task2_selectkbest.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Epoch 20/100: Train Loss = 0.5199
Epoch 40/100: Train Loss = 0.4223
Epoch 60/100: Train Loss = 0.3240
Epoch 80/100: Train Loss = 0.0966
Epoch 100/100: Train Loss = 0.2311
\nTrain Metrics: {'MSE': 0.10640577971935272, 'MAE': 0.233772873878479, 'R2': -0.10602099448442459, 'Avg Pearson Correlation': np.float64(0.5281890230779168), 'Avg Cosine Similarity': np.float32(0.7934581)}
Validation Metrics: {'MSE': 0.12370293587446213, 'MAE': 0.24841797351837158, 'R2': -0.3320320248603821, 'Avg Pearson Correlation': np.float64(0.4943663973253342), 'Avg Cosine Similarity': np.float32(0.76316524)}
\nMetrics on Full Data: {'MSE': 0.11923715472221375, 'MAE': 0.2495933473110199, 'R2': -0.1310190111398697, 'Avg Pearson Correlation': np.float64(0.29612302141281827), 'Avg Cosine Similarity': np.float32(0.71252275)}
Test predictions saved.
Test submission shape: (130, 52)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Custom metrics ---
def pearson_correlation_score(y_true, y_pred):
    """Average the per-column Pearson correlation of two 2-D arrays.

    A column whose correlation evaluates to NaN contributes 0 to the average.
    """
    def column_corr(col):
        r = np.corrcoef(y_true[:, col], y_pred[:, col])[0, 1]
        return 0 if np.isnan(r) else r

    return np.mean([column_corr(col) for col in range(y_true.shape[1])])

def cosine_similarity_score(y_true, y_pred):
    """Average the row-wise cosine similarity between two 2-D arrays."""
    def unit(vec):
        # 1e-8 keeps an all-zero row from dividing by zero.
        return vec / (np.linalg.norm(vec) + 1e-8)

    row_cosines = [
        np.dot(unit(y_true[row]), unit(y_pred[row]))
        for row in range(y_true.shape[0])
    ]
    return np.mean(row_cosines)

def calculate_metrics(y_true, y_pred):
    """Evaluate every tracked metric and return them as a name -> value dict."""
    metric_fns = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R2", r2_score),
        ("Avg Pearson Correlation", pearson_correlation_score),
        ("Avg Cosine Similarity", cosine_similarity_score),
    )
    return {label: fn(y_true, y_pred) for label, fn in metric_fns}

# --------- Random-forest workflow ---------
# Inputs expected from earlier cells:
# x_df_lasso: DataFrame of training features; y_df: DataFrame of targets
# X_test: test features (DataFrame, or the array left by an earlier run of this cell)
# test_form: DataFrame with a 'stimulus' column (presumably row-aligned with X_test)
# target_cols: list of target column names

# 1. Convert to numpy, coercing non-numeric cells to NaN and zero-filling them
X = x_df_lasso.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)
y = y_df.apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# Same coercion for the test features so train and test are treated alike.
# Wrapping in pd.DataFrame keeps the cell re-runnable once X_test is an array.
X_test = pd.DataFrame(X_test).apply(pd.to_numeric, errors='coerce').fillna(0).to_numpy(dtype=np.float32)

# 2. 80/20 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train MultiOutput RandomForest
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf.fit(X_train, y_train)

# 4. Print metrics on splits
y_train_pred = rf.predict(X_train)
y_val_pred = rf.predict(X_val)
y_train_pred = np.clip(y_train_pred, 0.0, 5.0)
y_val_pred = np.clip(y_val_pred, 0.0, 5.0)
print("Training metrics:")
print(calculate_metrics(y_train, y_train_pred))
print("\nValidation metrics:")
print(calculate_metrics(y_val, y_val_pred))

# 5. Retrain on entire dataset, print metrics
rf_full = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1))
rf_full.fit(X, y)
y_full_pred = rf_full.predict(X)
y_full_pred = np.clip(y_full_pred, 0.0, 5.0)
print("\nFull dataset metrics:")
print(calculate_metrics(y, y_full_pred))

# 6. Predict on test set, save file
test_preds = rf_full.predict(X_test)
test_preds = np.clip(test_preds, 0.0, 5.0)
test_submission = test_form[['stimulus']].copy()
for i, col in enumerate(target_cols):
    test_submission[col] = test_preds[:, i]
test_submission.to_csv('rf_corr_cosine_metrics_test_task2_selectkbest.csv', index=False)
print("Test predictions saved.")
print(f"Test submission shape: {test_submission.shape}")


Training metrics:
{'MSE': 0.010457421466746362, 'MAE': 0.06284540379754484, 'R2': 0.8944068177952577, 'Avg Pearson Correlation': np.float64(0.9659380393732574), 'Avg Cosine Similarity': np.float64(0.9762317650466045)}

Validation metrics:
{'MSE': 0.05917115803051844, 'MAE': 0.15694276661235151, 'R2': 0.3462292577074942, 'Avg Pearson Correlation': np.float64(0.6098647787917398), 'Avg Cosine Similarity': np.float64(0.8395876682714307)}

Full dataset metrics:
{'MSE': 0.009739784831769193, 'MAE': 0.06068153224426844, 'R2': 0.8996719834111578, 'Avg Pearson Correlation': np.float64(0.9652711458305993), 'Avg Cosine Similarity': np.float64(0.977607951980975)}
Test predictions saved.
Test submission shape: (130, 52)
