In [11]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed
import numpy as np
from tqdm import tqdm

# Master seed for the notebook. Seeding NumPy's global RNG here makes the
# np.random draws in later cells reproducible on a fresh top-to-bottom run.
SEED = 42
np.random.seed(SEED)

In [12]:
# Read the training table and the held-out test table from disk.
# Both are consumed later by the simulation functions, which expect a
# 'CARAVAN' target column in each.
train_rose, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_rose.csv", "data/test.csv")
)

In [13]:
# Draw n distinct per-simulation seeds from the integers a..b (inclusive).
# A private RandomState seeded with SEED is used instead of the global RNG so
# that this cell is idempotent: re-running it on its own always produces the
# same SEEDS, independent of whatever has consumed np.random in the meantime.
# RandomState is the legacy generator behind np.random.seed / np.random.choice,
# so the stream matches a fresh global seeding with the same SEED.
n = 1000
a = 0
b = n*3
SEEDS = np.random.RandomState(SEED).choice(np.arange(a, b + 1), n, replace=False)

# Simulation params for each SEED
train_perc = 0.8  # fraction of train_rose kept for training; the rest is the validation split
boot_perc = 0.8   # size of each bootstrap sample, as a fraction of the training split
boot_n = 20       # number of bootstrap models fitted per seed

In [14]:
def _run_bagged_simulation(SEED, label, threshold=None):
    """Fit a bagged logistic-regression ensemble for one seed and save its predictions.

    Relies on notebook-level globals: ``train_rose``, ``test``, ``train_perc``,
    ``boot_perc`` and ``boot_n``.

    Steps:
      1. Split ``train_rose`` into train/validation with ``train_test_split``
         (``train_size=train_perc``, ``random_state=SEED``).
      2. Fit ``boot_n`` ``LogisticRegression`` models, each on a bootstrap
         sample (drawn with replacement) of ``round(boot_perc * len(train))`` rows.
      3. Average the positive-class probabilities across models for the
         validation split and for ``test``.
      4. If ``threshold`` is given, turn the averaged probability into a 0/1
         label via ``mean > threshold``; otherwise keep the probability.
      5. Write ``simulations/{label}_{SEED}_val.csv`` and
         ``simulations/{label}_{SEED}_test.csv`` with columns
         ``Prediction`` and ``True``.

    Parameters
    ----------
    SEED : int
        Seed controlling the split, the bootstrap draws and the solver.
    label : str
        Prefix used in the output file names (e.g. ``"HARD"`` or ``"SOFT"``).
    threshold : float or None, default None
        Cut-off applied to the averaged probability; ``None`` saves the
        averaged probability itself.

    Returns
    -------
    None
    """
    import os  # function-scope import keeps the helper self-contained in joblib workers

    target = 'CARAVAN'

    # Private generator: same legacy stream as np.random.seed(SEED) followed by
    # np.random.choice, but the process-wide RNG state is left untouched.
    rng = np.random.RandomState(SEED)

    # Split data
    train, val = train_test_split(train_rose, train_size=train_perc, random_state=SEED)

    # Separate features and target once; these do not change between bootstrap rounds.
    X_train, y_train = train.drop(columns=[target]), train[target]
    X_val, y_val = val.drop(columns=[target]), val[target]
    X_test, y_test = test.drop(columns=[target]), test[target]

    # Prepare bootstrap samples
    boot_l = int(round(boot_perc * len(train)))
    y_pred_val = np.zeros((len(val), boot_n))
    y_pred_test = np.zeros((len(test), boot_n))

    for j in range(boot_n):
        # Draw row positions (not index labels) so that duplicated labels in
        # the frame's index could never inflate or distort the sample.
        rows = rng.choice(len(train), size=boot_l, replace=True)

        # Train model
        model = LogisticRegression(solver='liblinear', random_state=SEED)
        model.fit(X_train.iloc[rows], y_train.iloc[rows])

        # Predict: column 1 of predict_proba is the second entry of model.classes_
        y_pred_val[:, j] = model.predict_proba(X_val)[:, 1]
        y_pred_test[:, j] = model.predict_proba(X_test)[:, 1]

    # Compute final predictions
    final_pred_val = y_pred_val.mean(axis=1)
    final_pred_test = y_pred_test.mean(axis=1)
    if threshold is not None:
        final_pred_val = (final_pred_val > threshold).astype(int)
        final_pred_test = (final_pred_test > threshold).astype(int)

    # Create the output folder up front; exist_ok makes this safe when several
    # workers reach this line at the same time.
    os.makedirs("simulations", exist_ok=True)

    # Save predictions next to the true values. The labels are passed as a
    # plain array so the pairing with the predictions is explicitly positional.
    for split, pred, truth in (("val", final_pred_val, y_val),
                               ("test", final_pred_test, y_test)):
        pd.DataFrame([pred, truth.to_numpy()]).T.to_csv(
            f"simulations/{label}_{SEED}_{split}.csv",
            index=False,
            header=['Prediction', 'True'],
        )


def simulate_hard(SEED):
    """Run one bagged simulation for ``SEED`` and save 0/1 class predictions.

    The averaged positive-class probability of the ensemble is thresholded
    at 0.5. Output goes to ``simulations/HARD_{SEED}_val.csv`` and
    ``simulations/HARD_{SEED}_test.csv``.

    NOTE(review): "hard" here means thresholding the *mean probability*, not a
    per-model majority vote — confirm this is the intended definition.
    """
    _run_bagged_simulation(SEED, label="HARD", threshold=0.5)


def simulate_soft(SEED):
    """Run one bagged simulation for ``SEED`` and save averaged probabilities.

    The ensemble's mean positive-class probability is stored as-is. Output
    goes to ``simulations/SOFT_{SEED}_val.csv`` and
    ``simulations/SOFT_{SEED}_test.csv``.
    """
    _run_bagged_simulation(SEED, label="SOFT", threshold=None)

In [15]:
# Parallel simulation: push every seed through both ensemble variants.
# One loop replaces the two copy-pasted calls. Because the Parallel call is no
# longer the cell's trailing expression, its return value (one entry per seed)
# is not echoed as cell output.
# The progress bar advances as joblib pulls items from the iterator (i.e. as
# tasks are dispatched), so it can run slightly ahead of completed work.
for label, simulate in (("HARD", simulate_hard), ("SOFT", simulate_soft)):
    print(f"Running {n} simulations of {label} for each SEED...")
    Parallel(n_jobs=-1)(delayed(simulate)(SEED) for SEED in tqdm(SEEDS, desc="Simulating"))

Running 1000 simulations of HARD for each SEED...



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Simulating: 100%|██████████| 1000/1000 [09:03<00:00,  1.84it/s]


Running 1000 simulations of SOFT for each SEED...



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

KeyboardInterrupt: 