In [14]:
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import BayesianRidge
import numpy as np
import pandas as pd
pd.options.mode.copy_on_write = True

In [15]:
train = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv")

In [19]:
train['Deck'] = train['Cabin'].astype(str).str[0]

In [20]:
train['Deck'].value_counts()

Deck
n    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: count, dtype: int64

In [16]:
X = train[['Age','Pclass','Fare']]
y = train['Deck']

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# --- 1. Create a Realistic Sample Dataset ---
# We need X, an intermediate target with missing values, and a final target.
X_full, y_final = make_regression(n_samples=1000, n_features=20, n_informative=15, noise=25, random_state=42)
X_full = pd.DataFrame(X_full, columns=[f'feature_{i}' for i in range(20)])

# Create the intermediate variable, making it related to some features and the final target
y_intermediate = (0.5 * X_full['feature_2'] + 0.3 * X_full['feature_5'] + 0.2 * y_final + np.random.normal(0, 5, 1000))
y_intermediate = pd.Series(y_intermediate, name='intermediate_target')
y_final = pd.Series(y_final, name='final_target')

# Introduce 80% missing values into the INTERMEDIATE target
y_intermediate.loc[y_intermediate.sample(frac=0.8, random_state=42).index] = np.nan

# Define which columns are "correlated" and will be used for imputation
correlated_features = ['feature_2', 'feature_5', 'feature_8', 'feature_10']
X_corr = X_full[correlated_features]

print(f"Full feature set shape: {X_full.shape}")
print(f"Correlated feature set shape: {X_corr.shape}")
print(f"Intermediate targets missing: {y_intermediate.isna().sum()}")
print("-" * 50)


# --- 2. Define Models and Hyperparameter Grid ---
# The model for the final prediction
main_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# A simple imputer for the correlated features (in case they also have missing values)
feature_imputer = SimpleImputer(strategy='mean')

# Hyperparameters TO TUNE for the INTERMEDIATE TARGET IMPUTATION model
param_grid = {
    'n_neighbors': [3, 7, 15],
    'weights': ['uniform', 'distance']
}

# --- 3. Manual Hyperparameter Tuning with Nested Cross-Validation ---
results = []
# Outer loop: Iterate through each hyperparameter combination
for params in ParameterGrid(param_grid):
    print(f"Testing params for imputation model: {params}")
    fold_scores = []
    
    # Inner loop: 5-Fold Cross-Validation
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    # Note: We split the original full dataframe indices to keep everything aligned
    for train_idx, test_idx in cv.split(X_full):
        
        # --- Create data splits for this fold ---
        # Correlated features for imputation
        X_train_corr, X_test_corr = X_corr.iloc[train_idx], X_corr.iloc[test_idx]
        # Full features for final model
        X_train_full, X_test_full = X_full.iloc[train_idx], X_full.iloc[test_idx]
        # Intermediate and final targets
        y_train_intermediate, y_test_intermediate = y_intermediate.iloc[train_idx], y_intermediate.iloc[test_idx]
        y_train_final, y_test_final = y_final.iloc[train_idx], y_final.iloc[test_idx]
        
        # --- Imputation Step (on the training data) ---
        # a. Isolate the part of the training data where the intermediate target is known
        X_train_corr_known_y = X_train_corr[y_train_intermediate.notna()]
        y_train_intermediate_known_y = y_train_intermediate[y_train_intermediate.notna()]

        # b. Fit the feature imputer ONLY on this known data subset
        feature_imputer.fit(X_train_corr_known_y)
        
        # c. Train the target imputation model
        target_imputer_model = KNeighborsRegressor(**params)
        target_imputer_model.fit(feature_imputer.transform(X_train_corr_known_y), y_train_intermediate_known_y)
        
        # d. Impute missing intermediate values for both train and test sets
        imputed_y_train = target_imputer_model.predict(feature_imputer.transform(X_train_corr))
        imputed_y_test = target_imputer_model.predict(feature_imputer.transform(X_test_corr))

        # --- Final Model Training and Evaluation ---
        # e. Create the "enhanced" feature sets by adding the imputed variable
        X_train_enhanced = X_train_full.copy()
        X_train_enhanced['imputed_feature'] = imputed_y_train
        
        X_test_enhanced = X_test_full.copy()
        X_test_enhanced['imputed_feature'] = imputed_y_test
        
        # f. Train the final model on the enhanced training data to predict the final target
        main_model.fit(X_train_enhanced, y_train_final)
        
        # g. Make predictions on the enhanced test set
        y_pred_final = main_model.predict(X_test_enhanced)
        
        # h. Evaluate final model performance and store the score
        rmse = np.sqrt(mean_squared_error(y_test_final, y_pred_final))
        fold_scores.append(rmse)

    # Calculate average score for the tested parameters
    avg_score = np.mean(fold_scores)
    results.append({'params': params, 'score': avg_score})
    print(f"  -> Average Final Model RMSE: {avg_score:.4f}\n")


# --- 4. Find and Display the Best Parameters ---
best_result = min(results, key=lambda x: x['score'])

print("-" * 50)
print(f"🏆 Best Hyperparameters for Imputation: {best_result['params']}")
print(f"🏆 Best Resulting Cross-Validated RMSE: {best_result['score']:.4f}")