<a href="https://colab.research.google.com/github/alvinfranklyndavis/Project2023_v3/blob/main/Copy_of_Project2023_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# STEP 1. INSTALL PACKAGES AND IMPORT DATA

# Upgrade pip and install required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas gdown numpy matplotlib scikit-learn xgboost shap

# Import necessary libraries
import pandas as pd
import gdown
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer
import shap

# Set up logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the Google Drive file ID
file_id = '1Aa4qkI_xffvqUve1PG-CSMQgVc617qL9'

# Define the URL of the CSV file
csv_url = f'https://drive.google.com/uc?id={file_id}'

# Define the local file path to save the CSV
csv_path = 'Training_Testing_Hybrid_MA.csv'

# Log the start of the dataset download
logger.info("Downloading the dataset...")

# Download the CSV file from the Google Drive link
gdown.download(csv_url, csv_path, quiet=False)

# Log the successful download
logger.info("Dataset downloaded successfully.")

# Read your Original dataset
logger.info("Reading the dataset...")
data = pd.read_csv("Training_Testing_Hybrid_MA.csv")  # Update with your dataset path
logger.info("Dataset loaded successfully.")


[0m

Downloading...
From: https://drive.google.com/uc?id=1Aa4qkI_xffvqUve1PG-CSMQgVc617qL9
To: /content/Training_Testing_Hybrid_MA.csv
100%|██████████| 93.1k/93.1k [00:00<00:00, 57.3MB/s]


In [16]:
print(data.columns)


Index(['Day of the Week', 'Morning', 'Prev_Week', 'Rep_Prev_Week',
       'Prev_Entry', 'Rep_Prev_Entry', 'Mov_Avg_Mor', 'Afternoon',
       'Prev_Week.1', 'Rep_Prev_Week.1', 'Prev_Entry.1', 'Rep_Prev_Entry.1',
       'Mov_Avg_Aft', 'Evening', 'Prev_Week.2', 'Rep_Prev_Week.2',
       'Prev_Entry.2', 'Rep_Prev_Entry.2', 'Mov_Avg_Eve', 'Night',
       'Prev_Week.3', 'Rep_Prev_Week.3', 'Prev_Entry.3', 'Rep_Prev_Entry.3',
       'Mov_Avg_Nig', 'Year', 'Month', 'Day', 'Prediction1'],
      dtype='object')


In [19]:
# STEP 2. PROCESS DATE FEATURES AND SET UP PREDICTION1 COLUMN

# Extract Date features
# 'Date' has already been dropped, and 'Year', 'Month', 'Day' have been extracted
# data['Date'] = pd.to_datetime(data['Date'])
# data = data.drop(columns=['Date'])  # No need to drop 'Date' again

# Set up logging
import logging
logger = logging.getLogger(__name__)

# Log the start of date feature extraction
logger.info("Processing date features...")

# Display data types and check for missing values
logger.info("Data types:\n%s", data.dtypes)
logger.info("Missing values:\n%s", data.isnull().sum())

# Calculate Moving Averages for specified columns
# Define the window size for the moving average
window_size = 3  # You can adjust this as needed

# Columns to calculate moving averages for and their corresponding target columns
columns_to_average = ['Morning', 'Afternoon', 'Evening', 'Night']
target_columns = ['Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve', 'Mov_Avg_Nig']

# Initialize a dictionary to keep track of the window size for each column
window_sizes = {col: 1 for col in columns_to_average}

# Calculate the moving averages and fill the target columns iteratively
for index, row in data.iterrows():
    for col, target_col in zip(columns_to_average, target_columns):
        window_size = window_sizes[col]
        current_value = row[col]
        current_avg = row[target_col]

        # Calculate the updated moving average
        updated_avg = ((window_size - 1) * current_avg + current_value) / window_size

        # Update the target column with the new moving average
        data.at[index, target_col] = updated_avg

        # Increment the window size for the current column
        window_sizes[col] += 1

# Create Target Variable Column for Prediction1
data['Prediction1'] = np.nan

# Keep only relevant columns for Prediction1
selected_columns_p1 = ['Morning', 'Prev_Week', 'Prev_Entry','Mov_Avg_Mor','Year', 'Month', 'Day', 'Prediction1']
data_p1 = data[selected_columns_p1]

# Save the train dataframe to a CSV file
logger.info("Saving the Prediction1 data to CSV...")
data_p1.to_csv('/content/train_data_prediction1.csv', index=False)

# Log the completion of date feature processing
logger.info("Date features processed successfully.")

# Display the first few rows of Prediction1 data
logger.info("First few rows of Prediction1 data:\n%s", data_p1.head())

# Print Data Types if needed
logger.info("Data types after conversion:\n%s", data.dtypes)


In [26]:
# STEP 3.1. LOAD TRAIN DATAFRAME FOR PREDICTION1

# Load the train dataframe
train_data = pd.read_csv('/content/train_data_prediction1.csv')

# Log the start of loading the train dataframe
logger.info("Loading train dataframe for Prediction1...")

# Separate features and create a placeholder for the target variable for Prediction1
X_p1 = train_data.drop('Prediction1', axis=1)  # Features
y_p1_placeholder = np.full(X_p1.shape[0], np.nan)  # Placeholder for target variable 'Prediction1'

# Log the completion of loading the train dataframe
logger.info("Train dataframe for Prediction1 loaded successfully.")

# STEP 3.2. SPLIT DATA INTO TRAINING AND TESTING SETS FOR PREDICTION1

# Set a random seed for reproducibility
random_seed = 42  # You can use any integer value

# Split the data into training and testing sets for Prediction1
X_train_p1, X_test_p1, y_train_p1_placeholder, y_test_p1_placeholder = train_test_split(
    X_p1,  # Features for 'Prediction1'
    y_p1_placeholder,  # Placeholder for target variable 'Prediction1'
    test_size=0.2,
    random_state=random_seed  # Set random seed
)

# Log the completion of splitting data for Prediction1
logger.info("Data split into training and testing sets for Prediction1.")

# STEP 3.3. HANDLE MISSING VALUES IN THE TARGET VARIABLE FOR PREDICTION1

# Check missing values in the target variable for Prediction1
logger.info("Missing values in Prediction1 target variable:\n%s", np.isnan(y_train_p1_placeholder).sum())
# Check for missing values in the feature data for Prediction1
logger.info("Missing values in feature data for Prediction1:")
missing_values_train = X_train_p1.isnull().sum()
missing_values_test = X_test_p1.isnull().sum()

logger.info("Training set missing values:\n%s", missing_values_train)
logger.info("Testing set missing values:\n%s", missing_values_test)


# STEP 3.4. TRAIN RANDOM FOREST REGRESSOR MODEL FOR PREDICTION1

# Print the size of X_train_p1 and y_train_p1 before model training
print("Size of X_train_p1:", X_train_p1.shape)
print("Size of y_train_p1:", y_train_p1.shape)

# ... previous code ...

try:
    # Train a RandomForestRegressor model
    model_p1 = RandomForestRegressor(random_state=42)
    model_p1.fit(X_train_p1, y_train_p1)

    # Make predictions on the test set
    y_pred_p1 = model_p1.predict(X_test_p1)

    # Apply numpy.clip to constrain predictions within the desired range
    lower_bound = 1
    upper_bound = 36
    y_pred_p1_clipped = np.clip(y_pred_p1, lower_bound, upper_bound)

    # Evaluate the model using clipped predictions
    mse_p1 = mean_squared_error(y_test_p1, y_pred_p1_clipped)
    logger.info("Mean Squared Error for Prediction1 (Clipped):\n%s", mse_p1)
    print("Mean Squared Error for Prediction1 (Clipped):", mse_p1)

except Exception as e:
    logger.error("Error during model training for Prediction1: %s", e)
    print("Error during model training or evaluation:", e)


ERROR:__main__:Error during model training for Prediction1: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


Size of X_train_p1: (1127, 7)
Size of y_train_p1: (1127,)
Error during model training or evaluation: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [None]:
# Check the column names in X_p1
print(X_p1.columns)

In [None]:
# STEP 4.1. FEATURE ANALYSIS

import matplotlib.pyplot as plt
import seaborn as sns

# Check the column names in X_p1
print(X_p1.columns)

# Visualize the distributions of the actual features
plt.figure(figsize=(12, 6))
sns.histplot(X_p1['Morning'], bins=30, kde=True, color='blue', label='Morning')
sns.histplot(X_p1['Rep_Prev_Week'], bins=30, kde=True, color='orange', label='Rep_Prev_Week')
sns.histplot(X_p1['Rep_Prev_Entry'], bins=30, kde=True, color='blue', label='Rep_Prev_Entry')
sns.histplot(X_p1['Year'], bins=30, kde=True, color='orange', label='Year')
sns.histplot(X_p1['Month'], bins=30, kde=True, color='blue', label='Month')
sns.histplot(X_p1['Day'], bins=30, kde=True, color='orange', label='Day')
plt.title('Distribution of Actual Features')
plt.xlabel('Feature Values')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Calculate key statistics for each feature
for feature in X_p1.columns:
    feature_stats = X_p1[feature].describe()
    print(f"{feature} Statistics:\n", feature_stats)

# STEP 4.2. KNN PARAMETER REVIEW

from scipy.stats import zscore

# Calculate Z-scores for each feature
for feature in X_p1.columns:
    X_p1[f"{feature}_zscore"] = zscore(X_p1[feature])

print("Columns of X_p1:\n", X_p1.columns)

# Identify outliers using a threshold (e.g., Z-score > 3 or < -3)
outliers = pd.DataFrame()
for feature in ['Feature1', 'Feature2']:
    outliers_feature = X_p1[X_p1[f"{feature}_zscore"].abs() > 3]
    outliers = pd.concat([outliers, outliers_feature], axis=0)

# Display the identified outliers
print("Outliers:\n", outliers)

# Remove the temporary Z-score columns
X_p1 = X_p1.drop([f"{feature}_zscore" for feature in ['Feature1', 'Feature2']], axis=1)

# STEP 4.3. ADAPTED APPROACH FOR REAL WORLD CONSTRAINTS

from sklearn.impute import KNNImputer

# Function to experiment with KNN imputation and visualize results
def knn_imputation_experiment(k_value):
    imputer = KNNImputer(n_neighbors=k_value)
    imputed_data = imputer.fit_transform(X_p1[['Feature1', 'Feature2']])

    # Visualize the imputed distributions
    plt.figure(figsize=(12, 6))
    sns.histplot(imputed_data[:, 0], bins=30, kde=True, color='blue', label='Imputed Feature1')
    sns.histplot(imputed_data[:, 1], bins=30, kde=True, color='orange', label='Imputed Feature2')
    plt.title(f'Imputed Distributions (k={k_value})')
    plt.xlabel('Imputed Feature Values')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

# Experiment with different values of k (e.g., 3, 5, 7)
for k_value in [3, 5, 7]:
    knn_imputation_experiment(k_value)


In [None]:
# Step 2: Handling Missing Values for all Predictions

# Combine DataFrames for easy iteration
prediction_dfs = [data_p1, data_p2, data_p3, data_p4]

for i, data_p in enumerate(prediction_dfs, start=1):
    # Print Error: Confirm data types and check for unexpected values
    print(f"Data types for Prediction{i}:\n", data_p.dtypes)
    print("Unique values in all columns for Prediction1:\n", data_p1.apply(lambda x: x.unique()))
    print("Unique values in all columns for Prediction2:\n", data_p1.apply(lambda x: x.unique()))
    print("Unique values in all columns for Prediction3:\n", data_p1.apply(lambda x: x.unique()))
    print("Unique values in all columns for Prediction4:\n", data_p1.apply(lambda x: x.unique()))

    # Print Error: Check for missing values before handling
    print(f"Missing values before handling Prediction{i}:\n", data_p.isnull().sum())

# Print columns before imputation
print("Columns before imputation:", data_p.columns)

# Impute missing values using a strategy (e.g., mean, median, or a constant)
imputer = SimpleImputer(strategy='mean')

# Try imputation and print columns after imputation
try:
    data_p_imputed = pd.DataFrame(imputer.fit_transform(data_p), columns=data_p.columns)
    print("Columns after imputation:", data_p_imputed.columns)
except Exception as e:
    print(f"Error during imputation: {e}")

    # Update the original DataFrame with imputed values
    prediction_dfs[i-1] = data_p_imputed

    # Print Error: Confirm missing values are handled
    print(f"Missing values after handling Prediction{i}:\n", data_p_imputed.isnull().sum())

# Retrieve updated DataFrames after handling missing values
train_data_imputed = data_p_imputed
test_data_imputed = test_data  # Assuming you want to keep the original test_data as is


In [None]:
# Step 3: Data Splitting

# Data Splitting for each Prediction
test_size = 0.2  # 80% train / 20% test split

train_dfs = []  # List to store training dataframes
test_dfs = []   # List to store testing dataframes

for i, data_p in enumerate(prediction_dfs, start=1):
    # Split data into train and test sets
    train_data, test_data = train_test_split(data_p, test_size=test_size, shuffle=False)

    # Append the dataframes to the respective lists
    train_dfs.append(train_data)
    test_dfs.append(test_data)

    # Save the train and test dataframes to CSV files
    train_data.to_csv(f'train_data_prediction{i}.csv', index=False, columns=data_p.columns)
    test_data.to_csv(f'test_data_prediction{i}.csv', index=False, columns=data_p.columns)

    # Print Error: Confirm the shapes of train and test sets
    print(f"Train set shape for Prediction{i}: {train_data.shape}")
    print(f"Test set shape for Prediction{i}: {test_data.shape}")



In [None]:
# Step 4: Model Training

for i, data_p in enumerate(prediction_dfs, start=1):
    # Load train set
    train_data = pd.read_csv(f'train_data_prediction{i}.csv')

    # Print Error: Confirm column names in the DataFrame
    print(f"Columns in train_data for Prediction{i}:", train_data.columns)

    print(f"Columns in train_data for Prediction{i} before dropping the target column:", train_data.columns)

    # Separate features and target variable
    target_column = f'Prediction{i}'
    if target_column not in train_data.columns:
        print(f"Target column '{target_column}' not found. Please check your data preparation steps.")
    else:
        X_train = train_data.drop(columns=[target_column])
        y_train = train_data[target_column]

        print(f"Columns in train_data for Prediction{i} after dropping the target column:", X_train.columns)

        # Initialize the model
        model = RandomForestRegressor(random_state=42)

        # Train the model
        model.fit(X_train, y_train)

        # Save the model
        joblib.dump(model, f'model_prediction{i}.joblib')

        # Print Error: Confirm model is trained
        print(f"Model for Prediction{i} is trained.")


In [None]:
# Step 5
for i, data_p in enumerate(test_dfs, start=1):
    # Load test set
    test_data = pd.read_csv(f'test_data_prediction{i}.csv')

    # Print Error: Confirm column names in the DataFrame
    print(f"Columns in test_data for Prediction{i}:", test_data.columns)

    # Separate features
    X_test = test_data.drop(columns=['Prediction{i}'])

    # Load the trained model
    model = joblib.load(f'model_prediction{i}.joblib')

    # Make predictions
    predictions = model.predict(X_test)

    # Assuming 'predictions' is a NumPy array, update the test_data DataFrame with the predictions
    test_data[f'Prediction{i}'] = predictions

    # Save the predictions to a CSV file or perform further analysis
    test_data.to_csv(f'predictions_prediction{i}.csv', index=False)

    # Print a message indicating successful prediction
    print(f"Predictions for Prediction{i} are saved.")



In [None]:
# Check for missing values
missing_values = df.isnull().sum()

# Display the count of missing values for each column
print("Missing Values:\n", missing_values)

# Display the count of missing values in the training sets
print("Missing Values in X_train:\n", X_train.isnull().sum())
print("\nMissing Values in y_train:\n", y_train.isnull().sum())


In [None]:
print("y_train shape:", y_train.shape)


In [None]:
try:
    # Import necessary libraries
    from sklearn.ensemble import RandomForestRegressor, VotingRegressor
    from xgboost import XGBRegressor
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import mean_squared_error, r2_score
    import numpy as np
except Exception as e:
    print(f"Error during library import: {e}")


In [None]:
try:
    # Initialize individual models
    rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

    # Fit each individual model for Prediction1
    rf_model.fit(X_train, y_train['Prediction1'])
    xgb_model.fit(X_train, y_train['Prediction1'])

    # Fit each individual model for Prediction2
    rf_model.fit(X_train, y_train['Prediction2'])
    xgb_model.fit(X_train, y_train['Prediction2'])

    # Fit each individual model for Prediction3
    rf_model.fit(X_train, y_train['Prediction3'])
    xgb_model.fit(X_train, y_train['Prediction3'])

    # Fit each individual model for Prediction4
    rf_model.fit(X_train, y_train['Prediction4'])
    xgb_model.fit(X_train, y_train['Prediction4'])
except Exception as e:
    print(f"Error during model initialization and fitting: {e}")


In [None]:
try:
    # Create a VotingRegressor with the specified models
    ensemble_model = VotingRegressor(estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model)
    ])
except Exception as e:
    print(f"Error during ensemble model creation: {e}")


In [None]:
try:
    # Use the individual models to create inputs for the ensemble model
    rf_predictions = rf_model.predict(X_train)
    xgb_predictions = xgb_model.predict(X_train)
    ensemble_X_train = np.column_stack((rf_predictions, xgb_predictions))
except Exception as e:
    print(f"Error during prediction and stacking: {e}")


In [None]:
try:
    # Ensure y_train_ensemble has the same number of elements as X_train
    # Flatten y_train to ensure consistency
    y_train_ensemble = y_train[['Prediction1', 'Prediction2', 'Prediction3', 'Prediction4']].values.ravel()

    # Check if the sizes match
    if ensemble_X_train.shape[0] != len(y_train_ensemble):
        # Handle the size mismatch (e.g., by truncating or padding)
        min_size = min(ensemble_X_train.shape[0], len(y_train_ensemble))
        ensemble_X_train = ensemble_X_train[:min_size, :]
        y_train_ensemble = y_train_ensemble[:min_size]
except Exception as e:
    print(f"Error during data preparation: {e}")


In [None]:
try:
    # Fit the ensemble model
    ensemble_model.fit(ensemble_X_train, y_train_ensemble)

    print("Shape of ensemble_X_train:", ensemble_X_train.shape)
    print("Length of y_train_ensemble:", len(y_train_ensemble))
except Exception as e:
    print(f"Error during ensemble model fitting: {e}")


In [None]:
try:
    # Import KFold for cross-validation
    from sklearn.model_selection import KFold
except Exception as e:
    print(f"Error during KFold import: {e}")


In [None]:
try:
    def evaluate_model(model, X, y, cv=5):
        """
        Evaluate the performance of a predictive model.

        Parameters:
        - model: The predictive model to be evaluated.
        - X: The input features for evaluation.
        - y: The target variables for evaluation.
        - cv: Number of cross-validation folds.

        Returns:
        A dictionary containing evaluation metrics.
        """
        kf = KFold(n_splits=cv, shuffle=True, random_state=42)

        # Initialize evaluation metrics
        mse_5fold = []
        mse_10fold = []
        r2_scores = []

        # Loop through each target variable
        for i in range(y.shape[1]):
            # Initialize scores
            mse_5fold_i = []
            mse_10fold_i = []
            r2_scores_i = []

            # Perform cross-validation
            for train_idx, test_idx in kf.split(X):
                X_train_fold, X_test_fold = X[train_idx], X[test_idx]
                y_train_fold, y_test_fold = y[train_idx, i], y[test_idx, i]

                # Fit the model
                model.fit(X_train_fold, y_train_fold)

                # Predict on the test fold
                y_pred_fold = model.predict(X_test_fold)

                # Calculate MSE
                mse_fold = mean_squared_error(y_test_fold, y_pred_fold)
                if len(mse_5fold_i) < 5:
                    mse_5fold_i.append(mse_fold)
                mse_10fold_i.append(mse_fold)

                # R-squared score
                r2_fold = r2_score(y_test_fold, y_pred_fold)
                r2_scores_i.append(r2_fold)

            # Average scores over folds
            mse_5fold.append(np.mean(mse_5fold_i))
            mse_10fold.append(np.mean(mse_10fold_i))
            r2_scores.append(np.mean(r2_scores_i))

        return {
            '5-Fold Cross-Validation MSE': mse_5fold,
            '10-Fold Cross-Validation MSE': mse_10fold,
            'R-squared Score': r2_scores
        }
except Exception as e:
    print(f"Error during model evaluation function definition: {e}")


In [None]:
# Assuming 'Prediction1', 'Prediction2', 'Prediction3', 'Prediction4' are your target variable names
# and 'df' is your DataFrame

# Step 1: Check data types of keys
row_keys = ['Prediction1', 'Prediction2', 'Prediction3', 'Prediction4']
row_keys_data_types = [type(key) for key in row_keys]
print("Data types of keys:", row_keys_data_types)

# Step 2: Convert keys to int if needed
try:
    row_keys_as_int = [int(key[10:]) for key in row_keys]
except ValueError as e:
    print(f"Error converting keys to int: {e}")
    # Handle the error as needed

# Step 3: Check data type of the index
try:
    index_data_type = type(df.index[0])
except IndexError as e:
    print(f"Error accessing index: {e}")
    # Handle the error as needed

print("Data type of the index:", index_data_type)


In [None]:
# Step 4: Check for missing or invalid values in the index
try:
    missing_values = df.index.isnull().sum()  # Check for NaN values
    invalid_values = df.index[~df.index.isin(row_keys_as_int)]  # Check for values not in keys_as_int
except Exception as e:
    print(f"Error checking index values: {e}")
    # Handle the error as needed

# Step 5: Convert index values to integers
try:
    df.index = df.index.astype(int)
except Exception as e:
    print(f"Error converting index values to integers: {e}")
    # Handle the error as needed

print("Number of missing values in the index:", missing_values)
print("Invalid values in the index:", invalid_values)


In [None]:
try:
    # Evaluate the ensemble model
    ensemble_evaluation = evaluate_model(ensemble_model, ensemble_X_train, y_train.iloc[:, :2])

    # Display the evaluation metrics
    for metric, values in ensemble_evaluation.items():
        print(f"{metric}: {values}")

except Exception as e:
    print(f"Error during ensemble model evaluation: {e}")

    # Print specific rows in y_train using the problematic indices
    if isinstance(e, tuple) and len(e) == 2 and isinstance(e[0], np.ndarray) and e[1] == 0:
        problematic_indices = e[0]
        problematic_rows = y_train.iloc[problematic_indices[0], :2]
        print("Problematic Rows in y_train:")
        print(problematic_rows)

    else:
        print("Unable to retrieve problematic rows. Check the error type and message.")



In [None]:
try:
    # Step 1: Initial Training with warm_start
    # Train the RandomForestRegressor with a small number of trees using placeholder values
    # Adjust the number of trees and other hyperparameters as needed
    rf_model.fit(X_train, y_train.iloc[:, 0])

    # Evaluate its performance on the test set
    y_pred_initial = rf_model.predict(X_test.iloc[:, :20])  # Assuming the first target variable is Prediction1
    mse_initial = mean_squared_error(y_test.iloc[:, 0], y_pred_initial)
    print("Mean Squared Error after Initial Training:", mse_initial)
except Exception as e:
    print(f"Error during initial training and evaluation: {e}")


In [None]:
try:
    # Step 2: Transition to Two-Step Approach
    # Randomize the placeholder values for each target variable
    y_train_randomized = y_train.apply(np.random.permutation, axis=0)

    # Train the models on the randomized values
    rf_model.fit(X_train, y_train_randomized.iloc[:, 0])
    xgb_model.fit(X_train, y_train_randomized.iloc[:, 1])

    # Evaluate their performance on the test set
    y_pred_rf_randomized = rf_model.predict(X_test.iloc[:, :20])  # Assuming the first target variable is Prediction1
    y_pred_xgb_randomized = xgb_model.predict(X_test)
    mse_rf_randomized = mean_squared_error(y_test.iloc[:, 0], y_pred_rf_randomized)
    mse_xgb_randomized = mean_squared_error(y_test.iloc[:, 1], y_pred_xgb_randomized)
    print("Mean Squared Error after Training with Randomized Placeholders (RF):", mse_rf_randomized)
    print("Mean Squared Error after Training with Randomized Placeholders (XGB):", mse_xgb_randomized)
except Exception as e:
    print(f"Error during training with randomized placeholders and evaluation: {e}")


In [None]:
try:
    # Step 3: Fine-Tuning with Actual Target Variables
    # Fine-tune the models using the actual values
    rf_model.fit(X_train, y_train.iloc[:, 0])
    xgb_model.fit(X_train, y_train.iloc[:, 1])

    # Evaluate their final performance on the test set
    y_pred_rf_final = rf_model.predict(X_test.iloc[:, :20])  # Assuming the first target variable is Prediction1
    y_pred_xgb_final = xgb_model.predict(X_test)
    mse_rf_final = mean_squared_error(y_test.iloc[:, 0], y_pred_rf_final)
    mse_xgb_final = mean_squared_error(y_test.iloc[:, 1], y_pred_xgb_final)
    print("Mean Squared Error after Fine-Tuning with Actual Target Variables (RF):", mse_rf_final)
    print("Mean Squared Error after Fine-Tuning with Actual Target Variables (XGB):", mse_xgb_final)
except Exception as e:
    print(f"Error during fine-tuning and final evaluation: {e}")


In [None]:
print(y_train)

In [None]:
# Assuming 'Prediction1', 'Prediction2', 'Prediction3', 'Prediction4' are your target variable names
# and 'df' is your DataFrame

# Step 1: Check data types of keys
row_keys = ['Prediction1', 'Prediction2', 'Prediction3', 'Prediction4']
row_keys_data_types = [type(key) for key in row_keys]
print("Data types of keys:", row_keys_data_types)

# Step 2: Check data type of the index
index_data_type = type(df.index[0])
print("Data type of the index:", index_data_type)


In [None]:
# Define a function for model evaluations
def evaluate_model(model, X, y, cv=5):
    # Ensure the model is fitted
    model.fit(X, y)

    # Initialize evaluation metrics
    mse_5fold = []
    mse_10fold = []
    r2_scores = []

    # Loop through each target variable
    for i in range(y.shape[1]):
        # 5-fold cross-validation
        scores_5fold = cross_val_score(model, X, y.iloc[:, i], cv=cv, scoring='neg_mean_squared_error')
        mse_5fold.append(-scores_5fold.mean())

        # 10-fold cross-validation
        scores_10fold = cross_val_score(model, X, y.iloc[:, i], cv=10, scoring='neg_mean_squared_error')
        mse_10fold.append(-scores_10fold.mean())

        # Ensure the model is fitted
        model.fit(X, y.iloc[:, i])

        # R-squared score
        y_pred = model.predict(X)
        r2 = r2_score(y.iloc[:, i], y_pred)
        r2_scores.append(r2)

    return {
        '5-Fold Cross-Validation MSE': mse_5fold,
        '10-Fold Cross-Validation MSE': mse_10fold,
        'R-squared Score': r2_scores
    }


In [None]:
# Fit the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate the ensemble model
ensemble_evaluation = evaluate_model(ensemble_model, X_train, y_train)

# Display the evaluation metrics
for metric, values in ensemble_evaluation.items():
    print(f"{metric}: {values}")


In [None]:
# Evaluate the ensemble model
evaluation_results = evaluate_model(ensemble_model, X_train, y_train)

# Access the results from the evaluation
mse_5fold = evaluation_results['5-Fold Cross-Validation MSE']
mse_10fold = evaluation_results['10-Fold Cross-Validation MSE']
r2_scores = evaluation_results['R-squared Score']

# Display the results
for i in range(len(mse_5fold)):
    print(f"Target Variable {i + 1}:")
    print(f"5-Fold Cross-Validation MSE: {mse_5fold[i]}")
    print(f"10-Fold Cross-Validation MSE: {mse_10fold[i]}")
    print(f"R-squared Score: {r2_scores[i]}")
    print()


In [None]:
# Check the shape of the predictions array
print("Shape of predictions array:", ensemble_model.predict(X_train).shape)


In [None]:
# Task 1.1: Analyze Predictions and Visualize Results
import matplotlib.pyplot as plt
import numpy as np

def analyze_predictions(model, X, y):
    predictions = model.predict(X)

    # Check if predictions are 1-dimensional
    if len(predictions.shape) == 1:
        predictions = predictions.reshape(-1, 1)

    print("Shapes - y:", y.shape, "predictions:", predictions.shape)  # Add this line for debugging

    # Create subplots for each target variable
    n_targets = y.shape[1]
    n_subplots = min(n_targets * 2, 8)  # Limit to 8 subplots for better visualization
    fig, axes = plt.subplots(nrows=n_subplots // 2, ncols=2, figsize=(15, 5 * (n_subplots // 2)))

    # Flatten the axes array to handle the case of one target variable
    axes = np.array(axes).flatten()

    # Loop through each target variable
    for i in range(n_targets):
        # Check if there are available subplots
        if i * 2 < n_subplots:
            # Plot predicted vs. actual values
            axes[i * 2].scatter(y.iloc[:, i], predictions[:, i], alpha=0.5)
            axes[i * 2].set_title(f'Target Variable {i + 1}: Predicted vs. Actual')
            axes[i * 2].set_xlabel('Actual Values')
            axes[i * 2].set_ylabel('Predicted Values')

        # Check if there are available subplots for residuals
        if i * 2 + 1 < n_subplots:
            # Plot residuals
            residuals = y.iloc[:, i] - predictions[:, i]
            axes[i * 2 + 1].scatter(predictions[:, i], residuals, alpha=0.5)
            axes[i * 2 + 1].set_title(f'Target Variable {i + 1}: Residuals Plot')
            axes[i * 2 + 1].set_xlabel('Predicted Values')
            axes[i * 2 + 1].set_ylabel('Residuals')
            axes[i * 2 + 1].axhline(y=0, color='red', linestyle='--')  # Add horizontal line at y=0

    plt.tight_layout()
    plt.show()

# Call the function with your ensemble model and training data
analyze_predictions(ensemble_model, X_train, y_train)
