<a href="https://colab.research.google.com/github/alvinfranklyndavis/Project2023_v3/blob/main/GPT_4_Bard_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1.1: Package Installation

# Upgrade pip and install required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas gdown numpy matplotlib scikit-learn xgboost shap
!pip install -U scikit-learn
!pip install -U imbalanced-learn
!pip install black  # Install Black for code formatting


[0m

In [2]:
# CELL 1.2: Import Libraries and Set Up Logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import logging
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer
import shap

# Notebook-wide logger; INFO and above is emitted through the root handler
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


In [22]:
# Cell 1.3: Load, Split, and Preprocess Data from Google Drive

import pandas as pd
import logging
import os
from google.colab import drive

# Reuse the notebook logger
logger = logging.getLogger(__name__)

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Folder on Google Drive that holds the datasets
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'

# File names: the combined training/testing CSV and the unseen-data CSV
csv_filename_train_test = '1_Model_Train_Test_Data.csv'
csv_filename_unseen = '2_Model_Unseen_Data.csv'

# Full paths to both files inside the dataset folder
drive_csv_path_train_test, drive_csv_path_unseen = (
    os.path.join(drive_dataset_directory, csv_name)
    for csv_name in (csv_filename_train_test, csv_filename_unseen)
)

# Function to preprocess data
def preprocess_data(data, is_training=True):
    """Derive date parts, previous-row features and the 'Prediction1' column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Date', 'Morning', 'Afternoon' and 'Evening'. 'Night' is
        additionally read when ``is_training`` is True.
    is_training : bool, default True
        When True, 2-row moving averages ('Mov_Avg_Mor', 'Mov_Avg_Aft',
        'Mov_Avg_Eve', 'Mov_Avg_Nig') are (re)computed, the first row and the
        row labelled 518 (if present) are removed, and the index is reset.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``data`` is left unmodified.

    Notes
    -----
    NOTE(review): 'Prediction1' is a copy of 'Morning', and the 'Mov_Avg_Mor'
    window includes the same row's 'Morning' value, so that feature carries
    part of the target - confirm this is intended.
    """
    # Resolve the logger here so the function does not depend on which cell
    # last assigned the notebook-level `logger` name.
    log = logging.getLogger(__name__)
    log.info("Starting preprocessing for dataset")

    # Work on a copy: the input frame stays reusable and re-running is idempotent
    data = data.copy()

    # Convert 'Date' to datetime and extract 'Year', 'Month', and 'Day'
    dates = pd.to_datetime(data['Date'])
    data['Year'] = dates.dt.year
    data['Month'] = dates.dt.month
    data['Day'] = dates.dt.day
    log.info("Extracted year, month, and day from 'Date' column")

    # Drop the 'Date' column
    data = data.drop(columns=['Date'])
    log.info("'Date' column dropped after extraction of year, month, and day")

    # Apply the moving averages calculation only for training/testing data
    if is_training:
        window_size = 2  # Adjust as needed
        columns_to_average = ['Morning', 'Afternoon', 'Evening', 'Night']
        target_columns = ['Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve', 'Mov_Avg_Nig']

        # Start from 0 so every target column exists even if the calculation
        # below fails part-way through
        for col in target_columns:
            data[col] = 0

        try:
            for col, target_col in zip(columns_to_average, target_columns):
                data[target_col] = data[col].rolling(window=window_size, min_periods=1).mean()
            log.info(f"Calculated moving averages for specified columns with window size: {window_size}")
        except Exception as e:
            # Best-effort by design: log and continue with what was computed
            log.error("Error in moving average calculation: %s", e)

    # Previous-row values of each draw (first row has no predecessor -> NaN)
    lag_sources = {
        'Prev_Morning': 'Morning',
        'Prev_Afternoon': 'Afternoon',
        'Prev_Evening': 'Evening',
    }
    for lag_col, source_col in lag_sources.items():
        data[lag_col] = data[source_col].shift(1)
    log.info("Created previous day columns")

    # 'Prediction1' starts out as the values of the 'Morning' column
    data['Prediction1'] = data['Morning']

    # Fill the NaNs created by shift(1). Assign the result back instead of
    # calling fillna(inplace=True) on a column selection: that chained form is
    # deprecated and does not update the frame under pandas Copy-on-Write.
    lag_columns = list(lag_sources)
    data[lag_columns] = data[lag_columns].fillna(18)  # Adjust default values as needed

    # Apply specific operations for training/testing data
    if is_training:
        # Drop the first row
        data = data.iloc[1:]

        # Conditionally drop the row labelled 518 (reason not documented in this notebook)
        if 518 in data.index:
            data = data.drop(index=518)

        # Reset index after row exclusions
        data = data.reset_index(drop=True)
        log.info("Specific rows dropped and index reset for training/testing data")

    return data

# Check and load the datasets
def load_dataset(file_path):
    """Read a CSV into a DataFrame, or return None when the path is not an existing file.

    A short status message is printed in both cases.
    """
    if not os.path.isfile(file_path):
        print("File not found. Check the file path or the Google Drive mount.")
        return None

    print("File found. Proceeding to load the dataset.")
    return pd.read_csv(file_path)

# Load both raw datasets, announcing each read before it happens
logger.info("Reading the training/testing dataset from Google Drive")
train_test_data = load_dataset(drive_csv_path_train_test)
logger.info("Reading the unseen dataset from Google Drive")
unseen_data = load_dataset(drive_csv_path_unseen)

# Apply preprocessing to whichever datasets were found (load_dataset returns
# None for a missing file, which is passed through unchanged)
if train_test_data is not None:
    train_test_data = preprocess_data(train_test_data, is_training=True)
if unseen_data is not None:
    unseen_data = preprocess_data(unseen_data, is_training=False)
logger.info("Preprocessing applied to both training/testing and unseen datasets")

# Define paths to save the preprocessed data
preprocessed_train_test_path = os.path.join(drive_dataset_directory, '7_preprocessed_train_test_data.csv')
preprocessed_unseen_path = os.path.join(drive_dataset_directory, '8_preprocessed_unseen_data.csv')

# Saving preprocessed data
if train_test_data is not None:
    train_test_data.to_csv(preprocessed_train_test_path, index=False)
    logger.info("Preprocessed training/testing data saved to Google Drive.")

if unseen_data is not None:
    unseen_data.to_csv(preprocessed_unseen_path, index=False)
    logger.info("Preprocessed unseen data saved to Google Drive.")

# [Continue with further data processing...]


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.


In [23]:
# CELL 2.1: Saving Preprocessed Data

# Define the paths to save the preprocessed data
preprocessed_train_test_path = os.path.join(drive_dataset_directory, '3_train_test_features.csv')
preprocessed_train_test_target_path = os.path.join(drive_dataset_directory, '4_train_test_target.csv')
preprocessed_unseen_features_path = os.path.join(drive_dataset_directory, '5_unseen_features.csv')
preprocessed_unseen_target_path = os.path.join(drive_dataset_directory, '6_unseen_target.csv')

# Split the target off BEFORE writing the feature file, so that
# '3_train_test_features.csv' never contains the 'Prediction1' target column.
# Assuming 'Prediction1' is the target for training/testing data
train_test_target = train_test_data[['Prediction1']]
train_test_data = train_test_data.drop(columns=['Prediction1'])

# Save preprocessed training/testing features and target
train_test_data.to_csv(preprocessed_train_test_path, index=False)
logger.info("Preprocessed training/testing features saved to Google Drive.")
train_test_target.to_csv(preprocessed_train_test_target_path, index=False)
logger.info("Preprocessed training/testing target saved to Google Drive.")

# Save preprocessed unseen data, split into features and target the same way
unseen_target = unseen_data[['Prediction1']]
unseen_data = unseen_data.drop(columns=['Prediction1'])
unseen_data.to_csv(preprocessed_unseen_features_path, index=False)
unseen_target.to_csv(preprocessed_unseen_target_path, index=False)
logger.info("Preprocessed unseen features and target saved to Google Drive.")

# Print the first few rows of the preprocessed training/testing data for visual confirmation
print("First few rows of the preprocessed training/testing features:")
print(train_test_data.head())
print("\nFirst few rows of the preprocessed training/testing target:")
print(train_test_target.head())

# Print the first few rows of the preprocessed unseen data for visual confirmation
print("\nFirst few rows of the preprocessed unseen features:")
print(unseen_data.head())
print("\nFirst few rows of the preprocessed unseen target:")
print(unseen_target.head())

# Optionally, print the shape and column names for further confirmation
print("\nShape of the training/testing features DataFrame:", train_test_data.shape)
print("Column names:", train_test_data.columns)
print("Shape of the training/testing target DataFrame:", train_test_target.shape)
print("Column names:", train_test_target.columns)
print("Shape of the unseen features DataFrame:", unseen_data.shape)
print("Column names:", unseen_data.columns)
print("Shape of the unseen target DataFrame:", unseen_target.shape)
print("Column names:", unseen_target.columns)

# [Continue with further data processing...]


First few rows of the preprocessed training/testing features:
   Row Number Data_Type  Morning  Prev_Week  Rep_Prev_Week  Prev_Entry  \
0           2  Training       31         11              0           9   
1           3  Training       15         19              0          12   
2           4  Training       31         35              0          35   
3           5  Training       31         18              0          16   
4           6  Training       21         13              0          18   

   Rep_Prev_Entry  Mov_Avg_Mor  Afternoon  Prev_Week.1  ...  Rep_Prev_Week.3  \
0               0         25.0          3           21  ...                0   
1               0         23.0          9           19  ...                0   
2               0         23.0         21           20  ...                0   
3               0         31.0         31           30  ...                1   
4               0         26.0         17           34  ...                0   

   Prev_Entr

In [24]:
# Cell 2.2: Feature Finalization and Saving for Model Input

def finalize_and_save_features(data, filename_suffix, file_number):
    """Write the model-input columns of ``data`` to a numbered CSV and preview them.

    The output file is named '<file_number>_<filename_suffix>_features.csv'
    and is placed in ``drive_dataset_directory``. Nothing is returned.
    """
    logger.info(f"Finalizing features for model input and saving processed data for {filename_suffix}")

    # The nine columns that make up the model input
    feature_names = [
        'Year', 'Month', 'Day',
        'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor',
        'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening',
    ]
    features = data[feature_names]

    # Persist the selection under the numbered naming convention
    file_name = f'{file_number}_{filename_suffix}_features.csv'
    destination = os.path.join(drive_dataset_directory, file_name)
    features.to_csv(destination, index=False)
    logger.info(f"Processed data features for {filename_suffix} saved to Google Drive as {file_name}.")

    # Preview for verification
    print(f"First few rows of {filename_suffix} features:")
    print(features.head())

# Finalize features for the training/testing data first, then for the unseen data
for feature_frame, feature_suffix, feature_file_number in (
    (train_test_data, 'train_test', '9'),
    (unseen_data, 'unseen', '10'),
):
    finalize_and_save_features(feature_frame, feature_suffix, feature_file_number)


First few rows of train_test features:
   Year  Month  Day  Prev_Week  Prev_Entry  Mov_Avg_Mor  Prev_Morning  \
0  2018      8    2         11           9         25.0          19.0   
1  2018      8    3         19          12         23.0          31.0   
2  2018      8    4         35          35         23.0          15.0   
3  2018      8    6         18          16         31.0          31.0   
4  2018      8    7         13          18         26.0          31.0   

   Prev_Afternoon  Prev_Evening  
0            14.0          33.0  
1             3.0          35.0  
2             9.0          23.0  
3            21.0          29.0  
4            31.0          15.0  
First few rows of unseen features:
   Year  Month  Day  Prev_Week  Prev_Entry  Mov_Avg_Mor  Prev_Morning  \
0  2023      8    1         27           5         26.0          18.0   
1  2023      8    2         33          18         22.5          18.0   
2  2023      8    3         27          28         15.0         

In [26]:
# CELL 3.1: Additional Data Insights for CSVs

import pandas as pd
import logging

# Reuse the notebook logger and announce this step
logger = logging.getLogger(__name__)
logger.info("Exploring additional data insights for CSVs...")

# Folder on Google Drive that holds the datasets (note the trailing slash)
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'

# Load the feature and target CSVs written by the earlier cells
train_test_features = pd.read_csv(f'{drive_dataset_directory}9_train_test_features.csv')
train_test_target = pd.read_csv(f'{drive_dataset_directory}4_train_test_target.csv')
unseen_features = pd.read_csv(f'{drive_dataset_directory}10_unseen_features.csv')
unseen_target = pd.read_csv(f'{drive_dataset_directory}6_unseen_target.csv')

# Function to print data insights
def print_data_insights(df, df_name):
    """Print missing-value counts and percentages, describe() and head() for ``df``.

    ``df_name`` is only used in the heading. Nothing is returned.
    """
    print(f"Data Insights for {df_name}:\n")

    missing_mask = df.isnull()

    # Each section is a heading followed by the lazily computed value,
    # so output appears in the same order the values are calculated.
    sections = (
        ("Missing values in each column:", missing_mask.sum),
        ("\nPercentage of missing values in each column:", lambda: missing_mask.mean() * 100),
        ("\nSummary statistics:", df.describe),
        ("\nFirst few rows of data:", df.head),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())

    print("\n-----------------------------------------\n")

# Report on the four frames in turn: features then target, train/test then unseen
for insight_frame, insight_label in (
    (train_test_features, "Train/Test Features"),
    (train_test_target, "Train/Test Target"),
    (unseen_features, "Unseen Features"),
    (unseen_target, "Unseen Target"),
):
    print_data_insights(insight_frame, insight_label)


Data Insights for Train/Test Features:

Missing values in each column:
Year              0
Month             0
Day               0
Prev_Week         0
Prev_Entry        0
Mov_Avg_Mor       0
Prev_Morning      0
Prev_Afternoon    0
Prev_Evening      0
dtype: int64

Percentage of missing values in each column:
Year              0.0
Month             0.0
Day               0.0
Prev_Week         0.0
Prev_Entry        0.0
Mov_Avg_Mor       0.0
Prev_Morning      0.0
Prev_Afternoon    0.0
Prev_Evening      0.0
dtype: float64

Summary statistics:
              Year        Month          Day    Prev_Week   Prev_Entry  \
count  1407.000000  1407.000000  1407.000000  1407.000000  1407.000000   
mean   2020.577825     6.629709    15.626866    18.212509    18.130064   
std       1.556254     3.565689     8.752854    10.568730    10.368159   
min    2018.000000     1.000000     1.000000     0.000000     1.000000   
25%    2019.000000     3.000000     8.000000     9.000000     9.000000   
50%    2021.

In [29]:
# CELL 3.2: Setting Bounds for Numerical Range and Preparing Data for Prediction

import numpy as np
import pandas as pd
import logging

# Set up logging
logger = logging.getLogger(__name__)

try:
    # Preprocessed training/testing data written by Cell 1.3
    data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/7_preprocessed_train_test_data.csv'
    data = pd.read_csv(data_path)
    logger.info("Dataset loaded successfully.")

    logger.info("Enforcing numerical bounds...")

    # Define all columns that should have values in the range of 1 to 36
    columns_to_check = [
        'Morning', 'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor', 'Afternoon',
        'Prev_Week.1', 'Prev_Entry.1', 'Mov_Avg_Aft', 'Evening', 'Prev_Week.2',
        'Prev_Entry.2', 'Mov_Avg_Eve', 'Night', 'Prev_Week.3', 'Prev_Entry.3',
        'Mov_Avg_Nig', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prediction1'
    ]

    # Clip every listed column that exists in the DataFrame into [1, 36]
    for col in columns_to_check:
        if col in data.columns:
            data[col] = data[col].clip(lower=1, upper=36)

    # Ensure changes are reflected
    print(data.describe())

    # Prepare the current data with NaNs in 'Prediction1' for testing
    selected_columns = ['Year', 'Month', 'Day', 'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening']
    current_data = data[selected_columns].copy()  # Use .copy() to create an independent copy
    current_data['Prediction1'] = np.nan  # Initialize 'Prediction1' with NaN

    # After initializing 'Prediction1' with NaN
    print("\nFirst few rows of current data with 'Prediction1' initialized as NaN:")
    print(current_data.head())

    # Display row 1406 (positional, 0-based). This is a purely informational
    # preview, so guard it: on a shorter dataset an IndexError here would jump
    # to the except block and silently skip saving the prepared file below.
    end_of_testing_position = 1406
    if len(data) > end_of_testing_position:
        print("Row 1406 (End of Testing Data):")
        print(data.iloc[end_of_testing_position])
    else:
        logger.warning(
            "Dataset has %d rows; skipping the preview of row %d.",
            len(data), end_of_testing_position,
        )

    # Save the prepared frame under the numbered naming convention
    prepared_current_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/11_prepared_current_data_for_prediction1.csv'
    current_data.to_csv(prepared_current_data_path, index=False)
    logger.info("Prepared current data saved to Google Drive.")

    # Confirming the file saving by reading and displaying the first few rows
    logger.info("Verifying saved prepared current data:")
    saved_current_data = pd.read_csv(prepared_current_data_path)
    print(saved_current_data.head())

except Exception as e:
    # Still best-effort (the cell does not raise), but keep the traceback so
    # the failing step can be identified from the log.
    logger.exception(f"An error occurred: {e}")


        Row Number     Morning    Prev_Week  Rep_Prev_Week   Prev_Entry  \
count  1407.000000  1407.00000  1407.000000    1407.000000  1407.000000   
mean    705.632552    18.77683    18.240938       0.027008    18.130064   
std     406.713037    10.27599    10.520904       0.162164    10.368159   
min       2.000000     1.00000     1.000000       0.000000     1.000000   
25%     353.500000    10.00000     9.000000       0.000000     9.000000   
50%     706.000000    19.00000    18.000000       0.000000    18.000000   
75%    1057.500000    28.00000    27.000000       0.000000    27.000000   
max    1409.000000    36.00000    36.000000       1.000000    36.000000   

       Rep_Prev_Entry  Mov_Avg_Mor    Afternoon  Prev_Week.1  Rep_Prev_Week.1  \
count     1407.000000  1407.000000  1407.000000  1407.000000      1407.000000   
mean         0.014925    18.773632    18.613362    18.156361         0.022033   
std          0.121297     7.076781    10.361999    10.601390         0.146842   


In [None]:
# CELL 4.1: Data Preparation for Prediction

import pandas as pd
import numpy as np
import logging

logger = logging.getLogger(__name__)

# Define the preprocessing function for feature datasets
def preprocess_features(data):
    """Return the nine model-input columns of ``data``.

    If a 'Date' column is present it is parsed, expanded into 'Year', 'Month'
    and 'Day' columns on ``data`` itself, and then removed from ``data``.
    """
    logger.info("Starting preprocessing for feature dataset")

    # Expand 'Date' into its parts when the raw column is still present
    if 'Date' in data.columns:
        parsed_dates = pd.to_datetime(data['Date'])
        for part in ('year', 'month', 'day'):
            data[part.capitalize()] = getattr(parsed_dates.dt, part)
        data.drop(columns='Date', inplace=True)  # no longer needed once expanded

    # Keep only the columns the model consumes
    model_columns = [
        'Year', 'Month', 'Day',
        'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor',
        'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening',
    ]
    return data[model_columns]

# Paths to CSV files, all inside the same Google Drive folder
_model_docs_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'
paths = {
    dataset_key: _model_docs_dir + csv_name
    for dataset_key, csv_name in (
        ("train_test_features", '3_train_test_features.csv'),
        ("train_test_target", '4_train_test_target.csv'),
        ("unseen_features", '5_unseen_features.csv'),
        ("unseen_target", '6_unseen_target.csv'),
    )
}

# Feature files go through preprocess_features
X_train_test = preprocess_features(pd.read_csv(paths["train_test_features"]))
X_unseen = preprocess_features(pd.read_csv(paths["unseen_features"]))

# Target files are used exactly as stored
y_train_test = pd.read_csv(paths["train_test_target"])
y_unseen = pd.read_csv(paths["unseen_target"])

# Quick check of data structures
logger.info("Train/Test Features Shape: %s, Unseen Features Shape: %s", X_train_test.shape, X_unseen.shape)
logger.info("Train/Test Target Shape: %s, Unseen Target Shape: %s", y_train_test.shape, y_unseen.shape)

# Further processing (if necessary)
# Example: Feature scaling, handling categorical variables, etc.

# Prepare for unseen data processing (to be done at a later stage)


In [None]:
# CELL 4.1: Data Preparation for Prediction
# ... [previous setup and preprocessing code] ...

# Feature files go through preprocess_features
X_train_test = preprocess_features(pd.read_csv(paths["train_test_features"]))
X_unseen = preprocess_features(pd.read_csv(paths["unseen_features"]))

# Target files are used exactly as stored
y_train_test = pd.read_csv(paths["train_test_target"])
y_unseen = pd.read_csv(paths["unseen_target"])

# Quick integrity check: shape and head of each of the four frames
for check_title, check_kind, check_frame in (
    ("Train/Test Features", "Features", X_train_test),
    ("Train/Test Target", "Target", y_train_test),
    ("Unseen Features", "Features", X_unseen),
    ("Unseen Target", "Target", y_unseen),
):
    print(f"\nIntegrity Check for {check_title}")
    print(f"Shape of {check_kind}:", check_frame.shape)
    print(f"First few rows of {check_kind}:")
    print(check_frame.head())


In [None]:
# CELL 4.2: Model Training, Prediction, and Unseen Data Loading

import pandas as pd
import numpy as np
import logging
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

logger = logging.getLogger(__name__)

def train_and_evaluate_model(X_train, y_train, X_val, y_val):
    """Fit a RandomForestRegressor (random_state=42) and log validation MSE and R2.

    ``y_train`` is flattened with ``.values.ravel()`` before fitting.
    Returns the fitted model.
    """
    forest = RandomForestRegressor(random_state=42).fit(X_train, y_train.values.ravel())
    logger.info("Random Forest model trained.")

    # Score the fitted model on the validation split
    val_predictions = forest.predict(X_val)
    val_mse = mean_squared_error(y_val, val_predictions)
    val_r2 = r2_score(y_val, val_predictions)
    logger.info(f"Validation MSE: {val_mse}, R2 Score: {val_r2}")
    return forest

# Folder on Google Drive holding every input and output of this cell
model_docs_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'

# Paths to data
data_paths = {
    "X_train": model_docs_dir + '3_train_test_features.csv',
    "y_train": model_docs_dir + '4_train_test_target.csv',
    # Validation rows come from the same files; they are carved out below as a
    # chronological hold-out instead of re-using the training rows.
    "X_val": model_docs_dir + '3_train_test_features.csv',
    "y_val": model_docs_dir + '4_train_test_target.csv',
    # Written by Cell 3.2 as '11_prepared_current_data_for_prediction1.csv'
    "current_data": model_docs_dir + '11_prepared_current_data_for_prediction1.csv',
    "original_unseen_data": model_docs_dir + '5_unseen_features.csv'  # Adjust to original unseen data file
}

# The nine model inputs. Defined here so this cell does not rely on a variable
# left behind by another cell, and so the extra columns stored in
# '3_train_test_features.csv' (e.g. the text column 'Data_Type') never reach the model.
selected_columns = ['Year', 'Month', 'Day', 'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening']

# Load data
features_all = pd.read_csv(data_paths["X_train"])[selected_columns]
target_all = pd.read_csv(data_paths["y_train"])
current_data = pd.read_csv(data_paths["current_data"])
original_unseen_data = pd.read_csv(data_paths["original_unseen_data"])

# Chronological hold-out: the rows are in date order, so the last 20% act as
# validation data. Scoring on the rows used for fitting would report
# over-optimistic MSE / R2.
VALIDATION_FRACTION = 0.2
split_position = int(len(features_all) * (1 - VALIDATION_FRACTION))
X_train, X_val = features_all.iloc[:split_position], features_all.iloc[split_position:]
y_train, y_val = target_all.iloc[:split_position], target_all.iloc[split_position:]

# Train and save model
model = train_and_evaluate_model(X_train, y_train, X_val, y_val)
joblib.dump(model, model_docs_dir + 'random_forest_model.pkl')
logger.info("Model saved to Google Drive.")

# Predict and save current data (predictions rounded to whole numbers)
current_data['Prediction1'] = np.round(model.predict(current_data[selected_columns]))
current_data.to_csv(model_docs_dir + 'current_data_with_prediction1.csv', index=False)
logger.info("Current data predictions saved.")

# Predict unseen data
original_unseen_data['Prediction1'] = model.predict(original_unseen_data[selected_columns])

# Save unseen data with predictions
original_unseen_data.to_csv(model_docs_dir + 'Model_Unseen_Data_with_Predictions.csv', index=False)
logger.info("Unseen data predictions saved.")


In [None]:
import pandas as pd

# Both files are written by Cell 4.2 into the 'Morning_Draw_Model_Docs' folder,
# so they must be read back from that folder (not from its parent).
predictions_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'

# Load and display the first few rows of 'current_data_with_prediction1.csv'
current_data_with_predictions_path = predictions_dir + 'current_data_with_prediction1.csv'
current_data_with_predictions = pd.read_csv(current_data_with_predictions_path)

print("First few rows of current data with predictions ('current_data_with_prediction1.csv'):")
print(current_data_with_predictions.head())

# Load and display the first few rows of 'Model_Unseen_Data_with_Predictions.csv'
unseen_data_with_predictions_path = predictions_dir + 'Model_Unseen_Data_with_Predictions.csv'
unseen_data_with_predictions = pd.read_csv(unseen_data_with_predictions_path)

print("\nFirst few rows of unseen data with predictions ('Model_Unseen_Data_with_Predictions.csv'):")
print(unseen_data_with_predictions.head())


In [None]:
# Cell 3.3: Filter Unseen Data to Match Previous Dataset Structure

# Define the columns to keep
columns_to_keep = ['Year', 'Month', 'Day', 'Prev_Week', 'Prev_Entry',
                   'Mov_Avg_Mor', 'Prev_Morning', 'Prev_Afternoon',
                   'Prev_Evening', 'Prediction1']

# Cell 2.1 moves 'Prediction1' out of unseen_data into unseen_target, so the
# column has to be re-attached before it can be selected. Values are assigned
# by position; a row-count mismatch raises instead of misaligning silently.
unseen_with_target = unseen_data.copy()
if 'Prediction1' not in unseen_with_target.columns:
    unseen_with_target['Prediction1'] = unseen_target['Prediction1'].to_numpy()

# Drop irrelevant columns
unseen_data_filtered = unseen_with_target[columns_to_keep]

# Print the first few rows of the filtered unseen data
print("First few rows of filtered unseen data:")
print(unseen_data_filtered.head())


In [None]:
# Cell 3.3: Enhanced NaN Check and Handling in 'Prediction1'

import pandas as pd
import logging

logger = logging.getLogger(__name__)

logger.info("Loading current data with predictions for enhanced NaN handling...")

# Load the current data with predictions from the folder Cell 4.2 writes to
current_data_with_predictions = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/current_data_with_prediction1.csv')

# Forward fill NaNs within the first row (left to right across its columns).
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
current_data_with_predictions.iloc[0] = current_data_with_predictions.iloc[0].ffill()

# Print the first few rows to verify NaN handling
print("First few rows of current data after NaN handling:\n", current_data_with_predictions.head())

# Check for NaN values in 'Prediction1'
prediction1_is_nan = current_data_with_predictions['Prediction1'].isnull()
nan_count_prediction1 = prediction1_is_nan.sum()
logger.info(f"Number of NaN values in 'Prediction1': {nan_count_prediction1}")

if nan_count_prediction1 > 0:
    logger.warning("NaNs detected in 'Prediction1'. Here are the details:")
    nan_rows = current_data_with_predictions[prediction1_is_nan]
    print("Rows with NaN in 'Prediction1':\n", nan_rows)
else:
    print("No NaN values found in 'Prediction1'.")

# Additional analysis or handling of NaNs can be added here if needed


In [None]:
# Cell 4.1. Model Interpretation

import shap
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

logger = logging.getLogger(__name__)

# Load the model written by Cell 4.2 ('random_forest_model.pkl' in the
# 'Morning_Draw_Model_Docs' folder).
# NOTE: joblib.load unpickles the file - only load model files you created yourself.
final_model = joblib.load('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/random_forest_model.pkl')

logger.info("Interpreting the model with SHAP values...")

# X_val and y_val come from Cell 4.2. Restrict X_val to exactly the columns the
# model was fitted on (in the fitted order) so prediction and SHAP do not fail
# on extra or re-ordered columns.
model_feature_names = list(getattr(final_model, 'feature_names_in_', X_val.columns))
X_val_model = X_val[model_feature_names]

# Using SHAP to interpret the model
explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(X_val_model)
shap.summary_plot(shap_values, X_val_model, plot_type="bar")

# STEP 4.2. FINAL MODEL SELECTION AND REPORTING

logger.info("Evaluating final model performance with regression metrics...")

# Flatten y_val to ensure it's 1-dimensional
y_val_flat = y_val.values.ravel() if isinstance(y_val, pd.DataFrame) else y_val

# Generate predictions for the validation set, rounded to whole numbers
y_pred_val = final_model.predict(X_val_model)
y_pred_val_rounded = np.round(y_pred_val)

# predict() returns a NumPy array; ravel keeps it 1-dimensional
y_pred_val_flat = np.ravel(y_pred_val_rounded)

# Create the DataFrame
predictions_df = pd.DataFrame({'Actual': y_val_flat, 'Predicted': y_pred_val_flat})

# Save the DataFrame as CSV
predictions_df.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/predictions_df.csv', index=False)

# Calculate Mean Squared Error (MSE) and R2 Score for the validation set
mse_val = mean_squared_error(y_val_flat, y_pred_val_flat)
r2_val = r2_score(y_val_flat, y_pred_val_flat)

# Logged once (the same numbers were previously logged twice)
logger.info(f"Regression Metrics:\nMSE: {mse_val}\nR2 Score: {r2_val}")

# STEP 4.3. PREPARATION FOR DEPLOYMENT

# ...[Include steps for preparing the model for deployment]...

# STEP 4.4. DOCUMENTATION AND REPORTING

# ...[Prepare a comprehensive report on the model's performance, limitations, and deployment steps]...

logger.info("Model documentation and reporting completed.")

# Final Checks and Tests (if applicable)
# ...[Include any final testing or checks before deployment]...

logger.info("Final checks and tests completed.")
logger.info("Cell 4 tasks completed successfully.")


In [None]:
# Cell 5. Cross-Validation and  additional metrics analysis

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import joblib
import shap
import logging

logger = logging.getLogger(__name__)

# Features and target used throughout this cell.
# NOTE(review): X and y were not defined anywhere at notebook level; they are
# taken here from the frames loaded in Cell 4.1 (nine feature columns and the
# single target column) - confirm this is the intended data.
X = X_train_test
y = y_train_test.iloc[:, 0]

# Assuming you are using RandomForestRegressor as your model
model = RandomForestRegressor(random_state=42)

# Define your scoring metrics for regression.
# With greater_is_better=False the 'MSE' scorer returns the NEGATED error.
scoring_metrics = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score)
}
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

# Assuming the rows are sorted by date: each fold trains on earlier rows and
# tests on the rows that follow them
tscv = TimeSeriesSplit(n_splits=5)

# Collect the fold scores as they are produced instead of pasting numbers from
# an earlier run; fold-local names keep X_train / y_train from other cells intact.
mse_scores = []
for train_index, test_index in tscv.split(X):
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_fold_train, y_fold_train)  # Train your model
    predictions = model.predict(X_fold_test)  # Make predictions

    mse = mean_squared_error(y_fold_test, predictions)  # Calculate MSE
    mse_scores.append(mse)
    print(f"MSE for the current fold: {mse}")

# Calculate the average MSE
average_mse = sum(mse_scores) / len(mse_scores)
print(f"Average MSE across all folds: {average_mse}")

# Perform 10-fold cross-validation
k_folds = 10  # Number of folds
cv_results = {}
for metric_name, scorer in scoring_metrics.items():
    scores = cross_val_score(model, X, y.fillna(y.mean()), scoring=scorer, cv=k_folds)
    cv_results[metric_name] = scores
    logger.info(f"{metric_name} scores for each fold: {scores}")
    logger.info(f"Average {metric_name} over {k_folds} folds: {np.mean(scores)}")

# Additional metrics analysis and error/bias exploration
# ... Add your code for detailed analysis of errors, biases, etc. ...
logger.info("Cross-validation and additional metrics analysis completed.")

# Feature Importance Analysis using SHAP

# Load the model written by Cell 4.2 ('random_forest_model.pkl' in the
# 'Morning_Draw_Model_Docs' folder).
# NOTE: joblib.load unpickles the file - only load model files you created yourself.
final_model = joblib.load('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/random_forest_model.pkl')

# Explain on exactly the columns the model was fitted on, in the fitted order
shap_features = X[list(getattr(final_model, 'feature_names_in_', X.columns))]

# Explain the model's predictions using SHAP
explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(shap_features)

# Plot summary plot using SHAP values
shap.summary_plot(shap_values, shap_features)

logger.info("Feature importance analysis using SHAP completed.")


In [None]:
# Cell 6. Detailed error and bias analysis

import logging
import os  # needed for os.path.exists below; previously relied on an earlier cell's import

import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive

# Set up logging
logger = logging.getLogger(__name__)

# Ensure Google Drive is mounted (force_remount refreshes an existing mount)
drive.mount('/content/drive', force_remount=True)

# Define the path in Google Drive where the predictions DataFrame is saved
predictions_df_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/predictions_df.csv'

# Load the predictions DataFrame; stop with a clear error if it is missing.
# Assuming predictions were made in a previous step and saved as 'predictions_df.csv'
if not os.path.exists(predictions_df_path):
    raise FileNotFoundError("predictions_df.csv not found. Ensure it's created in previous steps.")

predictions_df = pd.read_csv(predictions_df_path)

logger.info("Predictions DataFrame loaded successfully for error and bias analysis.")
print(predictions_df.columns)

def _plot_mean_abs_error_by(df, group_col, title, xlabel):
    """Plot and return the mean 'Absolute_Error' of `df` per value of `group_col`.

    Args:
        df: DataFrame containing `group_col` and an 'Absolute_Error' column.
        group_col: Name of the column to group by.
        title: Title of the bar chart.
        xlabel: Label of the x axis.

    Returns:
        Series of mean absolute error indexed by the values of `group_col`.
    """
    # Select the column BEFORE aggregating: averaging the whole frame first
    # raises TypeError on pandas >= 2.0 when any non-numeric column is present.
    performance = df.groupby(group_col)['Absolute_Error'].mean()
    performance.plot(kind='bar', figsize=(10, 6))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Average Absolute Error')
    plt.show()
    return performance


# Error Analysis
# Fail early with a clear message if the columns needed for the error metrics are absent.
missing_columns = [col for col in ('Predicted', 'Actual') if col not in predictions_df.columns]
if missing_columns:
    raise KeyError(f"predictions_df is missing required column(s): {missing_columns}")

# Signed error (positive = over-prediction) and its magnitude
predictions_df['Error'] = predictions_df['Predicted'] - predictions_df['Actual']
predictions_df['Absolute_Error'] = predictions_df['Error'].abs()

# Plotting error distribution
plt.hist(predictions_df['Error'], bins=30)
plt.title('Error Distribution')
plt.xlabel('Prediction Error')
plt.ylabel('Frequency')
plt.show()

# Subgroup Analysis
# Only runs when both 'Prev_Week' and 'Prev_Entry' are columns of predictions_df
if 'Prev_Week' in predictions_df and 'Prev_Entry' in predictions_df:
    # Subgroup analysis based on 'Prev_Week'
    prev_week_performance = _plot_mean_abs_error_by(
        predictions_df, 'Prev_Week', 'Performance by Previous Week', 'Previous Week'
    )

    # Subgroup analysis based on 'Prev_Entry'
    prev_entry_performance = _plot_mean_abs_error_by(
        predictions_df, 'Prev_Entry', 'Performance by Previous Entry', 'Previous Entry'
    )
else:
    # Make the skip visible instead of silently producing no subgroup plots.
    logger.warning(
        "Skipping subgroup analysis: 'Prev_Week' and/or 'Prev_Entry' not found in predictions_df."
    )

# Document findings
# The report is a plain (non-f) string, so the {...} placeholders are logged
# literally until they are replaced with real findings.
_report_lines = [
    "",
    "Detailed Error Analysis:",
    "- Error Distribution Insights: {'Describe your findings from the error distribution here'}",
    "- Largest Errors: {'Describe characteristics of instances with largest errors here'}",
    "",
    "Bias Exploration:",
    "- Performance by Previous Week: {'Describe performance variations based on the previous week here'}",
    "- Performance by Previous Entry: {'Describe performance variations based on the previous entry here'}",
    "",
]
# Leading and trailing empty entries reproduce the newline at each end of the report.
error_bias_report = "\n".join(_report_lines)

logger.info("Error and bias analysis completed.")
logger.info(error_bias_report)


In [None]:
# Cell 7. Final review, deployment preparation, and documentation

import joblib
import logging

logger = logging.getLogger(__name__)

# Final Model Review and Refinement
# ... Code/comments for any last adjustments to the model ...

# Deployment Preparation
# Write the model held in `final_model` to Google Drive with joblib.
# `final_model` must already exist in the kernel (it is not created in this cell).
model_output_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws'
model_file_name = 'random_forest_prediction_model.pkl'
final_model_path = model_output_dir + '/' + model_file_name

joblib.dump(final_model, final_model_path)
logger.info(f"Final model serialized and saved for deployment at: {final_model_path}")

# Comprehensive Documentation Update
# ... Update your comprehensive report with all final findings and methodologies ...
# Include details on model performance, SHAP interpretation, error analysis, etc.

# Final Checks and Tests
# ... Code/comments for final tests and checks to ensure model is ready for deployment ...

# Planning for Future Improvements
# Template for areas of further research and methodologies to explore in
# future iterations of the project. It is a plain (non-f) string, so the
# {...} placeholders are logged literally until they are filled in.
future_improvement_plan = (
    "\n"
    "Future Improvement Plans:\n"
    "- Areas for further research: {describe areas where additional data, feature engineering, or alternative modeling techniques could be explored}\n"
    "- Methodologies to explore: {describe potential methodologies, like deep learning or ensemble methods, for future iterations}\n"
)

logger.info("Final review and deployment preparation completed.")
logger.info(future_improvement_plan)


# New Section

# New Section