<a href="https://colab.research.google.com/github/alvinfranklyndavis/Draw1_Predictive_Model/blob/main/Initial_Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# CELL 1.1: Package Installation and Library Import

# Check for existing libraries
!pip show pandas numpy

# Install or upgrade required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas==<desired_version> numpy==<desired_version>

# Import required libraries
import pandas as pd
import numpy as np
import logging
import os

# Set up logging to save logs in a file
log_file = 'project.log'
logging.basicConfig(filename=log_file, level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up virtual environment (optional but recommended)
# You can create a virtual environment with: !python -m venv myenv
# And activate it with: source myenv/bin/activate (Linux/macOS) or myenv\Scripts\activate (Windows)


Name: pandas
Version: 1.5.3
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: The Pandas Development Team
Author-email: pandas-dev@python.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, python-dateutil, pytz
Required-by: altair, arviz, bigframes, bokeh, bqplot, cmdstanpy, cufflinks, datascience, db-dtypes, dopamine-rl, fastai, geemap, geopandas, google-colab, gspread-dataframe, holoviews, ibis-framework, mizani, mlxtend, pandas-datareader, pandas-gbq, panel, plotnine, prophet, pymc, seaborn, sklearn-pandas, statsmodels, vega-datasets, xarray, yfinance
---
Name: numpy
Version: 1.25.2
Summary: Fundamental package for array computing in Python
Home-page: https://www.numpy.org
Author: Travis E. Oliphant et al.
Author-email: 
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: albumentations, altair, arviz, astropy, autograd, bl

In [17]:
# Cell 1.2: Data Loading from Google Drive Training / Testing  and Unseen datasets

import pandas as pd
import logging
import os
from google.colab import drive

# Logger named after this module
logger = logging.getLogger(__name__)

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Drive folder that holds every dataset used by this notebook
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Names of the two source CSV files ...
csv_filename_train_test, csv_filename_unseen = (
    'A_Initial_Train_Test_Data.csv',
    'B_Initial_Unseen_Data.csv',
)

# ... and their full paths inside the Drive folder
drive_csv_path_train_test, drive_csv_path_unseen = (
    os.path.join(drive_dataset_directory, csv_name)
    for csv_name in (csv_filename_train_test, csv_filename_unseen)
)

# Check and load the datasets
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    Prints a status message either way. Returns the loaded DataFrame when the file
    exists, and ``None`` when it does not.
    """
    if not os.path.isfile(file_path):
        print("File not found. Check the file path or the Google Drive mount.")
        return None

    print("File found. Proceeding to load the dataset.")
    return pd.read_csv(file_path)

# Load training/testing data
train_test_data = load_dataset(drive_csv_path_train_test)

# Load unseen data
unseen_data = load_dataset(drive_csv_path_unseen)

# load_dataset reports a missing file by returning None. Stop here with a clear
# error instead of failing below with an AttributeError on `.head()`.
if train_test_data is None or unseen_data is None:
    raise FileNotFoundError(
        "Could not load the initial datasets from "
        f"{drive_csv_path_train_test!r} and/or {drive_csv_path_unseen!r}."
    )

# Print the first few rows of both datasets for inspection
print("First few rows of training/testing data:")
print(train_test_data.head())

print("\nFirst few rows of unseen data:")
print(unseen_data.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of training/testing data:
       Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  01-08-18           1  Training   19.0            7.0        27.0   
1  02-08-18           2  Training   31.0           11.0         1.0   
2  03-08-18           3  Training   15.0           19.0        21.0   
3  04-08-18           4  Training   31.0           35.0        18.0   
4  05-08-18           5       NaN    NaN            NaN         NaN   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              

In [18]:
# Cell 1.3: Surveillance Check for NaNs within both datasets

# Report the number of missing values per column, first for the training/testing
# data and then for the unseen data
for nan_report_title, nan_report_frame in (
    ("NaN check for training/testing data:", train_test_data),
    ("\nNaN check for unseen data:", unseen_data),
):
    print(nan_report_title)
    print(nan_report_frame.isna().sum())


NaN check for training/testing data:
Date                  0
Row Number            0
Data_Type           239
Draw1               239
DR1_Prev_Week       239
DR1_2Weeks          239
DR1_Prev_Entry      239
DR1_Prev_Entry-2    239
DR1_Mov_Avg         239
DR1_Vert_Avg        239
Draw2               239
DR2_Prev_Week       239
DR2_2Weeks          239
DR2_Prev_Entry      239
DR2_Prev_Entry-2    239
DR2_Mov_Avg         239
DR2_Vert_Avg        239
Draw3               239
DR3_Prev_Week       239
DR3_2Weeks          239
DR3_Prev_Entry      239
DR3_Prev_Entry-2    239
DR3_Mov_Avg         239
DR3_Vert_Avg        239
Draw4               239
DR4_Prev_Week       239
DR4_2Weeks          239
DR4_Prev_Entry      239
DR4_Prev_Entry-2    239
DR4_Mov_Avg         239
DR4_Vert_Avg        239
dtype: int64

NaN check for unseen data:
Date                 0
Row Number           0
Data_Type           17
Draw1               17
DR1_Prev_Week       17
DR1_2Weeks          17
DR1_Prev_Entry      17
DR1_Prev_Entry-2 

In [19]:
# Cell 2.1: NaN handling and new CSV saving for Training / Testing  and Unseen datasets


# Replace every missing value with 0 in both datasets
train_test_data, unseen_data = train_test_data.fillna(0), unseen_data.fillna(0)

# File names for the NaN-handled outputs
new_csv_filename_train_test, new_csv_filename_unseen = (
    'C_NaN_Handled_Train_Test_Data.csv',
    'C_NaN_Handled_Unseen_Data.csv',
)

# Full output paths inside the Drive dataset folder
new_csv_path_train_test = os.path.join(drive_dataset_directory, new_csv_filename_train_test)
new_csv_path_unseen = os.path.join(drive_dataset_directory, new_csv_filename_unseen)

# Write each imputed dataset to its CSV (row index omitted)
for imputed_frame, imputed_path in (
    (train_test_data, new_csv_path_train_test),
    (unseen_data, new_csv_path_unseen),
):
    imputed_frame.to_csv(imputed_path, index=False)

# Confirm that both files were written
print("Preprocessing and saving of datasets is complete.")

# Re-run the per-column NaN counts on the imputed datasets
for imputed_title, imputed_frame in (
    ("\nNaN check for preprocessed training/testing data:", train_test_data),
    ("\nNaN check for preprocessed unseen data:", unseen_data),
):
    print(imputed_title)
    print(imputed_frame.isna().sum())


Preprocessing and saving of datasets is complete.

NaN check for preprocessed training/testing data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_Entry-2    0
DR1_Mov_Avg         0
DR1_Vert_Avg        0
Draw2               0
DR2_Prev_Week       0
DR2_2Weeks          0
DR2_Prev_Entry      0
DR2_Prev_Entry-2    0
DR2_Mov_Avg         0
DR2_Vert_Avg        0
Draw3               0
DR3_Prev_Week       0
DR3_2Weeks          0
DR3_Prev_Entry      0
DR3_Prev_Entry-2    0
DR3_Mov_Avg         0
DR3_Vert_Avg        0
Draw4               0
DR4_Prev_Week       0
DR4_2Weeks          0
DR4_Prev_Entry      0
DR4_Prev_Entry-2    0
DR4_Mov_Avg         0
DR4_Vert_Avg        0
dtype: int64

NaN check for preprocessed unseen data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_

In [20]:
# Cell 2.2: Extract Y/M/D from Date and new CSV saving for Training / Testing  and Unseen datasets

# Check and load the datasets
# (defined before the calls below so that this cell uses the definition it contains
# rather than whichever `load_dataset` an earlier cell left in the kernel)
def load_dataset(file_path):
    """Load the CSV at ``file_path`` into a DataFrame.

    Prints whether the file was found. Returns ``pd.read_csv(file_path)`` when the
    file exists and ``None`` when it does not.
    """
    if os.path.isfile(file_path):
        print("File found. Proceeding to load the dataset.")
        return pd.read_csv(file_path)
    else:
        print("File not found. Check the file path.")
        return None

# Load the NaN-handled training/testing data
nan_handled_train_test_data = load_dataset(new_csv_path_train_test)

# Load the NaN-handled unseen data
nan_handled_unseen_data = load_dataset(new_csv_path_unseen)

# Function to extract 'Year', 'Month', and 'Day' from the 'Date' column
def extract_date_features(data):
    """Parse ``data['Date']`` and add integer 'Year', 'Month' and 'Day' columns in place.

    The formats '%d-%m-%y' and '%d/%m/%Y' are tried in that order. The first format
    that parses the column replaces 'Date' with the parsed datetimes and produces the
    three new columns; a message is printed for every format that fails. When the
    'Date' column is absent, or no format matches, no columns are added.
    Returns None.
    """
    if 'Date' not in data.columns:
        print("'Date' column not found in the dataset.")
        return

    print("Converting 'Date' to datetime and extracting Year, Month, and Day...")
    for candidate_format in ('%d-%m-%y', '%d/%m/%Y'):
        try:
            data['Date'] = pd.to_datetime(data['Date'], format=candidate_format)
            for column_name, date_part in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
                # Missing date parts are stored as 0 so the column can be cast to int
                data[column_name] = getattr(data['Date'].dt, date_part).fillna(0).astype(int)
            print("After extracting Year, Month, and Day:", data.columns)
        except ValueError:
            print(f"Failed to convert 'Date' with format: {candidate_format}")
        else:
            break  # Stop at the first format that converts successfully

# Add 'Year', 'Month' and 'Day' columns in place: training/testing data first,
# then the unseen data
for frame_with_dates in (nan_handled_train_test_data, nan_handled_unseen_data):
    extract_date_features(frame_with_dates)

# File names for the date-extracted outputs
new_csv_filename_train_test_date, new_csv_filename_unseen_date = (
    'D_Date_Extracted_Train_Test_Data.csv',
    'D_Date_Extracted_Unseen_Data.csv',
)

# Full output paths inside the Drive dataset folder
new_csv_path_train_test_date = os.path.join(drive_dataset_directory, new_csv_filename_train_test_date)
new_csv_path_unseen_date = os.path.join(drive_dataset_directory, new_csv_filename_unseen_date)

# Write both datasets, now carrying the date features, to their CSV files
for dated_frame, dated_path in (
    (nan_handled_train_test_data, new_csv_path_train_test_date),
    (nan_handled_unseen_data, new_csv_path_unseen_date),
):
    dated_frame.to_csv(dated_path, index=False)

# Confirm that extraction and saving finished
print("Date extraction and saving of datasets is complete.")

# Per-column NaN counts after the date features were added
for dated_title, dated_frame in (
    ("\nNaN check for training/testing data with extracted date features:", nan_handled_train_test_data),
    ("\nNaN check for unseen data with extracted date features:", nan_handled_unseen_data),
):
    print(dated_title)
    print(dated_frame.isna().sum())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
Converting 'Date' to datetime and extracting Year, Month, and Day...
After extracting Year, Month, and Day: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Vert_Avg', 'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks',
       'DR2_Prev_Entry', 'DR2_Prev_Entry-2', 'DR2_Mov_Avg', 'DR2_Vert_Avg',
       'Draw3', 'DR3_Prev_Week', 'DR3_2Weeks', 'DR3_Prev_Entry',
       'DR3_Prev_Entry-2', 'DR3_Mov_Avg', 'DR3_Vert_Avg', 'Draw4',
       'DR4_Prev_Week', 'DR4_2Weeks', 'DR4_Prev_Entry', 'DR4_Prev_Entry-2',
       'DR4_Mov_Avg', 'DR4_Vert_Avg', 'Year', 'Month', 'Day'],
      dtype='object')
Converting 'Date' to datetime and extracting Year, Month, and Day...
Failed to convert 'Date' with format: %d-%m-%y
After extracting Year, Month, and Day: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       

In [21]:
# Cell 2.3: Create shifted columns for previous day's data

# Function to create shifted columns for previous day's data
def create_shifted_columns(data):
    """Add the previous row's four draws to ``data`` as integer 'Prev_*' columns, in place.

    'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening' and 'Prev_Night' take the values
    of 'Draw1'..'Draw4' from the row above. The first row has no predecessor and is
    filled with 0. Returns None.
    """
    lag_sources = {
        'Prev_Morning': 'Draw1',
        'Prev_Afternoon': 'Draw2',
        'Prev_Evening': 'Draw3',
        'Prev_Night': 'Draw4',
    }
    # Step 1: one-row lag of every draw column (leaves NaN in the first row)
    for lag_name, draw_name in lag_sources.items():
        data[lag_name] = data[draw_name].shift(1)

    # Step 2: fill the gaps with 0 and cast all four lag columns to int together
    lag_names = list(lag_sources)
    data[lag_names] = data[lag_names].fillna(0).astype(int)

# Input (D_*) and output (E_*) files for this step. The paths are built from the
# shared Drive directory instead of repeating the absolute path in every call, so
# the folder only has to be changed in one place.
date_extracted_train_test_path = os.path.join(drive_dataset_directory, 'D_Date_Extracted_Train_Test_Data.csv')
date_extracted_unseen_path = os.path.join(drive_dataset_directory, 'D_Date_Extracted_Unseen_Data.csv')
shifted_train_test_path = os.path.join(drive_dataset_directory, 'E_Shifted_Train_Test_Data.csv')
shifted_unseen_path = os.path.join(drive_dataset_directory, 'E_Shifted_Unseen_Data.csv')

# Load the date extracted training/testing data
date_extracted_train_test_data = load_dataset(date_extracted_train_test_path)

# Apply the function to create shifted columns
create_shifted_columns(date_extracted_train_test_data)

# Save the updated training/testing data with shifted columns
date_extracted_train_test_data.to_csv(shifted_train_test_path, index=False)

# Load the date extracted unseen data
date_extracted_unseen_data = load_dataset(date_extracted_unseen_path)

# Apply the function to create shifted columns
create_shifted_columns(date_extracted_unseen_data)

# Save the updated unseen data with shifted columns
date_extracted_unseen_data.to_csv(shifted_unseen_path, index=False)

# Print the first few rows of both datasets for inspection
print("First few rows of date extracted training/testing data:")
print(date_extracted_train_test_data.head())

print("\nFirst few rows of date extracted unseen data:")
print(date_extracted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of date extracted training/testing data:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training   19.0            7.0        27.0   
1  2018-08-02           2  Training   31.0           11.0         1.0   
2  2018-08-03           3  Training   15.0           19.0        21.0   
3  2018-08-04           4  Training   31.0           35.0        18.0   
4  2018-08-05           5         0    0.0            0.0         0.0   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              23.0         29.0          26.5  ...   
4             0.0               0.0          0.0           0.0

In [22]:
# Cell 2.4: Handle NaN values for previous day's data

# Input (E_*) and output (F_*) files for this step. The paths are built from the
# shared Drive directory instead of repeating the absolute path in every call.
shifted_train_test_path = os.path.join(drive_dataset_directory, 'E_Shifted_Train_Test_Data.csv')
shifted_unseen_path = os.path.join(drive_dataset_directory, 'E_Shifted_Unseen_Data.csv')
nan_handled_shifted_train_test_path = os.path.join(drive_dataset_directory, 'F_NaN_Handled_Shifted_Train_Test_Data.csv')
nan_handled_shifted_unseen_path = os.path.join(drive_dataset_directory, 'F_NaN_Handled_Shifted_Unseen_Data.csv')

# Load the shifted training/testing data
shifted_train_test_data = load_dataset(shifted_train_test_path)

# Manually set values for the first row of training/testing set.
# Row 0 has no preceding row in the file, so the shift step left its 'Prev_*' values at 0.
# NOTE(review): presumably these are the draws of the day before the first row — confirm the source.
shifted_train_test_data.at[0, 'Prev_Morning'] = 13
shifted_train_test_data.at[0, 'Prev_Afternoon'] = 34
shifted_train_test_data.at[0, 'Prev_Evening'] = 32
shifted_train_test_data.at[0, 'Prev_Night'] = 23

# Save the updated training/testing data
shifted_train_test_data.to_csv(nan_handled_shifted_train_test_path, index=False)

# Load the shifted unseen data
shifted_unseen_data = load_dataset(shifted_unseen_path)

# Manually set values for the first row of unseen set (same reasoning as above)
shifted_unseen_data.at[0, 'Prev_Morning'] = 25
shifted_unseen_data.at[0, 'Prev_Afternoon'] = 9
shifted_unseen_data.at[0, 'Prev_Evening'] = 7
shifted_unseen_data.at[0, 'Prev_Night'] = 5

# Save the updated unseen data
shifted_unseen_data.to_csv(nan_handled_shifted_unseen_path, index=False)

# Print the first few rows of both datasets for inspection
print("First few rows of handled shifted training/testing data:")
print(shifted_train_test_data.head())

print("\nFirst few rows of handled shifted unseen data:")
print(shifted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of handled shifted training/testing data:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training   19.0            7.0        27.0   
1  2018-08-02           2  Training   31.0           11.0         1.0   
2  2018-08-03           3  Training   15.0           19.0        21.0   
3  2018-08-04           4  Training   31.0           35.0        18.0   
4  2018-08-05           5         0    0.0            0.0         0.0   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              23.0         29.0          26.5  ...   
4             0.0               0.0          0.0           0.

In [23]:
# Cell 3.1: Converting the columns to integer in both datasets (excluding 'Date')

# Input (F_*) and output (G_*) files for this step. The paths are built from the
# shared Drive directory instead of repeating the absolute path in every call.
nan_handled_shifted_train_test_path = os.path.join(drive_dataset_directory, 'F_NaN_Handled_Shifted_Train_Test_Data.csv')
nan_handled_shifted_unseen_path = os.path.join(drive_dataset_directory, 'F_NaN_Handled_Shifted_Unseen_Data.csv')
handled_dtype_train_test_path = os.path.join(drive_dataset_directory, 'G_Handled_DataType_Train_Test_Data.csv')
handled_dtype_unseen_path = os.path.join(drive_dataset_directory, 'G_Handled_DataType_Unseen_Data.csv')

# Load the most recent CSVs
train_test_data = pd.read_csv(nan_handled_shifted_train_test_path)
unseen_data = pd.read_csv(nan_handled_shifted_unseen_path)

# List of columns to convert to integer (excluding 'Data_Type' and 'Date')
columns_to_convert_train_test = [col for col in train_test_data.columns if col not in ['Data_Type', 'Date']]
columns_to_convert_unseen = [col for col in unseen_data.columns if col not in ['Data_Type', 'Date']]

# Convert columns to integer
# NOTE(review): astype(int) truncates fractional values (e.g. a moving average of
# 27.5 becomes 27) — confirm that this loss of precision is intended.
train_test_data[columns_to_convert_train_test] = train_test_data[columns_to_convert_train_test].astype(int)
unseen_data[columns_to_convert_unseen] = unseen_data[columns_to_convert_unseen].astype(int)

# Save the updated datasets
train_test_data.to_csv(handled_dtype_train_test_path, index=False)
unseen_data.to_csv(handled_dtype_unseen_path, index=False)

# Display the data types of the columns after conversion
print("Data types of columns in train/test data after conversion:")
print(train_test_data.dtypes)

print("\nData types of columns in unseen data after conversion:")
print(unseen_data.dtypes)


Data types of columns in train/test data after conversion:
Date                object
Row Number           int64
Data_Type           object
Draw1                int64
DR1_Prev_Week        int64
DR1_2Weeks           int64
DR1_Prev_Entry       int64
DR1_Prev_Entry-2     int64
DR1_Mov_Avg          int64
DR1_Vert_Avg         int64
Draw2                int64
DR2_Prev_Week        int64
DR2_2Weeks           int64
DR2_Prev_Entry       int64
DR2_Prev_Entry-2     int64
DR2_Mov_Avg          int64
DR2_Vert_Avg         int64
Draw3                int64
DR3_Prev_Week        int64
DR3_2Weeks           int64
DR3_Prev_Entry       int64
DR3_Prev_Entry-2     int64
DR3_Mov_Avg          int64
DR3_Vert_Avg         int64
Draw4                int64
DR4_Prev_Week        int64
DR4_2Weeks           int64
DR4_Prev_Entry       int64
DR4_Prev_Entry-2     int64
DR4_Mov_Avg          int64
DR4_Vert_Avg         int64
Year                 int64
Month                int64
Day                  int64
Prev_Morning         in

In [24]:
# CELL 3.2: Flagging and Exemption of Sundays and Public Holidays

import pandas as pd
import numpy as np

# Define the base directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Load datasets function
def load_dataset(filename):
    """Read a CSV into a DataFrame.

    ``filename`` may be a bare file name, which is resolved inside ``base_dir``, or an
    absolute path, which is used unchanged. Accepting absolute paths keeps the earlier
    cells, which call ``load_dataset`` with full paths, working when they are re-run
    after this definition has replaced theirs.

    Unlike the earlier path-based definition, a missing file is not turned into a
    ``None`` return; whatever ``pd.read_csv`` raises propagates to the caller.
    """
    import os  # local import keeps the helper usable when only this cell was run

    # os.path.join discards base_dir when filename is already absolute
    return pd.read_csv(os.path.join(base_dir, filename))

# Save datasets function
def save_dataset(df, filename):
    """Write ``df`` to CSV without the row index.

    ``filename`` is resolved like in ``load_dataset``: relative names are placed inside
    ``base_dir`` and absolute paths are used unchanged.
    """
    import os  # local import keeps the helper usable when only this cell was run

    df.to_csv(os.path.join(base_dir, filename), index=False)

# Read the integer-typed datasets written by the previous step
train_test_data = load_dataset('G_Handled_DataType_Train_Test_Data.csv')
unseen_data = load_dataset('G_Handled_DataType_Unseen_Data.csv')

# Public holidays, 2018 through 2025, as ISO date strings
public_holidays_list = [
    "2018-03-30", "2018-05-31", "2018-06-15", "2018-11-06", "2018-12-25",
    "2019-03-30", "2019-04-19", "2019-06-05", "2019-06-20", "2019-10-27", "2019-12-25",
    "2020-03-30", "2020-04-10", "2020-05-24", "2020-06-11", "2020-11-14", "2020-12-25",
    "2021-03-30", "2021-04-03", "2021-05-13", "2021-06-03", "2021-11-04", "2021-12-25",
    "2022-03-30", "2022-04-15", "2022-05-02", "2022-06-16", "2022-10-24", "2022-12-26",
    "2023-03-30", "2023-04-07", "2023-04-22", "2023-06-08", "2023-11-12", "2023-12-25",
    "2024-03-29", "2024-03-30", "2024-04-10", "2024-05-30", "2024-10-31", "2024-12-25",
    # Including 2025 holidays
    "2025-01-01", "2025-03-30", "2025-03-31", "2025-04-18", "2025-04-21",
    "2025-05-30", "2025-06-19", "2025-08-01", "2025-08-31", "2025-09-01",
    "2025-09-24", "2025-10-20", "2025-12-25", "2025-12-26"
]

# Parsed holiday dates, used for the membership test below
public_holidays = pd.to_datetime(public_holidays_list)

# Add the same day flags to each dataset
for flagged_frame in (train_test_data, unseen_data):
    # 'Date' must be datetime for the holiday lookup and the weekday accessor
    flagged_frame['Date'] = pd.to_datetime(flagged_frame['Date'])
    flagged_frame['Is_Holiday'] = flagged_frame['Date'].isin(public_holidays)
    # dayofweek counts from Monday = 0, so 6 is Sunday
    flagged_frame['Is_Sunday'] = flagged_frame['Date'].dt.dayofweek == 6
    # A special day is a public holiday or a Sunday
    flagged_frame['Is_Special_Day'] = flagged_frame['Is_Holiday'] | flagged_frame['Is_Sunday']

# Debugging - show the flags of the first 20 training/testing rows
print(train_test_data[['Date', 'Is_Special_Day', 'Is_Holiday', 'Is_Sunday']].head(20))

# Assuming 'Draw1', 'Draw2', 'Draw3', and 'Draw4' are the draw columns
draw_columns = ['Draw1', 'Draw2', 'Draw3', 'Draw4']

# Boolean masks marking the days on which every draw is zero. They are kept as
# standalone Series rather than a temporary column, so nothing has to be dropped in
# place from the filtered (sliced) frames afterwards.
train_test_all_draws_zero = train_test_data[draw_columns].eq(0).all(axis=1)
unseen_all_draws_zero = unseen_data[draw_columns].eq(0).all(axis=1)

# Remove rows where it's a special day AND all draws are zero
train_test_data = train_test_data[~(train_test_data['Is_Special_Day'] & train_test_all_draws_zero)]
unseen_data = unseen_data[~(unseen_data['Is_Special_Day'] & unseen_all_draws_zero)]

# Now continue to save the datasets
save_dataset(train_test_data, 'H_Train_Test_Data_Excluding_Special_Days.csv')
save_dataset(unseen_data, 'H_Unseen_Data_Excluding_Special_Days.csv')
print("Datasets excluding special days saved successfully.")


         Date  Is_Special_Day  Is_Holiday  Is_Sunday
0  2018-08-01           False       False      False
1  2018-08-02           False       False      False
2  2018-08-03           False       False      False
3  2018-08-04           False       False      False
4  2018-08-05            True       False       True
5  2018-08-06           False       False      False
6  2018-08-07           False       False      False
7  2018-08-08           False       False      False
8  2018-08-09           False       False      False
9  2018-08-10           False       False      False
10 2018-08-11           False       False      False
11 2018-08-12            True       False       True
12 2018-08-13           False       False      False
13 2018-08-14           False       False      False
14 2018-08-15           False       False      False
15 2018-08-16           False       False      False
16 2018-08-17           False       False      False
17 2018-08-18           False       False     

In [25]:
# CELL 3.3: Adjusted Logic for Handling 'Prev_' Columns Without Future Data Leakage

import pandas as pd
import numpy as np

# Define the base directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Function to load and save datasets
def load_dataset(filename):
    """Read a CSV into a DataFrame.

    ``filename`` may be a bare file name, which is resolved inside ``base_dir``, or an
    absolute path, which is used unchanged. Accepting absolute paths keeps the earlier
    cells, which call ``load_dataset`` with full paths, working when they are re-run
    after this definition has replaced theirs.

    A missing file is not turned into a ``None`` return; whatever ``pd.read_csv``
    raises propagates to the caller.
    """
    import os  # local import keeps the helper usable when only this cell was run

    # os.path.join discards base_dir when filename is already absolute
    return pd.read_csv(os.path.join(base_dir, filename))

def save_dataset(df, filename):
    """Write ``df`` to CSV without the row index.

    ``filename`` is resolved like in ``load_dataset``: relative names are placed inside
    ``base_dir`` and absolute paths are used unchanged.
    """
    import os  # local import keeps the helper usable when only this cell was run

    df.to_csv(os.path.join(base_dir, filename), index=False)

# Read the datasets written by the special-day exclusion step
train_test_data, unseen_data = (
    load_dataset('H_Train_Test_Data_Excluding_Special_Days.csv'),
    load_dataset('H_Unseen_Data_Excluding_Special_Days.csv'),
)

# Assuming train_test_data is your DataFrame

# Function to conditionally shift values backward by one day
def conditional_shift(df, target_column, source_column):
    """Fill zero entries of ``target_column`` from the previous row's ``source_column``.

    Rows are visited top to bottom starting at the second row. Wherever the target
    value equals 0 it is overwritten, in place, with the source value of the row
    directly above. The first row is never modified. Returns None.

    Rows are addressed by position rather than by index label, so the function also
    works on frames whose index is not the default 0..n-1 range (for example after
    rows have been filtered out).
    """
    # Assumes both column names are unique in df, so get_loc yields an integer position.
    target_pos = df.columns.get_loc(target_column)
    source_pos = df.columns.get_loc(source_column)

    for row_pos in range(1, len(df)):  # Start from the second row
        if df.iat[row_pos, target_pos] == 0:
            # Replace with the source value of the row directly above
            df.iat[row_pos, target_pos] = df.iat[row_pos - 1, source_pos]

# Zero-valued entry features are filled from the previous row's 'Prev_*' value:
# all three fills run on the training/testing data first, then on the unseen data
for entry_frame in (train_test_data, unseen_data):
    for entry_target, entry_source in (
        ('DR1_Prev_Entry', 'Prev_Night'),
        ('DR2_Prev_Entry-2', 'Prev_Night'),
        ('DR1_Prev_Entry-2', 'Prev_Afternoon'),
    ):
        conditional_shift(entry_frame, entry_target, entry_source)

# 'Prev_*' columns that get the adjusted logic
columns_to_adjust = ['Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prev_Night']

# The draw column each of them is derived from, in the same order
original_draw_columns = ['Draw1', 'Draw2', 'Draw3', 'Draw4']

# 'Prev_*' column -> matching 'Draw*' column
columns_mapping = dict(zip(columns_to_adjust, original_draw_columns))

# Wherever a 'Prev_*' value is 0, take the matching draw from the row above.
# The training/testing data is processed first, then the unseen data.
for prev_frame in (train_test_data, unseen_data):
    for prev_col, draw_col in columns_mapping.items():
        # Index labels of the rows whose 'Prev_*' value is currently 0
        indices_to_shift = prev_frame.loc[prev_frame[prev_col] == 0].index
        for idx in indices_to_shift:
            if idx > 0:  # The first row has no row above it
                prev_frame.at[idx, prev_col] = prev_frame.at[idx-1, draw_col]

# Note: This approach assumes the first row does not contain zeros in 'Prev_' columns.
# If the first row can have zeros, additional logic is needed to handle those cases.
# Rename each 'DRn_Vert_Avg' column (n = 1..4) to 'DRn_Mov_Avg_2' in both datasets
vert_avg_renames = {
    'DR1_Vert_Avg': 'DR1_Mov_Avg_2',
    'DR2_Vert_Avg': 'DR2_Mov_Avg_2',
    'DR3_Vert_Avg': 'DR3_Mov_Avg_2',
    'DR4_Vert_Avg': 'DR4_Mov_Avg_2',
}
for renamed_frame in (train_test_data, unseen_data):
    renamed_frame.rename(columns=vert_avg_renames, inplace=True)

# Debug: show the resulting column sets
print("Columns in Training/Testing Data after renaming and feature addition:", train_test_data.columns)
print("Columns in Unseen Data after renaming and feature addition:", unseen_data.columns)

# Write both datasets, with their adjusted 'Prev_' columns, to the I_* files
for adjusted_frame, adjusted_name in (
    (train_test_data, 'I_Train_Test_Data_adjusted_Prev_columns.csv'),
    (unseen_data, 'I_Unseen_Data_adjusted_Prev_columns.csv'),
):
    save_dataset(adjusted_frame, adjusted_name)

print("Adjusted 'Prev_' columns completed and datasets saved successfully.")


Columns in Training/Testing Data after renaming and feature addition: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Mov_Avg_2', 'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks',
       'DR2_Prev_Entry', 'DR2_Prev_Entry-2', 'DR2_Mov_Avg', 'DR2_Mov_Avg_2',
       'Draw3', 'DR3_Prev_Week', 'DR3_2Weeks', 'DR3_Prev_Entry',
       'DR3_Prev_Entry-2', 'DR3_Mov_Avg', 'DR3_Mov_Avg_2', 'Draw4',
       'DR4_Prev_Week', 'DR4_2Weeks', 'DR4_Prev_Entry', 'DR4_Prev_Entry-2',
       'DR4_Mov_Avg', 'DR4_Mov_Avg_2', 'Year', 'Month', 'Day', 'Prev_Morning',
       'Prev_Afternoon', 'Prev_Evening', 'Prev_Night', 'Is_Holiday',
       'Is_Sunday', 'Is_Special_Day'],
      dtype='object')
Columns in Unseen Data after renaming and feature addition: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Mov_Avg_2', 'Dr

In [26]:
# Cell 4.1: Introducing "Lines" as a new feature in both datasets

import pandas as pd

# Folder that holds the intermediate CSV files
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Read the datasets written by the 'Prev_' column adjustment step
train_test_data = pd.read_csv(f'{base_dir}I_Train_Test_Data_adjusted_Prev_columns.csv')
unseen_data = pd.read_csv(f'{base_dir}I_Unseen_Data_adjusted_Prev_columns.csv')

# Line number (1-9) -> the four numbers on that line.
# Line k holds k, k + 9, k + 18 and k + 27, so the nine lines cover 1..36.
lines_directory = {
    line_no: [line_no + 9 * step for step in range(4)]
    for line_no in range(1, 10)
}

# Function to calculate the sum of digits and map to a 'Line'
def sum_to_line(x):
    """Reduce ``x`` to a single digit by repeatedly summing its decimal digits.

    Returns a value from 0 to 9: 0 only for an input of 0, otherwise 1-9
    (e.g. 23 -> 5, 19 -> 10 -> 1). Whole-number floats such as 23.0 are treated as
    the integer they represent; previously the '.' in their string form made them
    count as invalid. Anything whose text is not made up of digits only (NaN,
    negative numbers, fractional values, arbitrary strings, None) yields 0.
    """
    # A column that held missing values is stored as float by pandas, so draws can
    # arrive as 23.0 rather than 23.
    if isinstance(x, float) and x.is_integer():
        x = int(x)

    try:
        digit_total = sum(int(digit) for digit in str(x))
        # Keep summing until a single digit remains
        while digit_total > 9:
            digit_total = sum(int(digit) for digit in str(digit_total))
        return digit_total
    except ValueError:
        # Return 0 if the value cannot be read as a whole number (e.g., missing or non-numeric data)
        return 0

# 'Line_Prev_Entry' is the line that the previous night's draw reduces to
for lined_frame in (train_test_data, unseen_data):
    lined_frame['Line_Prev_Entry'] = lined_frame['Prev_Night'].apply(sum_to_line)

# Function to extract numbers from the line subset for the active line
def extract_line_numbers(row, lines_dict):
    """
    Write the members of the row's active line into 'Line_PE_Num_<i>' entries.

    The active line is read from row['Line_PE_Entry'.replace('PE_', 'Prev_')].
    When it is 0 or is not a key of `lines_dict`, four zero entries
    ('Line_PE_Num_1' .. 'Line_PE_Num_4') are written instead. The row is
    modified in place and also returned, so the function can be used with
    DataFrame.apply(axis=1).
    """
    line_id = row['Line_Prev_Entry']

    if line_id != 0 and line_id in lines_dict:
        members = lines_dict[line_id]
    else:
        # No draw on that day, or an unknown line: fall back to zeros
        members = [0] * 4

    for position, value in enumerate(members, start=1):
        row[f'Line_PE_Num_{position}'] = value

    return row

# Apply the function to each row of the DataFrame
train_test_data = train_test_data.apply(lambda row: extract_line_numbers(row, lines_directory), axis=1)
unseen_data = unseen_data.apply(lambda row: extract_line_numbers(row, lines_directory), axis=1)

# Save the updated datasets with 'Lines' as new features
train_test_data.to_csv(base_dir + 'J_Lines_Train_Test_Data.csv', index=False)
unseen_data.to_csv(base_dir + 'J_Lines_Unseen_Data.csv', index=False)

# Display the first few rows of both datasets to verify the "Lines" assignment
print("First few rows of train/test data with 'Lines' assigned:")
print(train_test_data.head())

print("\nFirst few rows of unseen data with 'Lines' assigned:")
print(unseen_data.head())


First few rows of train/test data with 'Lines' assigned:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training     19              7          27   
1  2018-08-02           2  Training     31             11           1   
2  2018-08-03           3  Training     15             19          21   
3  2018-08-04           4  Training     31             35          18   
4  2018-08-06           6  Training     31             18          22   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Mov_Avg_2  ...  \
0              23                32           27             17  ...   
1               9                33           21              6  ...   
2              12                35           23             20  ...   
3              35                23           29             26  ...   
4              16                29           22             20  ...   

   Prev_Evening  Prev_Night  Is_Holiday  Is_Sunday  Is_Special_Day  \
0

In [27]:
# Cell 4.2: Introducing "Special Groups" as a new feature in both datasets (disabled: every statement in this cell is commented out)

# Load the most recent CSVs
#train_test_data = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/M_Lines_Train_Test_Data.csv')
#unseen_data = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/N_Lines_Unseen_Data.csv')

# Define the mapping for "Special Groups"
#special_groups_mapping = {
#    2: 1, 15: 1, 16: 1, 24: 1, 31: 1,  # "Ladies"
#    4: 2, 5: 2, 12: 2, 29: 2, 34: 2,  # "Men"
#    11: 3, 17: 3, 26: 3,  # "Birds"
#    7: 4, 9: 4, 19: 4, 20: 4, 22: 4, 30: 4, 36: 4,  # "Domestic Animals"
#    8: 5, 10: 5, 13: 5, 25: 5,  # "Wild Animals"
#    18: 6, 28: 6, 32: 6,  # "Ocean"
#    1: 7, 27: 7, 33: 7, 35: 7,  # "Snakes & Insects"
#    3: 8, 6: 8, 14: 8, 21: 8, 23: 8  # "Home"
#}

# Function to assign "Special Groups" based on the mapping
#def assign_special_groups(data, column_name, special_groups_mapping):
#    data[f'Special_Groups_{column_name}'] = data[column_name].map(special_groups_mapping).fillna(0).astype(int)

# List of columns to assign "Special Groups"
#columns_to_assign_special_groups = ['Draw1', 'DR1_Prev_Week', 'DR1_Prev_Entry']

# Assign "Special Groups" for specified columns in train/test data
#for column in columns_to_assign_special_groups:
#    assign_special_groups(train_test_data, column, special_groups_mapping)

# Assign "Special Groups" for specified columns in unseen data
#for column in columns_to_assign_special_groups:
#    assign_special_groups(unseen_data, column, special_groups_mapping)

# Save the updated datasets
#train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/O_Special_Groups_Train_Test_Data.csv', index=False)
#unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/P_Special_Groups_Unseen_Data.csv', index=False)

# Display the first few rows of both datasets to verify the "Special Groups" assignment
#print("First few rows of train/test data with 'Special Groups' assigned:")
#print(train_test_data.head())

#print("\nFirst few rows of unseen data with 'Special Groups' assigned:")
#print(unseen_data.head())


In [28]:
# Cell 4.3: Introducing "Spirits" as a new feature in both datasets

import pandas as pd

# Define the directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Load the most recent CSVs
train_test_data = pd.read_csv(base_dir + 'J_Lines_Train_Test_Data.csv')
unseen_data = pd.read_csv(base_dir + 'J_Lines_Unseen_Data.csv')

# Define a mapping dictionary for "Spirits" pairs
# The table is symmetric (if a maps to b then b maps to a), so it is written
# as 18 unordered pairs and mirrored. Sorting keeps the keys in 1..36 order.
spirits_mapping = dict(sorted(
    link
    for low, high in (
        (1, 5), (2, 24), (3, 19), (4, 35), (6, 15), (7, 13),
        (8, 29), (9, 33), (10, 28), (11, 36), (12, 32), (14, 25),
        (16, 17), (18, 30), (20, 22), (21, 23), (26, 27), (31, 34),
    )
    for link in ((low, high), (high, low))
))

# Function to map "Prev_Night" to its spirit pair
def map_to_spirit(x, spirits_dict):
    """Look up the spirit partner of `x`; None when `x` is not a key of `spirits_dict`."""
    if x in spirits_dict:
        return spirits_dict[x]
    return None

# Create the 'Spirit_PE_Num' column
train_test_data['Spirit_PE_Num'] = train_test_data['Prev_Night'].apply(lambda x: map_to_spirit(x, spirits_mapping))
unseen_data['Spirit_PE_Num'] = unseen_data['Prev_Night'].apply(lambda x: map_to_spirit(x, spirits_mapping))

# Replace NaNs with 0 and convert to int
train_test_data['Spirit_PE_Num'] = train_test_data['Spirit_PE_Num'].fillna(0).astype(int)
unseen_data['Spirit_PE_Num'] = unseen_data['Spirit_PE_Num'].fillna(0).astype(int)

# Save the updated datasets with 'Spirits' as new features
train_test_data.to_csv(base_dir + 'K_Spirits_Train_Test_Data.csv', index=False)
unseen_data.to_csv(base_dir + 'K_Spirits_Unseen_Data.csv', index=False)

# Display the first few rows of both datasets to verify the "Spirits" assignment
print("First few rows of train/test data with 'Spirits' assigned:")
print(train_test_data.head())

print("\nFirst few rows of unseen data with 'Spirits' assigned:")
print(unseen_data.head())


First few rows of train/test data with 'Spirits' assigned:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training     19              7          27   
1  2018-08-02           2  Training     31             11           1   
2  2018-08-03           3  Training     15             19          21   
3  2018-08-04           4  Training     31             35          18   
4  2018-08-06           6  Training     31             18          22   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Mov_Avg_2  ...  \
0              23                32           27             17  ...   
1               9                33           21              6  ...   
2              12                35           23             20  ...   
3              35                23           29             26  ...   
4              16                29           22             20  ...   

   Prev_Night  Is_Holiday  Is_Sunday  Is_Special_Day  Line_Prev_Entry

In [29]:
# Cell 4.4: Introducing "Rakes" as a new feature in both datasets

import pandas as pd

# Define the directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Load the most recent CSVs
train_test_data = pd.read_csv(base_dir + 'K_Spirits_Train_Test_Data.csv')
unseen_data = pd.read_csv(base_dir + 'K_Spirits_Unseen_Data.csv')

# Define a mapping dictionary for "Rakes" numbers
# Lookup table: each number 1-36 maps to a list of four "rake" numbers.
# NOTE(review): keys 1 and 10 carry identical lists ([7, 12, 15, 36]) —
# confirm against the source chart that this is intended and not a
# copy/paste slip.
rakes_mapping = {
    1: [7, 12, 15, 36],
    2: [4, 14, 16, 23],
    3: [19, 22, 34, 35],
    4: [7, 11, 14, 32],
    5: [20, 23, 31, 33],
    6: [14, 24, 30, 32],
    7: [4, 11, 24, 29],
    8: [12, 14, 33, 36],
    9: [7, 11, 24, 32],
    10: [7, 12, 15, 36],
    11: [4, 7, 17, 22],
    12: [1, 10, 28, 35],
    13: [10, 25, 28, 29],
    14: [16, 23, 31, 33],
    15: [5, 6, 23, 36],
    16: [14, 31, 35, 36],
    17: [11, 16, 26, 29],
    18: [5, 7, 28, 33],
    19: [10, 27, 32, 36],
    20: [4, 14, 24, 30],
    21: [16, 22, 24, 29],
    22: [11, 30, 32, 34],
    23: [12, 14, 16, 20],
    24: [2, 6, 12, 21],
    25: [1, 3, 13, 18],
    26: [1, 2, 11, 17],
    27: [14, 16, 19, 35],
    28: [12, 13, 18, 33],
    29: [7, 13, 16, 28],
    30: [6, 8, 20, 22],
    31: [5, 14, 16, 36],
    32: [4, 6, 28, 36],
    33: [5, 10, 14, 20],
    34: [1, 12, 20, 22],
    35: [3, 12, 14, 16],
    36: [1, 8, 16, 32],
}

# Function to assign "Rakes" based on the 'Prev_Night' value
def assign_rakes(data, column_name, rakes_dict):
    """
    Add the four 'Rake_PE_Num_<i>' columns looked up from `column_name`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place.
    column_name : str
        Column whose values are used as keys into `rakes_dict`.
    rakes_dict : dict
        Maps a number to its list of rake numbers. Only the first four
        entries of each list are used; shorter lists are padded with 0.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with 'Rake_PE_Num_1' .. 'Rake_PE_Num_4'
        added. Rows whose value is not a key of `rakes_dict` get zeros.
    """
    rake_count = 4
    no_rakes = [0] * rake_count

    # Look the values up positionally. The previous iterrows() + .at version
    # wrote by index label, so frames with repeated labels had rows
    # overwritten by each other; it was also far slower.
    rake_rows = [rakes_dict.get(value, no_rakes) for value in data[column_name]]

    for i in range(1, rake_count + 1):
        column_values = [
            numbers[i - 1] if len(numbers) >= i else 0
            for numbers in rake_rows
        ]
        # An empty frame still gets the (empty) column, created from scalar 0
        data[f'Rake_PE_Num_{i}'] = column_values if column_values else 0

    return data

# Apply 'assign_rakes' function to create new Rakes columns
train_test_data = assign_rakes(train_test_data, 'Prev_Night', rakes_mapping)
unseen_data = assign_rakes(unseen_data, 'Prev_Night', rakes_mapping)

# Save the updated datasets with 'Rakes' as new features
train_test_data.to_csv(base_dir + 'L_Rakes_Train_Test_Data.csv', index=False)
unseen_data.to_csv(base_dir + 'L_Rakes_Unseen_Data.csv', index=False)

# Display the first few rows of both datasets to verify the "Rakes" assignment
print("First few rows of train/test data with 'Rakes' assigned:")
print(train_test_data[['Prev_Night', 'Rake_PE_Num_1', 'Rake_PE_Num_2', 'Rake_PE_Num_3', 'Rake_PE_Num_4']].head())

print("\nFirst few rows of unseen data with 'Rakes' assigned:")
print(unseen_data[['Prev_Night', 'Rake_PE_Num_1', 'Rake_PE_Num_2', 'Rake_PE_Num_3', 'Rake_PE_Num_4']].head())


First few rows of train/test data with 'Rakes' assigned:
   Prev_Night  Rake_PE_Num_1  Rake_PE_Num_2  Rake_PE_Num_3  Rake_PE_Num_4
0          23             12             14             16             20
1           9              7             11             24             32
2          12              1             10             28             35
3          35              3             12             14             16
4          16             14             31             35             36

First few rows of unseen data with 'Rakes' assigned:
   Prev_Night  Rake_PE_Num_1  Rake_PE_Num_2  Rake_PE_Num_3  Rake_PE_Num_4
0           5             20             23             31             33
1          18              5              7             28             33
2          28             12             13             18             33
3           2              4             14             16             23
4          12              1             10             28             35


In [30]:
# CELL 5.1: Creation of Arithmetical Features for 'Draw1'

import pandas as pd
import numpy as np

# Define the directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Load datasets
def load_dataset(filename):
    """Read the CSV called `filename` located under the module-level `base_dir`."""
    return pd.read_csv(f'{base_dir}{filename}')

def save_dataset(df, filename):
    """Write `df` as CSV (without the index) to `filename` under the module-level `base_dir`."""
    df.to_csv(f'{base_dir}{filename}', index=False)

train_test_data = load_dataset('L_Rakes_Train_Test_Data.csv')
unseen_data = load_dataset('L_Rakes_Unseen_Data.csv')
def create_arithmetical_features_draw1(df, window_sizes=(3, 5, 10)):
    """
    Add lagged rolling-window statistics of 'Draw1' to `df` in place.

    For every window size w the following columns are created, each computed
    over the w rows that precede the current row (the statistic is shifted
    down by one row so a row never sees its own 'Draw1'):
    Draw1_Moving_Avg_w, Draw1_Median_w, Draw1_Std_Dev_w, Draw1_RMS_w,
    Draw1_Rolling_Min_w, Draw1_Rolling_Max_w, Draw1_Skew_w, Draw1_Kurtosis_w,
    Draw1_EMA_w, Draw1_Rolling_Var_w and Draw1_Rolling_Range_w.

    Rows without enough history are left as NaN. They used to be back-filled
    with `fillna(method='bfill')`, which copied statistics computed from the
    row's own and later draws into the first rows, i.e. look-ahead leakage
    (and `method=` is deprecated in recent pandas). The caller decides how to
    impute them; this notebook does so with `fill_nan_values`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Draw1' column, in chronological row order.
    window_sizes : iterable of int
        Rolling window lengths. The default is a tuple so it cannot be
        mutated between calls.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new columns added.
    """
    draws = df['Draw1']

    for window in window_sizes:
        rolling = draws.rolling(window=window)

        # Rolling statistics, lagged by one row
        df[f'Draw1_Moving_Avg_{window}'] = rolling.mean().shift(1)
        df[f'Draw1_Median_{window}'] = rolling.median().shift(1)
        df[f'Draw1_Std_Dev_{window}'] = rolling.std().shift(1)
        # raw=True hands NumPy arrays to the lambda instead of building a
        # Series per window
        df[f'Draw1_RMS_{window}'] = np.sqrt(
            rolling.apply(lambda values: np.mean(np.square(values)), raw=True)
        ).shift(1)
        df[f'Draw1_Rolling_Min_{window}'] = rolling.min().shift(1)
        df[f'Draw1_Rolling_Max_{window}'] = rolling.max().shift(1)
        df[f'Draw1_Skew_{window}'] = rolling.skew().shift(1)
        df[f'Draw1_Kurtosis_{window}'] = rolling.kurt().shift(1)

        # Exponential moving average, lagged the same way
        df[f'Draw1_EMA_{window}'] = draws.ewm(span=window, adjust=False).mean().shift(1)
        df[f'Draw1_Rolling_Var_{window}'] = rolling.var().shift(1)
        df[f'Draw1_Rolling_Range_{window}'] = (
            df[f'Draw1_Rolling_Max_{window}'] - df[f'Draw1_Rolling_Min_{window}']
        )

    return df

# Apply feature creation for 'Draw1'
train_test_data = create_arithmetical_features_draw1(train_test_data)
unseen_data = create_arithmetical_features_draw1(unseen_data)

# The names below must match the columns created by create_arithmetical_features_draw1 for window sizes 3, 5 and 10

# List of all the newly created feature names
new_feature_names = [
    'Draw1_Moving_Avg_3', 'Draw1_Median_3', 'Draw1_Std_Dev_3', 'Draw1_RMS_3', 'Draw1_Rolling_Min_3', 'Draw1_Rolling_Max_3', 'Draw1_Skew_3', 'Draw1_Kurtosis_3', 'Draw1_EMA_3', 'Draw1_Rolling_Var_3', 'Draw1_Rolling_Range_3',
    'Draw1_Moving_Avg_5', 'Draw1_Median_5', 'Draw1_Std_Dev_5', 'Draw1_RMS_5', 'Draw1_Rolling_Min_5', 'Draw1_Rolling_Max_5', 'Draw1_Skew_5', 'Draw1_Kurtosis_5', 'Draw1_EMA_5', 'Draw1_Rolling_Var_5', 'Draw1_Rolling_Range_5',
    'Draw1_Moving_Avg_10', 'Draw1_Median_10', 'Draw1_Std_Dev_10', 'Draw1_RMS_10', 'Draw1_Rolling_Min_10', 'Draw1_Rolling_Max_10', 'Draw1_Skew_10', 'Draw1_Kurtosis_10', 'Draw1_EMA_10', 'Draw1_Rolling_Var_10', 'Draw1_Rolling_Range_10'
]

# Fill NaN values for these features
# You can choose to fill with 0, mean, median, or use forward/backward filling
def fill_nan_values(df, features):
    """Replace NaN with 0 in each column named in `features`; `df` is modified in place and nothing is returned."""
    for column_name in features:
        filled = df[column_name].fillna(0)
        df[column_name] = filled

fill_nan_values(train_test_data, new_feature_names)
fill_nan_values(unseen_data, new_feature_names)

# Check if there are any remaining NaNs in these features
remaining_nans = train_test_data[new_feature_names].isna().sum().sum()
print(f"Remaining NaNs in new features: {remaining_nans}")

# Debug: Confirm the addition of new features
print("Shape of Training/Testing Data after feature addition:", train_test_data.shape)
print("Shape of Unseen Data after feature addition:", unseen_data.shape)

# Save the enhanced datasets
save_dataset(train_test_data, 'M_Arithmetic_Features_Train_Test_Data.csv')
save_dataset(unseen_data, 'M_Arithmetic_Features_Unseen_Data.csv')
print("Enhanced datasets saved successfully.")


Remaining NaNs in new features: 0
Shape of Training/Testing Data after feature addition: (1409, 84)
Shape of Unseen Data after feature addition: (105, 84)
Enhanced datasets saved successfully.


In [31]:
# CELL 5.2: Creation of Temporal Features

import pandas as pd
import numpy as np

# Define the directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Load datasets function
def load_dataset(filename):
    """Read the CSV called `filename` located under the module-level `base_dir`."""
    return pd.read_csv(f'{base_dir}{filename}')

# Save dataset function
def save_dataset(df, filename):
    """Write `df` as CSV (without the index) to `filename` under the module-level `base_dir`."""
    df.to_csv(f'{base_dir}{filename}', index=False)

# Load your datasets
train_test_data = load_dataset('M_Arithmetic_Features_Train_Test_Data.csv')
unseen_data = load_dataset('M_Arithmetic_Features_Unseen_Data.csv')

def create_temporal_features(df):
    """
    Add calendar, lag and per-number history features to `df` in place.

    Every history feature of a row is computed from earlier rows only, so a
    row never encodes its own draws.

    Columns added
    -------------
    DayofWeek, DayOfYear
        Taken from 'Date' (which is converted to datetime).
    Draw<k>_Prev, Draw<k>_Change (k = 1..4)
        The previous row's draw, and the difference between the previous two
        rows' draws.
    Num_<n>_Interval_Last (n = 1..36)
        Number of rows since n last appeared in any of Draw1..Draw4 before
        the current row (1 = it appeared in the previous row). If n has not
        appeared yet, the count runs from just before the first row.
    Num_<n>_Cum_Count (n = 1..36)
        How many earlier rows contained n.

    Rows without enough history yield NaN, which the final fillna(0) turns
    into 0.

    Fixes relative to the previous version: Interval_Last was
    `(~mask).cumsum()`, a never-resetting count of rows without the number
    rather than an interval; both Num_* features and the back-filled
    Prev/Change columns used the current row's draws; and
    `fillna(method='bfill')` is deprecated in recent pandas.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Date', 'Draw1', 'Draw2', 'Draw3' and 'Draw4' columns, in
        chronological row order.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new columns added and NaN replaced by 0.
    """
    draw_columns = ['Draw1', 'Draw2', 'Draw3', 'Draw4']

    # Convert 'Date' to datetime format if not already done
    df['Date'] = pd.to_datetime(df['Date'])

    # Extract Day of the Week from 'Date'
    df['DayofWeek'] = df['Date'].dt.dayofweek

    # Lag the draws so only results up to the previous row are used. The
    # first rows stay NaN here and become 0 at the end instead of being
    # back-filled with the row's own or later values.
    for draw in draw_columns:
        df[f'{draw}_Prev'] = df[draw].shift(1)
        df[f'{draw}_Change'] = df[f'{draw}_Prev'].diff()

    # Adding Day of Year for seasonality
    df['DayOfYear'] = df['Date'].dt.dayofyear

    # Interval Since Last Appearance and Cumulative Count for each number
    max_num = 36
    draws = df[draw_columns]
    position = pd.Series(range(len(df)), index=df.index)
    for num in range(1, max_num + 1):
        # True where any of the four draws of the row equals num (vectorised
        # replacement for the row-wise `num in x.values` apply)
        appeared = draws.eq(num).any(axis=1)

        # Position of the latest appearance strictly before each row;
        # -1 stands for "not seen yet"
        last_seen = position.where(appeared).ffill().shift(1).fillna(-1)
        df[f'Num_{num}_Interval_Last'] = (position - last_seen).astype(int)

        # Appearances in earlier rows only
        df[f'Num_{num}_Cum_Count'] = appeared.cumsum().shift(1, fill_value=0)

    # Impute NaNs with zero
    df.fillna(0, inplace=True)

    return df

# Applying the function to your datasets
train_test_data = create_temporal_features(train_test_data)
unseen_data = create_temporal_features(unseen_data)

# Debug: Print the shape to confirm the addition of new temporal features
print("Shape of Training/Testing Data after temporal feature addition:", train_test_data.shape)
print("Shape of Unseen Data after temporal feature addition:", unseen_data.shape)

# Check for missing values (should be zero now)
print("\nMissing values in Training/Testing Data:\n", train_test_data.isnull().sum())
print("\nMissing values in Unseen Data:\n", unseen_data.isnull().sum())

# Save the enhanced datasets
save_dataset(train_test_data, 'N_Temporal_Features_Train_Test_Data.csv')
save_dataset(unseen_data, 'N_Temporal_Features_Unseen_Data.csv')
print("Datasets with temporal features saved successfully.")

# Debug: Confirm the addition of new features
print("Columns in Training/Testing Data after temporal feature addition:", train_test_data.columns)
print("Columns in Unseen Data after temporal feature addition:", unseen_data.columns)

# Identify and print the new features
original_columns = ['Date', 'Draw1', 'Draw2', 'Draw3', 'Draw4']  # Assuming these are the original columns
new_columns = train_test_data.columns.tolist()  # Get the new columns

# Filter new columns
new_features = [col for col in new_columns if col not in original_columns]

print("\nNew features added:", new_features)


Shape of Training/Testing Data after temporal feature addition: (1409, 166)
Shape of Unseen Data after temporal feature addition: (105, 166)

Missing values in Training/Testing Data:
 Date                    0
Row Number              0
Data_Type               0
Draw1                   0
DR1_Prev_Week           0
                       ..
Num_34_Cum_Count        0
Num_35_Interval_Last    0
Num_35_Cum_Count        0
Num_36_Interval_Last    0
Num_36_Cum_Count        0
Length: 166, dtype: int64

Missing values in Unseen Data:
 Date                    0
Row Number              0
Data_Type               0
Draw1                   0
DR1_Prev_Week           0
                       ..
Num_34_Cum_Count        0
Num_35_Interval_Last    0
Num_35_Cum_Count        0
Num_36_Interval_Last    0
Num_36_Cum_Count        0
Length: 166, dtype: int64
Datasets with temporal features saved successfully.
Columns in Training/Testing Data after temporal feature addition: Index(['Date', 'Row Number', 'Data_Type',

In [32]:
# CELL 5.3: Feature Engineering (Interaction terms, Polynomial features, Domain-specific transformations, Clustering-based features)

# Import necessary libraries
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

# Define the directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Load datasets function
def load_dataset(filename):
    """Read the CSV called `filename` located under the module-level `base_dir`."""
    return pd.read_csv(f'{base_dir}{filename}')

# Save dataset function
def save_dataset(df, filename):
    """Write `df` as CSV (without the index) to `filename` under the module-level `base_dir`."""
    df.to_csv(f'{base_dir}{filename}', index=False)

# Load your datasets
train_test_data = load_dataset('N_Temporal_Features_Train_Test_Data.csv')
unseen_data = load_dataset('N_Temporal_Features_Unseen_Data.csv')

# Print the original columns
print("Original columns in DataFrame:", train_test_data.columns)

# Function to create specific interaction terms
def create_specific_interaction_terms(data):
    """
    Add pairwise product columns for a fixed list of '<Draw>_Change' pairs.

    For each listed pair (a, b) whose two columns both exist in `data`, a
    column named 'interaction_<a>_<b>' holding a * b is added. Pairs with a
    missing column are skipped silently. The ('Draw3_Change', 'Draw4_Change')
    pair is not part of the list. `data` is modified in place and returned.
    """
    pairs = (
        ('Draw1_Change', 'Draw2_Change'),
        ('Draw1_Change', 'Draw3_Change'),
        ('Draw1_Change', 'Draw4_Change'),
        ('Draw2_Change', 'Draw3_Change'),
        ('Draw2_Change', 'Draw4_Change'),
    )

    for left, right in pairs:
        # Skip any pair that cannot be formed from the available columns
        if left not in data.columns or right not in data.columns:
            continue
        data[f'interaction_{left}_{right}'] = data[left] * data[right]

    return data

# Update datasets with new interaction terms
train_test_data = create_specific_interaction_terms(train_test_data.copy())
unseen_data = create_specific_interaction_terms(unseen_data.copy())

# Function to add polynomial features
def add_polynomial_features(data, feature_list, degree=2):
    """
    Return a copy of `data` extended with polynomial terms of selected columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    feature_list : list of str
        Columns to expand. Names that are not columns of `data` are ignored.
    degree : int, default 2
        Maximum polynomial degree passed to PolynomialFeatures.

    Returns
    -------
    pandas.DataFrame
        `data` plus one column per generated term whose name is not already
        a column of `data` (for degree 2: the squares and pairwise products).
    """
    valid_features = [feature for feature in feature_list if feature in data.columns]
    if not valid_features:
        # Nothing to expand; still return a new frame, like the concat below
        return data.copy()

    poly = PolynomialFeatures(degree=degree, include_bias=False)
    poly_features = poly.fit_transform(data[valid_features])
    feature_names = poly.get_feature_names_out(valid_features)
    data_poly = pd.DataFrame(poly_features, columns=feature_names, index=data.index)

    # The expansion also emits the degree-1 terms, which carry the same names
    # as the input columns. Concatenating them produced duplicate column
    # names, so data['Draw1_Change'] became a two-column frame and a CSV
    # round trip renamed the copy to 'Draw1_Change.1'. Keep only new names.
    new_columns = [name for name in data_poly.columns if name not in data.columns]
    return pd.concat([data, data_poly[new_columns]], axis=1)

# Function to add clustering features
def add_clustering_features(data, feature_list, n_clusters=3):
    """
    Add a 'cluster' column with K-Means labels fitted on selected columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place.
    feature_list : list of str
        Columns to cluster on. Names that are not columns of `data` are
        ignored.
    n_clusters : int, default 3
        Number of clusters.

    Returns
    -------
    pandas.DataFrame
        The same `data` object with the 'cluster' column added.

    Raises
    ------
    ValueError
        If none of the names in `feature_list` is a column of `data`.

    Notes
    -----
    A new model is fitted on every call, so label ids obtained from two
    different frames (for example train and unseen) are not guaranteed to
    denote the same cluster.
    """
    valid_features = [feature for feature in feature_list if feature in data.columns]
    if not valid_features:
        raise ValueError(
            f"None of the clustering features {list(feature_list)} are present in the data."
        )

    # n_init is pinned: its default changed between scikit-learn releases
    # (with a FutureWarning in between), which made labels version-dependent.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    data['cluster'] = kmeans.fit_predict(data[valid_features])
    return data

# Apply polynomial and clustering features
polynomial_feature_list = ['Draw1_Change', 'Draw2_Change', 'Draw3_Change', 'Draw4_Change']
clustering_feature_list = ['DayofWeek']

train_test_data = add_polynomial_features(train_test_data, polynomial_feature_list)
unseen_data = add_polynomial_features(unseen_data, polynomial_feature_list)

train_test_data = add_clustering_features(train_test_data, clustering_feature_list)
unseen_data = add_clustering_features(unseen_data, clustering_feature_list)

# Print the new columns after feature engineering
print("\nNew columns in DataFrame after feature engineering:", train_test_data.columns)

# Save the updated datasets with engineered features
save_dataset(train_test_data, 'O_Polynomial_Features_Train_Test_Data.csv')
save_dataset(unseen_data, 'O_Polynomial_Features_Unseen_Data.csv')

# Check the shape of the processed datasets
print("Shape of train_test_data after feature engineering:", train_test_data.shape)
print("Shape of unseen_data after feature engineering:", unseen_data.shape)


Original columns in DataFrame: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Mov_Avg_2',
       ...
       'Num_32_Interval_Last', 'Num_32_Cum_Count', 'Num_33_Interval_Last',
       'Num_33_Cum_Count', 'Num_34_Interval_Last', 'Num_34_Cum_Count',
       'Num_35_Interval_Last', 'Num_35_Cum_Count', 'Num_36_Interval_Last',
       'Num_36_Cum_Count'],
      dtype='object', length=166)

New columns in DataFrame after feature engineering: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Mov_Avg_2',
       ...
       'Draw1_Change Draw2_Change', 'Draw1_Change Draw3_Change',
       'Draw1_Change Draw4_Change', 'Draw2_Change^2',
       'Draw2_Change Draw3_Change', 'Draw2_Change Draw4_Change',
       'Draw3_Change^2', 'Draw3_Change Draw4_Change', 'Draw4_Change^2',
       'cluster'],
    



In [34]:
# Cell 6.1: Identifying, defining, and dropping "Sensitive" columns to avoid data leakage

import pandas as pd

# Define the directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Load datasets function
def load_dataset(filename):
    """Read the CSV called `filename` located under the module-level `base_dir`."""
    return pd.read_csv(f'{base_dir}{filename}')

# Save dataset function
def save_dataset(df, filename):
    """Write `df` as CSV (without the index) to `filename` under the module-level `base_dir`."""
    df.to_csv(f'{base_dir}{filename}', index=False)

# Load your datasets
train_test_data = load_dataset('O_Polynomial_Features_Train_Test_Data.csv')
unseen_data = load_dataset('O_Polynomial_Features_Unseen_Data.csv')

# Function to drop sensitive columns
def drop_sensitive_columns(df, columns_to_drop):
    """Return a new frame without `columns_to_drop`; `df` is left untouched and a missing column raises KeyError."""
    trimmed = df.drop(columns=columns_to_drop)
    return trimmed

# Function to keep only the specified columns in the dataset
def keep_only(df, columns_to_keep):
    """Return the selection df[columns_to_keep], with columns in the order given."""
    selected = df[columns_to_keep]
    return selected

# List of sensitive columns to drop (replace with actual column names as needed)
sensitive_columns_to_drop = ['Draw1', 'Draw2', 'Draw3', 'Draw4', 'DR2_Prev_Entry', 'DR3_Prev_Entry-2']

# List of columns to keep (adjust as necessary)
columns_to_keep = [
    'Date', 'Row Number', 'Data_Type', 'DR1_Prev_Week', 'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2',
    'DR1_Mov_Avg', 'DR1_Mov_Avg_2', 'Draw1_Moving_Avg_3', 'Draw1_Median_3', 'Draw1_Std_Dev_3', 'Draw1_RMS_3',
    'Draw1_Rolling_Min_3', 'Draw1_Rolling_Max_3', 'Draw1_Skew_3', 'Draw1_Kurtosis_3', 'Draw1_EMA_3', 'Draw1_Rolling_Var_3',
    'Draw1_Rolling_Range_3', 'Draw1_Moving_Avg_5', 'Draw1_Median_5', 'Draw1_Std_Dev_5', 'Draw1_RMS_5', 'Draw1_Rolling_Min_5',
    'Draw1_Rolling_Max_5', 'Draw1_Skew_5', 'Draw1_Kurtosis_5', 'Draw1_EMA_5', 'Draw1_Rolling_Var_5', 'Draw1_Rolling_Range_5',
    'Draw1_Moving_Avg_10', 'Draw1_Median_10', 'Draw1_Std_Dev_10', 'Draw1_RMS_10', 'Draw1_Rolling_Min_10', 'Draw1_Rolling_Max_10',
    'Draw1_Skew_10', 'Draw1_Kurtosis_10', 'Draw1_EMA_10', 'Draw1_Rolling_Var_10', 'Draw1_Rolling_Range_10', 'Year', 'Month', 'Day',
    'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prev_Night', 'Line_Prev_Entry', 'Line_PE_Num_1', 'Line_PE_Num_2',
    'Line_PE_Num_3', 'Line_PE_Num_4', 'Spirit_PE_Num', 'Rake_PE_Num_1', 'Rake_PE_Num_2', 'Rake_PE_Num_3', 'Rake_PE_Num_4',
    'DayofWeek', 'Draw1_Prev', 'Draw1_Change', 'Draw2_Prev', 'Draw2_Change', 'Draw3_Prev', 'Draw3_Change', 'Draw4_Prev',
    'Draw4_Change', 'DayOfYear', 'Num_1_Interval_Last', 'Num_1_Cum_Count', 'Num_2_Interval_Last', 'Num_2_Cum_Count',
    'Num_3_Interval_Last', 'Num_3_Cum_Count', 'Num_4_Interval_Last', 'Num_4_Cum_Count', 'Num_5_Interval_Last',
    'Num_5_Cum_Count', 'Num_6_Interval_Last', 'Num_6_Cum_Count', 'Num_7_Interval_Last', 'Num_7_Cum_Count',
    'Num_8_Interval_Last', 'Num_8_Cum_Count', 'Num_9_Interval_Last', 'Num_9_Cum_Count', 'Num_10_Interval_Last',
    'Num_10_Cum_Count', 'Num_11_Interval_Last', 'Num_11_Cum_Count', 'Num_12_Interval_Last', 'Num_12_Cum_Count',
    'Num_13_Interval_Last', 'Num_13_Cum_Count', 'Num_14_Interval_Last', 'Num_14_Cum_Count', 'Num_15_Interval_Last',
    'Num_15_Cum_Count', 'Num_16_Interval_Last', 'Num_16_Cum_Count', 'Num_17_Interval_Last', 'Num_17_Cum_Count',
    'Num_18_Interval_Last', 'Num_18_Cum_Count', 'Num_19_Interval_Last', 'Num_19_Cum_Count', 'Num_20_Interval_Last',
    'Num_20_Cum_Count', 'Num_21_Interval_Last', 'Num_21_Cum_Count', 'Num_22_Interval_Last', 'Num_22_Cum_Count',
    'Num_23_Interval_Last', 'Num_23_Cum_Count', 'Num_24_Interval_Last', 'Num_24_Cum_Count', 'Num_25_Interval_Last',
    'Num_25_Cum_Count', 'Num_26_Interval_Last', 'Num_26_Cum_Count', 'Num_27_Interval_Last', 'Num_27_Cum_Count',
    'Num_28_Interval_Last', 'Num_28_Cum_Count', 'Num_29_Interval_Last', 'Num_29_Cum_Count', 'Num_30_Interval_Last',
    'Num_30_Cum_Count', 'Num_31_Interval_Last', 'Num_31_Cum_Count', 'Num_32_Interval_Last', 'Num_32_Cum_Count',
    'Num_33_Interval_Last', 'Num_33_Cum_Count', 'Num_34_Interval_Last', 'Num_34_Cum_Count', 'Num_35_Interval_Last',
    'Num_35_Cum_Count', 'Num_36_Interval_Last', 'Num_36_Cum_Count', 'Is_Special_Day', 'Is_Holiday', 'Is_Sunday'
]

# Drop sensitive columns for Training/Testing and Unseen Data
train_test_data = drop_sensitive_columns(train_test_data, sensitive_columns_to_drop)
unseen_data = drop_sensitive_columns(unseen_data, sensitive_columns_to_drop)

# Apply 'Keep Only' for Training/Testing and Unseen Data
train_test_data = keep_only(train_test_data, columns_to_keep)
unseen_data = keep_only(unseen_data, columns_to_keep)

# Save the datasets
save_dataset(train_test_data, 'P_Keep_Only_Train_Test_Data.csv')
save_dataset(unseen_data, 'P_Keep_Only_Unseen_Data.csv') # No need for to_frame conversion

# Check the shape of the processed datasets
print("Shape of train_test_data:", train_test_data.shape)
print("Shape of unseen_data:", unseen_data.shape)


Shape of train_test_data: (1409, 144)
Shape of unseen_data: (105, 144)


In [None]:
# CELL 6.2: Splitting Data into Training, Validation, and Test Sets - Chronological Split

import pandas as pd

# Define the directory for file paths
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Load datasets function
def load_dataset(filename):
    """Read the CSV called `filename` located under the module-level `base_dir`."""
    return pd.read_csv(f'{base_dir}{filename}')

# Save dataset function
def save_dataset(df, filename, directory=None):
    """Write a DataFrame to a CSV file without its index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    filename : str
        Name of the CSV file, relative to the folder being written to.
    directory : str, optional
        Folder to write into. When omitted, the module-level ``base_dir`` is used,
        which is the behaviour existing callers rely on.

    Returns
    -------
    None
    """
    import os  # local import so this cell does not depend on an earlier cell having imported os

    folder = base_dir if directory is None else directory
    # os.path.join yields the same path whether or not the folder ends with a slash
    df.to_csv(os.path.join(folder, filename), index=False)

# Load your datasets
train_test_data = load_dataset('P_Keep_Only_Train_Test_Data.csv')
unseen_data = load_dataset('P_Keep_Only_Unseen_Data.csv')

# Sort by 'Date' so the positional slices below run from the earliest rows to the latest.
# NOTE(review): 'Date' is read back from CSV without date parsing in this cell; if it is stored as
# non-ISO text (e.g. day/month/year) this sort is lexical rather than chronological - confirm the format.
train_test_data = train_test_data.sort_values(by=['Date'])

# Separate the training/testing dataset into features and target
X = train_test_data.drop(['Prediction1'], axis=1)
y = train_test_data['Prediction1']

# Calculate split sizes: 70% train, 15% validation, the remainder is the test set
train_size = int(len(X) * 0.7)
val_size = int(len(X) * 0.15)
test_size = len(X) - train_size - val_size
val_end = train_size + val_size  # row position where the test slice starts

# Split the dataset chronologically (positional slices of the sorted frame)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:val_end], y.iloc[train_size:val_end]
# The test target is sliced from y exactly like the other splits. It used to be a zero-filled
# placeholder Series, which discarded the real labels and wrote only zeros to M_Test_Target.csv.
X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]

# Sanity checks: the three splits partition the data and the test target lines up with its features
assert len(X_train) + len(X_val) + len(X_test) == len(X), "splits must cover every row exactly once"
assert len(y_test) == test_size == len(X_test), "test features and target must have the same length"

# The unseen data is not split; it is only separated into features and target
unseen_features = unseen_data.drop(['Prediction1'], axis=1)
unseen_target = unseen_data['Prediction1']

# Create actual results datasets for evaluation purposes (same row ranges as the splits above)
actual_results_train = train_test_data.iloc[:train_size][['Date', 'Draw1']].copy()
actual_results_val = train_test_data.iloc[train_size:val_end][['Date', 'Draw1']].copy()
actual_results_test = train_test_data.iloc[val_end:][['Date', 'Draw1']].copy()
actual_results_unseen = unseen_data[['Date', 'Draw1']].copy()

# Save the datasets (targets are Series, so they are converted to one-column frames first)
save_dataset(X_train, 'K_Train_Features.csv')
save_dataset(y_train.to_frame('Prediction1'), 'K_Train_Target.csv')
save_dataset(X_val, 'L_Val_Features.csv')
save_dataset(y_val.to_frame('Prediction1'), 'L_Val_Target.csv')
save_dataset(X_test, 'M_Test_Features.csv')
save_dataset(y_test.to_frame('Prediction1'), 'M_Test_Target.csv')
save_dataset(unseen_features, 'N_Unseen_Features.csv')
save_dataset(unseen_target.to_frame('Prediction1'), 'N_Unseen_Target.csv')

# Save the actual results datasets
save_dataset(actual_results_train, 'Actual_Results_Train.csv')
save_dataset(actual_results_val, 'Actual_Results_Val.csv')
save_dataset(actual_results_test, 'Actual_Results_Test.csv')
save_dataset(actual_results_unseen, 'Actual_Results_Unseen.csv')

# Print the shapes of the datasets
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of y_val:", y_val.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)
print("Shape of unseen_features:", unseen_features.shape)
print("Shape of unseen_target:", unseen_target.shape)


In [14]:
# Cell 5.1: Final checks and verifications.

# Every print below emits a heading line followed by a per-column Series on the next line.

# Missing-value count per column in the train/test data
print("NaN check for train/test data:", train_test_data.isna().sum(), sep="\n")

# Column dtypes of the train/test data
print("\nData types in train/test data:", train_test_data.dtypes, sep="\n")

# Missing-value count per column in the unseen data
print("\nNaN check for unseen data:", unseen_data.isna().sum(), sep="\n")

# Column dtypes of the unseen data
print("\nData types in unseen data:", unseen_data.dtypes, sep="\n")


NaN check for train/test data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_Entry-2    0
DR1_Mov_Avg         0
DR1_Vert_Avg        0
Draw2               0
DR2_Prev_Week       0
DR2_2Weeks          0
DR2_Prev_Entry      0
DR2_Prev_Entry-2    0
DR2_Mov_Avg         0
DR2_Vert_Avg        0
Draw3               0
DR3_Prev_Week       0
DR3_2Weeks          0
DR3_Prev_Entry      0
DR3_Prev_Entry-2    0
DR3_Mov_Avg         0
DR3_Vert_Avg        0
Draw4               0
DR4_Prev_Week       0
DR4_2Weeks          0
DR4_Prev_Entry      0
DR4_Prev_Entry-2    0
DR4_Mov_Avg         0
DR4_Vert_Avg        0
Year                0
Month               0
Day                 0
Prev_Morning        0
Prev_Afternoon      0
Prev_Evening        0
Prev_Night          0
Prediction1         0
Line_Prev_Entry     0
Line_PE_Num_1       0
Line_PE_Num_2       0
Line_PE_Num_3       0
Line_PE_Num_4       0
S

In [15]:
# Write both final frames twice, in this order:
#   1-2. the copies kept in the Initial_Data_Prep folder
#   3-4. the copies used AS INITIAL DATASETS FOR DRAW 1 PREDICTIVE SCRIPT
_final_exports = (
    (train_test_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/S_Final_Train_Test_Data.csv'),
    (unseen_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/T_Final_Unseen_Data.csv'),
    (train_test_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Draw1_Predictive_Model/A_Initial_Train_Test_Data.csv'),
    (unseen_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Draw1_Predictive_Model/B_Initial_Unseen_Data.csv'),
)

# The index is left out of every file
for _export_frame, _export_path in _final_exports:
    _export_frame.to_csv(_export_path, index=False)


# :::::**THE END** *Thank You*:::::

### ***Last Revision***
***04/02/24***
11:24pm

# ***Revision***
27/3/24
*Updated Unseen Results up to 30/11/23*