<a href="https://colab.research.google.com/github/alvinfranklyndavis/Draw1_Predictive_Model/blob/main/Initial_Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1.1: Package Installation and Library Import

# Check for existing libraries
!pip show pandas numpy

# Install or upgrade required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas==<desired_version> numpy==<desired_version>

# Import required libraries
import pandas as pd
import numpy as np
import logging
import os

# Send log records of level INFO and above to a file.
# force=True replaces any handlers already attached to the root logger; without it
# basicConfig() silently does nothing when the root logger is already configured
# (for example by the notebook runtime, or by an earlier run of this cell).
log_file = 'project.log'
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# Set up virtual environment (optional but recommended)
# You can create a virtual environment with: !python -m venv myenv
# And activate it with: source myenv/bin/activate (Linux/macOS) or myenv\Scripts\activate (Windows)


Name: pandas
Version: 1.5.3
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: The Pandas Development Team
Author-email: pandas-dev@python.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, python-dateutil, pytz
Required-by: altair, arviz, bigframes, bokeh, bqplot, cmdstanpy, cufflinks, datascience, db-dtypes, dopamine-rl, fastai, geemap, geopandas, google-colab, gspread-dataframe, holoviews, ibis-framework, mizani, mlxtend, pandas-datareader, pandas-gbq, panel, plotnine, prophet, pymc, seaborn, sklearn-pandas, statsmodels, vega-datasets, xarray, yfinance
---
Name: numpy
Version: 1.25.2
Summary: Fundamental package for array computing in Python
Home-page: https://www.numpy.org
Author: Travis E. Oliphant et al.
Author-email: 
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: albumentations, altair, arviz, astropy, autograd, bl

In [2]:
# Cell 1.2: Data Loading from Google Drive Training / Testing  and Unseen datasets

import pandas as pd
import logging
import os
from google.colab import drive

# Module-level logger for this notebook.
logger = logging.getLogger(__name__)

# Make the Google Drive contents available under /content/drive.
drive.mount('/content/drive')

# Folder on Drive that holds the input CSVs and every intermediate CSV written below.
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Raw input files and their full paths on Drive.
csv_filename_train_test, csv_filename_unseen = 'A_Initial_Train_Test_Data.csv', 'B_Initial_Unseen_Data.csv'
drive_csv_path_train_test, drive_csv_path_unseen = (
    os.path.join(drive_dataset_directory, csv_name)
    for csv_name in (csv_filename_train_test, csv_filename_unseen)
)

# Check and load the datasets
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    A status message is printed either way. Returns the loaded DataFrame, or
    ``None`` when ``file_path`` does not point to an existing file.
    """
    if not os.path.isfile(file_path):
        print("File not found. Check the file path or the Google Drive mount.")
        return None

    print("File found. Proceeding to load the dataset.")
    return pd.read_csv(file_path)

# Read both raw datasets from Drive.
train_test_data = load_dataset(drive_csv_path_train_test)
unseen_data = load_dataset(drive_csv_path_unseen)

# Preview the top rows of each dataset (training/testing first, then unseen).
for _heading, _frame in (
    ("First few rows of training/testing data:", train_test_data),
    ("\nFirst few rows of unseen data:", unseen_data),
):
    print(_heading)
    print(_frame.head())


Mounted at /content/drive
File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of training/testing data:
       Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  01-08-18           1  Training   19.0            7.0        27.0   
1  02-08-18           2  Training   31.0           11.0         1.0   
2  03-08-18           3  Training   15.0           19.0        21.0   
3  04-08-18           4  Training   31.0           35.0        18.0   
4  05-08-18           5       NaN    NaN            NaN         NaN   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              23.0         29.0          26.5  ...   
4             NaN               NaN          NaN           NaN 

In [3]:
# Cell 1.3: Surveillance Check for NaNs within both datasets

# Per-column count of missing values: training/testing data first, then unseen data.
for _heading, _frame in (
    ("NaN check for training/testing data:", train_test_data),
    ("\nNaN check for unseen data:", unseen_data),
):
    print(_heading)
    print(_frame.isna().sum())


NaN check for training/testing data:
Date                  0
Row Number            0
Data_Type           239
Draw1               239
DR1_Prev_Week       239
DR1_2Weeks          239
DR1_Prev_Entry      239
DR1_Prev_Entry-2    239
DR1_Mov_Avg         239
DR1_Vert_Avg        239
Draw2               239
DR2_Prev_Week       239
DR2_2Weeks          239
DR2_Prev_Entry      239
DR2_Prev_Entry-2    239
DR2_Mov_Avg         239
DR2_Vert_Avg        239
Draw3               239
DR3_Prev_Week       239
DR3_2Weeks          239
DR3_Prev_Entry      239
DR3_Prev_Entry-2    239
DR3_Mov_Avg         239
DR3_Vert_Avg        239
Draw4               239
DR4_Prev_Week       239
DR4_2Weeks          239
DR4_Prev_Entry      239
DR4_Prev_Entry-2    239
DR4_Mov_Avg         239
DR4_Vert_Avg        239
dtype: int64

NaN check for unseen data:
Date                 0
Row Number           0
Data_Type           17
Draw1               17
DR1_Prev_Week       17
DR1_2Weeks          17
DR1_Prev_Entry      17
DR1_Prev_Entry-2 

In [4]:
# Cell 2.1: NaN handling and new CSV saving for Training / Testing  and Unseen datasets


# Zero-fill every missing value in both datasets.
# NOTE(review): fillna(0) also puts 0 into the text column 'Data_Type' (the saved
# outputs further down show Data_Type == 0 on such rows) - confirm that is intended.
train_test_data = train_test_data.fillna(0)
unseen_data = unseen_data.fillna(0)

# File names and Drive paths of the NaN-handled copies.
new_csv_filename_train_test = 'C_NaN_Handled_Train_Test_Data.csv'
new_csv_filename_unseen = 'D_NaN_Handled_Unseen_Data.csv'
new_csv_path_train_test, new_csv_path_unseen = (
    os.path.join(drive_dataset_directory, csv_name)
    for csv_name in (new_csv_filename_train_test, new_csv_filename_unseen)
)

# Write both NaN-handled datasets to Drive (training/testing first).
for _frame, _csv_path in (
    (train_test_data, new_csv_path_train_test),
    (unseen_data, new_csv_path_unseen),
):
    _frame.to_csv(_csv_path, index=False)

print("Preprocessing and saving of datasets is complete.")

# Confirm that no missing values remain in either dataset.
for _heading, _frame in (
    ("\nNaN check for preprocessed training/testing data:", train_test_data),
    ("\nNaN check for preprocessed unseen data:", unseen_data),
):
    print(_heading)
    print(_frame.isna().sum())


Preprocessing and saving of datasets is complete.

NaN check for preprocessed training/testing data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_Entry-2    0
DR1_Mov_Avg         0
DR1_Vert_Avg        0
Draw2               0
DR2_Prev_Week       0
DR2_2Weeks          0
DR2_Prev_Entry      0
DR2_Prev_Entry-2    0
DR2_Mov_Avg         0
DR2_Vert_Avg        0
Draw3               0
DR3_Prev_Week       0
DR3_2Weeks          0
DR3_Prev_Entry      0
DR3_Prev_Entry-2    0
DR3_Mov_Avg         0
DR3_Vert_Avg        0
Draw4               0
DR4_Prev_Week       0
DR4_2Weeks          0
DR4_Prev_Entry      0
DR4_Prev_Entry-2    0
DR4_Mov_Avg         0
DR4_Vert_Avg        0
dtype: int64

NaN check for preprocessed unseen data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_

In [5]:
# Cell 2.2: Extract Y/M/D from Date and new CSV saving for Training / Testing  and Unseen datasets

# Re-load the NaN-handled training/testing data from the CSV written in Cell 2.1.
# NOTE(review): load_dataset is re-defined further down in this same cell, after these
# calls, so on a fresh kernel these two calls run the definition from Cell 1.2.
nan_handled_train_test_data = load_dataset(new_csv_path_train_test)

# Re-load the NaN-handled unseen data from the CSV written in Cell 2.1.
nan_handled_unseen_data = load_dataset(new_csv_path_unseen)

# Check and load the datasets
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    A status message is printed either way. Returns the loaded DataFrame, or
    ``None`` when ``file_path`` does not point to an existing file.
    """
    if not os.path.isfile(file_path):
        print("File not found. Check the file path.")
        return None

    print("File found. Proceeding to load the dataset.")
    return pd.read_csv(file_path)

# Function to extract 'Year', 'Month', and 'Day' from the 'Date' column
def extract_date_features(data):
    """Parse the 'Date' column in place and add integer 'Year', 'Month' and 'Day' columns.

    The known formats are tried in order ('%d-%m-%y', then '%d/%m/%Y'); the first one
    that parses the whole column is used. ``data`` is modified in place and nothing is
    returned. If there is no 'Date' column, or no format matches, ``data`` is left
    unchanged and a message is printed.
    """
    if 'Date' not in data.columns:
        print("'Date' column not found in the dataset.")
        return

    print("Converting 'Date' to datetime and extracting Year, Month, and Day...")
    date_formats = ['%d-%m-%y', '%d/%m/%Y']
    for date_format in date_formats:
        try:
            # Only the parsing step is guarded, so a ValueError raised anywhere else
            # is not misreported as a date-format mismatch.
            parsed_dates = pd.to_datetime(data['Date'], format=date_format)
        except ValueError:
            print(f"Failed to convert 'Date' with format: {date_format}")
            continue

        data['Date'] = parsed_dates
        # Missing dates give NaN components; they are stored as 0.
        data['Year'] = parsed_dates.dt.year.fillna(0).astype(int)
        data['Month'] = parsed_dates.dt.month.fillna(0).astype(int)
        data['Day'] = parsed_dates.dt.day.fillna(0).astype(int)
        print("After extracting Year, Month, and Day:", data.columns)
        break  # First matching format wins.
    else:
        # No format matched: report it explicitly instead of finishing silently.
        print("Could not parse 'Date' with any of the known formats; no date features were added.")

# Add Year/Month/Day to both NaN-handled datasets (each frame is modified in place).
for _frame in (nan_handled_train_test_data, nan_handled_unseen_data):
    extract_date_features(_frame)

# File names and Drive paths of the date-extracted copies.
new_csv_filename_train_test_date = 'E_Date_Extracted_Train_Test_Data.csv'
new_csv_filename_unseen_date = 'F_Date_Extracted_Unseen_Data.csv'
new_csv_path_train_test_date, new_csv_path_unseen_date = (
    os.path.join(drive_dataset_directory, csv_name)
    for csv_name in (new_csv_filename_train_test_date, new_csv_filename_unseen_date)
)

# Write both datasets, now carrying the date features, to Drive.
for _frame, _csv_path in (
    (nan_handled_train_test_data, new_csv_path_train_test_date),
    (nan_handled_unseen_data, new_csv_path_unseen_date),
):
    _frame.to_csv(_csv_path, index=False)

print("Date extraction and saving of datasets is complete.")

# Re-check both datasets for missing values after the date extraction.
for _heading, _frame in (
    ("\nNaN check for training/testing data with extracted date features:", nan_handled_train_test_data),
    ("\nNaN check for unseen data with extracted date features:", nan_handled_unseen_data),
):
    print(_heading)
    print(_frame.isna().sum())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
Converting 'Date' to datetime and extracting Year, Month, and Day...
After extracting Year, Month, and Day: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Vert_Avg', 'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks',
       'DR2_Prev_Entry', 'DR2_Prev_Entry-2', 'DR2_Mov_Avg', 'DR2_Vert_Avg',
       'Draw3', 'DR3_Prev_Week', 'DR3_2Weeks', 'DR3_Prev_Entry',
       'DR3_Prev_Entry-2', 'DR3_Mov_Avg', 'DR3_Vert_Avg', 'Draw4',
       'DR4_Prev_Week', 'DR4_2Weeks', 'DR4_Prev_Entry', 'DR4_Prev_Entry-2',
       'DR4_Mov_Avg', 'DR4_Vert_Avg', 'Year', 'Month', 'Day'],
      dtype='object')
Converting 'Date' to datetime and extracting Year, Month, and Day...
Failed to convert 'Date' with format: %d-%m-%y
After extracting Year, Month, and Day: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       

In [6]:
# Cell 2.3: Create shifted columns for previous day's data

# Function to create shifted columns for previous day's data
def create_shifted_columns(data):
    """Add previous-row copies of the four draw columns to ``data`` in place.

    'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening' and 'Prev_Night' receive the
    value that 'Draw1', 'Draw2', 'Draw3' and 'Draw4' had one row earlier. The first
    row has no predecessor and gets 0; all four new columns are cast to int.
    Nothing is returned.
    """
    source_of = {
        'Prev_Morning': 'Draw1',
        'Prev_Afternoon': 'Draw2',
        'Prev_Evening': 'Draw3',
        'Prev_Night': 'Draw4',
    }
    # First pass: move every draw column down by one row.
    for prev_name, draw_name in source_of.items():
        data[prev_name] = data[draw_name].shift(1)

    # Second pass: the shift leaves a gap in the first row; fill it and cast to int.
    prev_names = list(source_of)
    data[prev_names] = data[prev_names].fillna(0).astype(int)

# Stage 2.3 driver: read the date-extracted CSVs (E_/F_), add the previous-row draw
# columns, and write the results as G_/H_.
# The literal paths below spell out the same Drive folder that Cell 1.2 stores in
# drive_dataset_directory.

# Load the date-extracted training/testing data written by Cell 2.2.
# load_dataset returns None when the file is missing, in which case the next call fails.
date_extracted_train_test_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/E_Date_Extracted_Train_Test_Data.csv')

# Add Prev_Morning/Prev_Afternoon/Prev_Evening/Prev_Night (modifies the frame in place).
create_shifted_columns(date_extracted_train_test_data)

# Save the training/testing data with the shifted columns as G_.
date_extracted_train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/G_Shifted_Train_Test_Data.csv', index=False)

# Load the date-extracted unseen data written by Cell 2.2.
date_extracted_unseen_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/F_Date_Extracted_Unseen_Data.csv')

# Add the same four previous-row columns to the unseen data (in place).
create_shifted_columns(date_extracted_unseen_data)

# Save the unseen data with the shifted columns as H_.
date_extracted_unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/H_Shifted_Unseen_Data.csv', index=False)

# Preview the top rows of both datasets; at this point they already include the shifted columns.
print("First few rows of date extracted training/testing data:")
print(date_extracted_train_test_data.head())

print("\nFirst few rows of date extracted unseen data:")
print(date_extracted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of date extracted training/testing data:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training   19.0            7.0        27.0   
1  2018-08-02           2  Training   31.0           11.0         1.0   
2  2018-08-03           3  Training   15.0           19.0        21.0   
3  2018-08-04           4  Training   31.0           35.0        18.0   
4  2018-08-05           5         0    0.0            0.0         0.0   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              23.0         29.0          26.5  ...   
4             0.0               0.0          0.0           0.0

In [7]:
# Cell 2.4: Handle NaN values for previous day's data

# Stage 2.4 driver: read the shifted CSVs (G_/H_), overwrite the first row's Prev_*
# values with fixed numbers, and write the results as I_/J_.

# Load the shifted training/testing data written by Cell 2.3.
shifted_train_test_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/G_Shifted_Train_Test_Data.csv')

# The first row has no previous row, so create_shifted_columns left 0 in its Prev_* cells.
# Replace those with hard-coded values (index label 0).
# NOTE(review): presumably these are the four draws of the day before the first row -
# the source of the numbers is not visible here; confirm.
shifted_train_test_data.at[0, 'Prev_Morning'] = 13
shifted_train_test_data.at[0, 'Prev_Afternoon'] = 34
shifted_train_test_data.at[0, 'Prev_Evening'] = 32
shifted_train_test_data.at[0, 'Prev_Night'] = 23

# Save the updated training/testing data as I_.
shifted_train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/I_Handled_Shifted_Train_Test_Data.csv', index=False)

# Load the shifted unseen data written by Cell 2.3.
shifted_unseen_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/H_Shifted_Unseen_Data.csv')

# Same first-row replacement for the unseen data, with its own hard-coded values.
# NOTE(review): source of these numbers is likewise not visible here; confirm.
shifted_unseen_data.at[0, 'Prev_Morning'] = 25
shifted_unseen_data.at[0, 'Prev_Afternoon'] = 9
shifted_unseen_data.at[0, 'Prev_Evening'] = 7
shifted_unseen_data.at[0, 'Prev_Night'] = 5

# Save the updated unseen data as J_.
shifted_unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/J_Handled_Shifted_Unseen_Data.csv', index=False)

# Preview the top rows of both datasets.
print("First few rows of handled shifted training/testing data:")
print(shifted_train_test_data.head())

print("\nFirst few rows of handled shifted unseen data:")
print(shifted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of handled shifted training/testing data:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training   19.0            7.0        27.0   
1  2018-08-02           2  Training   31.0           11.0         1.0   
2  2018-08-03           3  Training   15.0           19.0        21.0   
3  2018-08-04           4  Training   31.0           35.0        18.0   
4  2018-08-05           5         0    0.0            0.0         0.0   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              23.0         29.0          26.5  ...   
4             0.0               0.0          0.0           0.

In [8]:
# Cell 3.1: Initialize the target variable column 'Prediction1'

# Stage 3.1 driver: read the I_/J_ CSVs, add the 'Prediction1' column, and write the
# results as K_/L_.

# Load the handled shifted training/testing data written by Cell 2.4.
handled_shifted_train_test_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/I_Handled_Shifted_Train_Test_Data.csv')

# For training/testing data 'Prediction1' is an exact copy of 'Draw1'.
# NOTE(review): 'Draw1' itself stays in the frame next to 'Prediction1'; make sure it is
# not used as an input feature downstream - confirm against the modelling code.
handled_shifted_train_test_data['Prediction1'] = handled_shifted_train_test_data['Draw1']

# Save the updated training/testing data as K_.
handled_shifted_train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/K_Handled_Prediction1_Train_Test_Data.csv', index=False)

# Load the handled shifted unseen data written by Cell 2.4.
handled_shifted_unseen_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/J_Handled_Shifted_Unseen_Data.csv')

# For the unseen data 'Prediction1' is created as a constant 0 in every row.
handled_shifted_unseen_data['Prediction1'] = 0

# Save the updated unseen data as L_.
handled_shifted_unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/L_Handled_Prediction1_Unseen_Data.csv', index=False)

# Preview the top rows of both datasets.
print("First few rows of handled Prediction1 training/testing data:")
print(handled_shifted_train_test_data.head())

print("\nFirst few rows of handled Prediction1 unseen data:")
print(handled_shifted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of handled Prediction1 training/testing data:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training   19.0            7.0        27.0   
1  2018-08-02           2  Training   31.0           11.0         1.0   
2  2018-08-03           3  Training   15.0           19.0        21.0   
3  2018-08-04           4  Training   31.0           35.0        18.0   
4  2018-08-05           5         0    0.0            0.0         0.0   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              23.0         29.0          26.5  ...   
4             0.0               0.0          0.0         

In [9]:
# Cell 3.2: Converting the columns to integer in both datasets (excluding 'Data_Type' and 'Date')

# Stage 3.2 driver: read the K_/L_ CSVs, cast every column except 'Data_Type' and 'Date'
# to int, and write the result back to the same K_/L_ files.
# Note that train_test_data / unseen_data are re-bound here to the K_/L_ contents.
train_test_data = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/K_Handled_Prediction1_Train_Test_Data.csv')
unseen_data = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/L_Handled_Prediction1_Unseen_Data.csv')

# Every column except the two non-numeric ones ('Data_Type' and 'Date') is converted.
columns_to_convert_train_test = [col for col in train_test_data.columns if col not in ['Data_Type', 'Date']]
columns_to_convert_unseen = [col for col in unseen_data.columns if col not in ['Data_Type', 'Date']]

# Cast the selected columns to int.
# NOTE(review): astype(int) drops the fractional part - the saved outputs show
# DR1_Mov_Avg 27.5 becoming 27. Confirm truncation (rather than rounding) is intended.
train_test_data[columns_to_convert_train_test] = train_test_data[columns_to_convert_train_test].astype(int)
unseen_data[columns_to_convert_unseen] = unseen_data[columns_to_convert_unseen].astype(int)

# Overwrite the K_/L_ files that were read above with the converted data.
train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/K_Handled_Prediction1_Train_Test_Data.csv', index=False)
unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/L_Handled_Prediction1_Unseen_Data.csv', index=False)

# Show the resulting column dtypes of both datasets.
print("Data types of columns in train/test data after conversion:")
print(train_test_data.dtypes)

print("\nData types of columns in unseen data after conversion:")
print(unseen_data.dtypes)


Data types of columns in train/test data after conversion:
Date                object
Row Number           int64
Data_Type           object
Draw1                int64
DR1_Prev_Week        int64
DR1_2Weeks           int64
DR1_Prev_Entry       int64
DR1_Prev_Entry-2     int64
DR1_Mov_Avg          int64
DR1_Vert_Avg         int64
Draw2                int64
DR2_Prev_Week        int64
DR2_2Weeks           int64
DR2_Prev_Entry       int64
DR2_Prev_Entry-2     int64
DR2_Mov_Avg          int64
DR2_Vert_Avg         int64
Draw3                int64
DR3_Prev_Week        int64
DR3_2Weeks           int64
DR3_Prev_Entry       int64
DR3_Prev_Entry-2     int64
DR3_Mov_Avg          int64
DR3_Vert_Avg         int64
Draw4                int64
DR4_Prev_Week        int64
DR4_2Weeks           int64
DR4_Prev_Entry       int64
DR4_Prev_Entry-2     int64
DR4_Mov_Avg          int64
DR4_Vert_Avg         int64
Year                 int64
Month                int64
Day                  int64
Prev_Morning         in

In [10]:
# Cell 4.1: Introducing "Lines" as a new feature in both datasets

import pandas as pd

# Drive folder that holds the intermediate CSVs (the value ends with a path separator).
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Continue from the integer-converted K_/L_ files.
train_test_data = pd.read_csv(os.path.join(base_dir, 'K_Handled_Prediction1_Train_Test_Data.csv'))
unseen_data = pd.read_csv(os.path.join(base_dir, 'L_Handled_Prediction1_Unseen_Data.csv'))

# Line lookup: line k (1..9) holds the four numbers k, k + 9, k + 18 and k + 27.
lines_directory = {
    line: [line + 9 * step for step in range(4)]
    for line in range(1, 10)
}

# Function to calculate the sum of digits and map to a 'Line'
def sum_to_line(x):
    """Reduce ``x`` to a single digit by repeatedly summing its decimal digits.

    ``x`` is converted with ``str``; if any character of that text is not a digit
    (for example a sign, a decimal point, or letters) the result is 0.
    """
    text = str(x)
    try:
        total = sum(map(int, text))
    except ValueError:
        # The text contains something that is not a digit.
        return 0

    # Keep summing the digits until a single digit is left.
    while total > 9:
        total = sum(map(int, str(total)))
    return total

# Derive 'Line_Prev_Entry' from 'DR1_Prev_Entry' in both datasets (training/testing first).
for _dataset in (train_test_data, unseen_data):
    _dataset['Line_Prev_Entry'] = _dataset['DR1_Prev_Entry'].apply(sum_to_line)

# Function to extract numbers from the line subset for the active line
def extract_line_numbers(row, lines_dict):
    """Return a copy of ``row`` extended with the numbers of its active line.

    The active line is read from ``row['Line_Prev_Entry']``. Its numbers are stored
    under 'Line_PE_Num_1', 'Line_PE_Num_2', ... in the order given by ``lines_dict``.
    When the active line is 0 or is not a key of ``lines_dict``,
    'Line_PE_Num_1'..'Line_PE_Num_4' are set to 0 instead.

    Meant to be used with ``DataFrame.apply(..., axis=1)``. The incoming ``row`` is
    left untouched: pandas does not support functions that mutate the object that
    ``apply`` passes in, so the new entries are added to a copy.
    """
    result = row.copy()
    active_line = result['Line_Prev_Entry']

    if active_line == 0 or active_line not in lines_dict:
        # No draw on that day, or an unknown line: fill the four slots with zeros.
        line_numbers = [0] * 4
    else:
        line_numbers = lines_dict[active_line]

    for position, number in enumerate(line_numbers, start=1):
        result[f'Line_PE_Num_{position}'] = number
    return result

# Add the 'Line_PE_Num_*' columns row by row; lines_directory is passed through as the
# second argument of extract_line_numbers.
train_test_data = train_test_data.apply(extract_line_numbers, axis=1, args=(lines_directory,))
unseen_data = unseen_data.apply(extract_line_numbers, axis=1, args=(lines_directory,))

# Write both datasets with the "Lines" features to Drive as M_/N_.
for _frame, _csv_name in (
    (train_test_data, 'M_Lines_Train_Test_Data.csv'),
    (unseen_data, 'N_Lines_Unseen_Data.csv'),
):
    _frame.to_csv(base_dir + _csv_name, index=False)

# Preview the top rows of both datasets to verify the "Lines" assignment.
for _heading, _frame in (
    ("First few rows of train/test data with 'Lines' assigned:", train_test_data),
    ("\nFirst few rows of unseen data with 'Lines' assigned:", unseen_data),
):
    print(_heading)
    print(_frame.head())


First few rows of train/test data with 'Lines' assigned:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training     19              7          27   
1  2018-08-02           2  Training     31             11           1   
2  2018-08-03           3  Training     15             19          21   
3  2018-08-04           4  Training     31             35          18   
4  2018-08-05           5         0      0              0           0   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0              23                32           27            17  ...   
1               9                33           21             6  ...   
2              12                35           23            20  ...   
3              35                23           29            26  ...   
4               0                 0            0             0  ...   

   Prev_Morning  Prev_Afternoon  Prev_Evening  Prev_Night  Prediction1  \
0  

In [11]:
# Cell 4.2 (disabled - every line of this cell is commented out): Introducing "Special Groups" as a new feature in both datasets

# Load the most recent CSVs
#train_test_data = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/M_Lines_Train_Test_Data.csv')
#unseen_data = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/N_Lines_Unseen_Data.csv')

# Define the mapping for "Special Groups"
#special_groups_mapping = {
#    2: 1, 15: 1, 16: 1, 24: 1, 31: 1,  # "Ladies"
#    4: 2, 5: 2, 12: 2, 29: 2, 34: 2,  # "Men"
#    11: 3, 17: 3, 26: 3,  # "Birds"
#    7: 4, 9: 4, 19: 4, 20: 4, 22: 4, 30: 4, 36: 4,  # "Domestic Animals"
#    8: 5, 10: 5, 13: 5, 25: 5,  # "Wild Animals"
#    18: 6, 28: 6, 32: 6,  # "Ocean"
#    1: 7, 27: 7, 33: 7, 35: 7,  # "Snakes & Insects"
#    3: 8, 6: 8, 14: 8, 21: 8, 23: 8  # "Home"
#}

# Function to assign "Special Groups" based on the mapping
#def assign_special_groups(data, column_name, special_groups_mapping):
#    data[f'Special_Groups_{column_name}'] = data[column_name].map(special_groups_mapping).fillna(0).astype(int)

# List of columns to assign "Special Groups"
#columns_to_assign_special_groups = ['Draw1', 'DR1_Prev_Week', 'DR1_Prev_Entry']

# Assign "Special Groups" for specified columns in train/test data
#for column in columns_to_assign_special_groups:
#    assign_special_groups(train_test_data, column, special_groups_mapping)

# Assign "Special Groups" for specified columns in unseen data
#for column in columns_to_assign_special_groups:
#    assign_special_groups(unseen_data, column, special_groups_mapping)

# Save the updated datasets
#train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/O_Special_Groups_Train_Test_Data.csv', index=False)
#unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/P_Special_Groups_Unseen_Data.csv', index=False)

# Display the first few rows of both datasets to verify the "Special Groups" assignment
#print("First few rows of train/test data with 'Special Groups' assigned:")
#print(train_test_data.head())

#print("\nFirst few rows of unseen data with 'Special Groups' assigned:")
#print(unseen_data.head())


In [12]:
# Cell 4.3: Introducing "Spirits" as a new feature in both datasets

import pandas as pd

# Drive folder that holds the intermediate CSVs (the value ends with a path separator).
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Continue from the "Lines" outputs (M_/N_) written by Cell 4.1.
train_test_data = pd.read_csv(os.path.join(base_dir, 'M_Lines_Train_Test_Data.csv'))
unseen_data = pd.read_csv(os.path.join(base_dir, 'N_Lines_Unseen_Data.csv'))

# "Spirits" couple the numbers 1..36 two by two: each number maps to its partner and
# the partner maps straight back. Listing every couple once and mirroring it yields
# the full lookup, keyed in ascending order.
_spirit_pairs = [
    (1, 5), (2, 24), (3, 19), (4, 35), (6, 15), (7, 13),
    (8, 29), (9, 33), (10, 28), (11, 36), (12, 32), (14, 25),
    (16, 17), (18, 30), (20, 22), (21, 23), (26, 27), (31, 34),
]
spirits_mapping = dict(sorted(
    [(first, second) for first, second in _spirit_pairs]
    + [(second, first) for first, second in _spirit_pairs]
))

# Function to map "DR1_Prev_Entry" to its spirit pair
def map_to_spirit(x, spirits_dict):
    """Look up the spirit partner of ``x`` in ``spirits_dict``; ``None`` when ``x`` has no entry."""
    if x in spirits_dict:
        return spirits_dict[x]
    return None

# Look up the spirit partner of every 'DR1_Prev_Entry' value in both datasets.
for _dataset in (train_test_data, unseen_data):
    _dataset['Spirit_PE_Num'] = _dataset['DR1_Prev_Entry'].apply(map_to_spirit, args=(spirits_mapping,))

# Entries without a partner come back empty; store them as 0 and cast the column to int.
for _dataset in (train_test_data, unseen_data):
    _dataset['Spirit_PE_Num'] = _dataset['Spirit_PE_Num'].fillna(0).astype(int)

# Write both datasets with the "Spirits" feature to Drive as O_/P_.
for _frame, _csv_name in (
    (train_test_data, 'O_Spirits_Train_Test_Data.csv'),
    (unseen_data, 'P_Spirits_Unseen_Data.csv'),
):
    _frame.to_csv(base_dir + _csv_name, index=False)

# Preview the top rows of both datasets to verify the "Spirits" assignment.
for _heading, _frame in (
    ("First few rows of train/test data with 'Spirits' assigned:", train_test_data),
    ("\nFirst few rows of unseen data with 'Spirits' assigned:", unseen_data),
):
    print(_heading)
    print(_frame.head())


First few rows of train/test data with 'Spirits' assigned:
         Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  2018-08-01           1  Training     19              7          27   
1  2018-08-02           2  Training     31             11           1   
2  2018-08-03           3  Training     15             19          21   
3  2018-08-04           4  Training     31             35          18   
4  2018-08-05           5         0      0              0           0   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0              23                32           27            17  ...   
1               9                33           21             6  ...   
2              12                35           23            20  ...   
3              35                23           29            26  ...   
4               0                 0            0             0  ...   

   Prev_Afternoon  Prev_Evening  Prev_Night  Prediction1  Line_Prev_Entry  

In [13]:
# Cell 4.4: Introducing "Rakes" as a new feature in both datasets

import pandas as pd

# Folder on the mounted Drive that holds every intermediate CSV of this notebook
base_dir = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# Reload the outputs of the previous ("Spirits") step as this cell's inputs
spirits_train_test_path = base_dir + 'O_Spirits_Train_Test_Data.csv'
spirits_unseen_path = base_dir + 'P_Spirits_Unseen_Data.csv'
train_test_data = pd.read_csv(spirits_train_test_path)
unseen_data = pd.read_csv(spirits_unseen_path)

# Lookup table for the "Rakes" feature.
# Key:   a draw number (1-36); below it is looked up with each row's
#        'DR1_Prev_Entry' value.
# Value: the four "rake" numbers associated with that key, one per
#        'Rake_PE_Num_1' .. 'Rake_PE_Num_4' column, in list order.
# Values with no key here (e.g. the 0 placeholder rows) are handled by
# assign_rakes, which falls back to [0, 0, 0, 0].
# NOTE(review): keys 1 and 10 carry the identical list [7, 12, 15, 36] —
# confirm this is intended and not a copy/paste slip.
rakes_mapping = {
    1: [7, 12, 15, 36],
    2: [4, 14, 16, 23],
    3: [19, 22, 34, 35],
    4: [7, 11, 14, 32],
    5: [20, 23, 31, 33],
    6: [14, 24, 30, 32],
    7: [4, 11, 24, 29],
    8: [12, 14, 33, 36],
    9: [7, 11, 24, 32],
    10: [7, 12, 15, 36],
    11: [4, 7, 17, 22],
    12: [1, 10, 28, 35],
    13: [10, 25, 28, 29],
    14: [16, 23, 31, 33],
    15: [5, 6, 23, 36],
    16: [14, 31, 35, 36],
    17: [11, 16, 26, 29],
    18: [5, 7, 28, 33],
    19: [10, 27, 32, 36],
    20: [4, 14, 24, 30],
    21: [16, 22, 24, 29],
    22: [11, 30, 32, 34],
    23: [12, 14, 16, 20],
    24: [2, 6, 12, 21],
    25: [1, 3, 13, 18],
    26: [1, 2, 11, 17],
    27: [14, 16, 19, 35],
    28: [12, 13, 18, 33],
    29: [7, 13, 16, 28],
    30: [6, 8, 20, 22],
    31: [5, 14, 16, 36],
    32: [4, 6, 28, 36],
    33: [5, 10, 14, 20],
    34: [1, 12, 20, 22],
    35: [3, 12, 14, 16],
    36: [1, 8, 16, 32],
}

# Function to assign "Rakes" based on the 'DR1_Prev_Entry' value
def assign_rakes(data, column_name, rakes_dict):
    """Add the 'Rake_PE_Num_1' .. 'Rake_PE_Num_4' columns to ``data``.

    Each row's value in ``column_name`` is looked up in ``rakes_dict``; the
    resulting list fills the four rake columns in order. Rows whose value has
    no entry in ``rakes_dict`` get 0 in all four columns.

    Values are assigned by row position, one whole column at a time, so the
    result is correct even when ``data`` has a non-unique index (label-based
    per-cell writes would overwrite every row sharing a label).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is modified in place.
    column_name : str
        Column whose values are used as lookup keys.
    rakes_dict : dict
        Maps a key to a sequence of rake numbers. Only the first four entries
        are used; shorter sequences are padded with 0.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the four rake columns added or
        overwritten.
    """
    slot_count = 4
    no_rakes = [0] * slot_count

    # Resolve every row's rake list once, in row order
    rakes_per_row = [rakes_dict.get(key, no_rakes) for key in data[column_name].tolist()]

    for slot in range(slot_count):
        slot_values = [rakes[slot] if slot < len(rakes) else 0 for rakes in rakes_per_row]
        # A scalar 0 still creates the column when the frame has no rows
        data[f'Rake_PE_Num_{slot + 1}'] = slot_values if slot_values else 0

    return data

# Add the four Rake_PE_Num_* columns to each dataset, keyed on the previous entry
rakes_source_column = 'DR1_Prev_Entry'
train_test_data = assign_rakes(train_test_data, rakes_source_column, rakes_mapping)
unseen_data = assign_rakes(unseen_data, rakes_source_column, rakes_mapping)

# Persist both frames now that they carry the 'Rakes' features
rakes_train_test_csv = base_dir + 'Q_Rakes_Train_Test_Data.csv'
rakes_unseen_csv = base_dir + 'R_Rakes_Unseen_Data.csv'
train_test_data.to_csv(rakes_train_test_csv, index=False)
unseen_data.to_csv(rakes_unseen_csv, index=False)

# Preview the lookup key next to the four derived columns to verify the "Rakes" assignment
rakes_preview_columns = ['DR1_Prev_Entry', 'Rake_PE_Num_1', 'Rake_PE_Num_2', 'Rake_PE_Num_3', 'Rake_PE_Num_4']

print("First few rows of train/test data with 'Rakes' assigned:")
print(train_test_data[rakes_preview_columns].head())

print("\nFirst few rows of unseen data with 'Rakes' assigned:")
print(unseen_data[rakes_preview_columns].head())


First few rows of train/test data with 'Rakes' assigned:
   DR1_Prev_Entry  Rake_PE_Num_1  Rake_PE_Num_2  Rake_PE_Num_3  Rake_PE_Num_4
0              23             12             14             16             20
1               9              7             11             24             32
2              12              1             10             28             35
3              35              3             12             14             16
4               0              0              0              0              0

First few rows of unseen data with 'Rakes' assigned:
   DR1_Prev_Entry  Rake_PE_Num_1  Rake_PE_Num_2  Rake_PE_Num_3  Rake_PE_Num_4
0               5             20             23             31             33
1              18              5              7             28             33
2              28             12             13             18             33
3               2              4             14             16             23
4              12              

In [14]:
# Cell 5.1: Final checks and verifications.

# For each dataset, report the per-column NaN count followed by the column dtypes.
# Each entry: (heading for the NaN report, heading for the dtype report, frame).
final_checks = (
    ("NaN check for train/test data:", "\nData types in train/test data:", train_test_data),
    ("\nNaN check for unseen data:", "\nData types in unseen data:", unseen_data),
)

for nan_heading, dtype_heading, checked_frame in final_checks:
    # Number of missing values in every column
    print(nan_heading)
    print(checked_frame.isnull().sum())

    # Storage type of every column
    print(dtype_heading)
    print(checked_frame.dtypes)


NaN check for train/test data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_Entry-2    0
DR1_Mov_Avg         0
DR1_Vert_Avg        0
Draw2               0
DR2_Prev_Week       0
DR2_2Weeks          0
DR2_Prev_Entry      0
DR2_Prev_Entry-2    0
DR2_Mov_Avg         0
DR2_Vert_Avg        0
Draw3               0
DR3_Prev_Week       0
DR3_2Weeks          0
DR3_Prev_Entry      0
DR3_Prev_Entry-2    0
DR3_Mov_Avg         0
DR3_Vert_Avg        0
Draw4               0
DR4_Prev_Week       0
DR4_2Weeks          0
DR4_Prev_Entry      0
DR4_Prev_Entry-2    0
DR4_Mov_Avg         0
DR4_Vert_Avg        0
Year                0
Month               0
Day                 0
Prev_Morning        0
Prev_Afternoon      0
Prev_Evening        0
Prev_Night          0
Prediction1         0
Line_Prev_Entry     0
Line_PE_Num_1       0
Line_PE_Num_2       0
Line_PE_Num_3       0
Line_PE_Num_4       0
S

In [15]:
# Final exports, written in this order:
#   1-2: archive copies kept alongside the other Initial_Data_Prep outputs
#   3-4: second copies saved AS INITIAL DATASETS FOR DRAW 1 PREDICTIVE SCRIPT
final_exports = [
    (train_test_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/S_Final_Train_Test_Data.csv'),
    (unseen_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/T_Final_Unseen_Data.csv'),
    (train_test_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Draw1_Predictive_Model/A_Initial_Train_Test_Data.csv'),
    (unseen_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Draw1_Predictive_Model/B_Initial_Unseen_Data.csv'),
]

for export_frame, export_path in final_exports:
    export_frame.to_csv(export_path, index=False)


# :::::**THE END** *Thank You*:::::

### ***Last Revision***
***04/02/24***
11:24pm

# ***Revision***
27/3/24
*Updated Unseen Results up to 30/11/23*