<a href="https://colab.research.google.com/github/alvinfranklyndavis/Project2023_v3/blob/main/Initial_Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1.1: Package Installation and Library Import

# Check for existing libraries
!pip show pandas numpy

# Install or upgrade required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas==<desired_version> numpy==<desired_version>

# Import required libraries
import pandas as pd
import numpy as np
import logging
import os

# Route log records (INFO and above) to a file in the working directory.
log_file = 'project.log'
# basicConfig() does nothing when the root logger already has handlers, which
# is commonly the case inside a notebook kernel (NOTE(review): confirm for this
# runtime). force=True (Python 3.8+) removes any existing root handlers first
# so the file handler below is really installed.
logging.basicConfig(filename=log_file, level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# Set up virtual environment (optional but recommended)
# You can create a virtual environment with: !python -m venv myenv
# And activate it with: source myenv/bin/activate (Linux/macOS) or myenv\Scripts\activate (Windows)


Name: pandas
Version: 1.5.3
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: The Pandas Development Team
Author-email: pandas-dev@python.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, python-dateutil, pytz
Required-by: altair, arviz, bigframes, bokeh, bqplot, cmdstanpy, cufflinks, datascience, db-dtypes, dopamine-rl, fastai, geemap, geopandas, google-colab, gspread-dataframe, holoviews, ibis-framework, lida, mizani, mlxtend, pandas-datareader, pandas-gbq, panel, pins, plotnine, prophet, pymc, seaborn, sklearn-pandas, statsmodels, vega-datasets, xarray, yfinance
---
Name: numpy
Version: 1.23.5
Summary: NumPy is the fundamental package for array computing with Python.
Home-page: https://www.numpy.org
Author: Travis E. Oliphant et al.
Author-email: 
License: BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: albumentations, altair, arviz, as

In [2]:
# Cell 1.2: Data Loading from Google Drive Training / Testing  and Unseen datasets

import pandas as pd
import logging
import os
from google.colab import drive

# Module-level logger for this notebook (same logger object as the one
# obtained in Cell 1.1, since both use __name__).
logger = logging.getLogger(__name__)

# Mount Google Drive at /content/drive; every dataset path below lives under it.
drive.mount('/content/drive')

# Directory holding all input and intermediate CSV files for this stage.
# The trailing slash is part of the value; later cells join file names onto it.
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/'

# File names of the two raw inputs: the training/testing set and the unseen set.
csv_filename_train_test = 'A_Initial_Train_Test_Data.csv'
csv_filename_unseen = 'B_Initial_Unseen_Data.csv'

# Full paths to the two raw inputs.
drive_csv_path_train_test = os.path.join(drive_dataset_directory, csv_filename_train_test)
drive_csv_path_unseen = os.path.join(drive_dataset_directory, csv_filename_unseen)

# Check and load the datasets
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    A status line is printed either way. When ``file_path`` does not point to
    an existing regular file, nothing is read and ``None`` is returned.
    """
    if not os.path.isfile(file_path):
        print("File not found. Check the file path or the Google Drive mount.")
        return None

    print("File found. Proceeding to load the dataset.")
    dataset = pd.read_csv(file_path)
    return dataset

# Load training/testing data
train_test_data = load_dataset(drive_csv_path_train_test)

# Load unseen data
unseen_data = load_dataset(drive_csv_path_unseen)

# load_dataset() reports a missing file by returning None. Stop here with an
# explicit error that names the path, rather than failing a few lines later
# with an AttributeError on `.head()`.
if train_test_data is None:
    raise FileNotFoundError(f"Training/testing dataset not found: {drive_csv_path_train_test}")
if unseen_data is None:
    raise FileNotFoundError(f"Unseen dataset not found: {drive_csv_path_unseen}")

# Print the first few rows of both datasets for inspection
print("First few rows of training/testing data:")
print(train_test_data.head())

print("\nFirst few rows of unseen data:")
print(unseen_data.head())


Mounted at /content/drive
File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of training/testing data:
       Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  01-08-18           1  Training   19.0            7.0        27.0   
1  02-08-18           2  Training   31.0           11.0         1.0   
2  03-08-18           3  Training   15.0           19.0        21.0   
3  04-08-18           4  Training   31.0           35.0        18.0   
4  05-08-18           5       NaN    NaN            NaN         NaN   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              23.0         29.0          26.5  ...   
4             NaN               NaN          NaN           NaN 

In [3]:
# Cell 1.3: Surveillance Check for NaNs within both datasets

# Per-column count of missing values in the training/testing set.
print("NaN check for training/testing data:", train_test_data.isna().sum(), sep="\n")

# Same report for the unseen set (leading blank line separates the two).
print("\nNaN check for unseen data:", unseen_data.isna().sum(), sep="\n")


NaN check for training/testing data:
Date                  0
Row Number            0
Data_Type           239
Draw1               239
DR1_Prev_Week       239
DR1_2Weeks          239
DR1_Prev_Entry      239
DR1_Prev_Entry-2    239
DR1_Mov_Avg         239
DR1_Vert_Avg        239
Draw2               239
DR2_Prev_Week       239
DR2_2Weeks          239
DR2_Prev_Entry      239
DR2_Prev_Entry-2    239
DR2_Mov_Avg         239
DR2_Vert_Avg        239
Draw3               239
DR3_Prev_Week       239
DR3_2Weeks          239
DR3_Prev_Entry      239
DR3_Prev_Entry-2    239
DR3_Mov_Avg         239
DR3_Vert_Avg        239
Draw4               239
DR4_Prev_Week       239
DR4_2Weeks          239
DR4_Prev_Entry      239
DR4_Prev_Entry-2    239
DR4_Mov_Avg         239
DR4_Vert_Avg        239
dtype: int64

NaN check for unseen data:
Date                0
Row Number          0
Data_Type           4
Draw1               4
DR1_Prev_Week       4
DR1_2Weeks          4
DR1_Prev_Entry      4
DR1_Prev_Entry-2    4
DR

In [4]:
# Cell 2.1: NaN handling and new CSV saving for Training / Testing  and Unseen datasets


# Replace every missing value in the training/testing data with 0.
# NOTE(review): fillna(0) is applied to the whole frame, so the text column
# 'Data_Type' also receives the integer 0 next to its string labels.
# NOTE(review): the NaN report in Cell 1.3 shows the same count (239) for every
# column except 'Date' and 'Row Number', which suggests these are blank rows;
# after this step they become rows of all-zero draws. Confirm that imputing
# (rather than dropping) them is intended.
train_test_data = train_test_data.fillna(0)

# Same zero-fill for the unseen data (the rebinding overwrites the raw frame).
unseen_data = unseen_data.fillna(0)

# File names for the NaN-handled outputs (stage letters C and D).
new_csv_filename_train_test = 'C_NaN_Handled_Train_Test_Data.csv'
new_csv_filename_unseen = 'D_NaN_Handled_Unseen_Data.csv'

# Full output paths inside the shared dataset directory from Cell 1.2.
new_csv_path_train_test = os.path.join(drive_dataset_directory, new_csv_filename_train_test)
new_csv_path_unseen = os.path.join(drive_dataset_directory, new_csv_filename_unseen)

# Write the zero-filled training/testing data without the index column.
train_test_data.to_csv(new_csv_path_train_test, index=False)

# Write the zero-filled unseen data without the index column.
unseen_data.to_csv(new_csv_path_unseen, index=False)

# Confirmation message once both files are written.
print("Preprocessing and saving of datasets is complete.")

# Re-run the per-column NaN count on the training/testing data as a sanity check.
print("\nNaN check for preprocessed training/testing data:")
print(train_test_data.isna().sum())

# Re-run the per-column NaN count on the unseen data as a sanity check.
print("\nNaN check for preprocessed unseen data:")
print(unseen_data.isna().sum())


Preprocessing and saving of datasets is complete.

NaN check for preprocessed training/testing data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_Entry-2    0
DR1_Mov_Avg         0
DR1_Vert_Avg        0
Draw2               0
DR2_Prev_Week       0
DR2_2Weeks          0
DR2_Prev_Entry      0
DR2_Prev_Entry-2    0
DR2_Mov_Avg         0
DR2_Vert_Avg        0
Draw3               0
DR3_Prev_Week       0
DR3_2Weeks          0
DR3_Prev_Entry      0
DR3_Prev_Entry-2    0
DR3_Mov_Avg         0
DR3_Vert_Avg        0
Draw4               0
DR4_Prev_Week       0
DR4_2Weeks          0
DR4_Prev_Entry      0
DR4_Prev_Entry-2    0
DR4_Mov_Avg         0
DR4_Vert_Avg        0
dtype: int64

NaN check for preprocessed unseen data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_

In [5]:
# Cell 2.2: Extract Y/M/D from Date and new CSV saving for Training / Testing  and Unseen datasets

# Check and load the datasets.
# Defined before its first use in this cell: previously the two loads below ran
# ahead of this definition and therefore resolved to whichever load_dataset was
# already in the kernel (the one from Cell 1.2 on a fresh run).
# NOTE(review): this duplicates Cell 1.2's load_dataset apart from the
# "not found" message; consider keeping a single definition.
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    Prints a status line. Returns ``None`` (without reading anything) when
    ``file_path`` is not an existing regular file.
    """
    if os.path.isfile(file_path):
        print("File found. Proceeding to load the dataset.")
        return pd.read_csv(file_path)
    else:
        print("File not found. Check the file path.")
        return None

# Load the NaN-handled training/testing data
nan_handled_train_test_data = load_dataset(new_csv_path_train_test)

# Load the NaN-handled unseen data
nan_handled_unseen_data = load_dataset(new_csv_path_unseen)

# Function to extract 'Year', 'Month', and 'Day' from the 'Date' column
def extract_date_features(data, date_formats=('%d-%m-%y', '%d/%m/%Y')):
    """Replace the 'Date' column of ``data`` with integer 'Year', 'Month' and 'Day'.

    The frame is modified in place. Each pattern in ``date_formats`` is tried
    in order against the whole column; the first one that parses every value
    is used. If no pattern matches, or there is no 'Date' column, a message is
    printed and the frame is left untouched.

    Args:
        data: DataFrame expected to carry a 'Date' column of date strings.
        date_formats: strptime-style patterns to try, in priority order.

    Returns:
        The same ``data`` object. Callers that ignore the return value and
        rely on the in-place change keep working.
    """
    if 'Date' not in data.columns:
        print("'Date' column not found in the dataset.")
        return data

    print("Converting 'Date' to datetime and extracting Year, Month, and Day...")
    for date_format in date_formats:
        try:
            # Parse into a local first so a failed attempt never leaves the
            # frame half-converted.
            parsed_dates = pd.to_datetime(data['Date'], format=date_format)
        except ValueError:
            print(f"Failed to convert 'Date' with format: {date_format}")
            continue

        # Missing dates (NaT) yield NaN components; store those as 0.
        data['Year'] = parsed_dates.dt.year.fillna(0).astype(int)
        data['Month'] = parsed_dates.dt.month.fillna(0).astype(int)
        data['Day'] = parsed_dates.dt.day.fillna(0).astype(int)
        data.drop(columns=['Date'], inplace=True)
        print("After extracting Year, Month, and Day:", data.columns)
        break
    else:
        # Every pattern failed: say so explicitly instead of ending silently
        # with the 'Date' strings still in place.
        print("Could not convert 'Date' with any of the supported formats; dataset left unchanged.")

    return data

# Replace 'Date' with Year/Month/Day in the training/testing data.
# extract_date_features() works in place, so from here on the variable
# `nan_handled_train_test_data` holds the date-extracted frame.
extract_date_features(nan_handled_train_test_data)

# Same in-place conversion for the unseen data.
extract_date_features(nan_handled_unseen_data)

# File names for the date-extracted outputs (stage letters E and F).
new_csv_filename_train_test_date = 'E_Date_Extracted_Train_Test_Data.csv'
new_csv_filename_unseen_date = 'F_Date_Extracted_Unseen_Data.csv'

# Full output paths inside the shared dataset directory from Cell 1.2.
new_csv_path_train_test_date = os.path.join(drive_dataset_directory, new_csv_filename_train_test_date)
new_csv_path_unseen_date = os.path.join(drive_dataset_directory, new_csv_filename_unseen_date)

# Write both frames without the index column.
# NOTE(review): the files are written whether or not the date conversion
# succeeded; check the printed messages above before trusting the output.
nan_handled_train_test_data.to_csv(new_csv_path_train_test_date, index=False)
nan_handled_unseen_data.to_csv(new_csv_path_unseen_date, index=False)

# Confirmation message once both files are written.
print("Date extraction and saving of datasets is complete.")

# Per-column NaN counts after date extraction, for both datasets.
print("\nNaN check for training/testing data with extracted date features:")
print(nan_handled_train_test_data.isna().sum())

print("\nNaN check for unseen data with extracted date features:")
print(nan_handled_unseen_data.isna().sum())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
Converting 'Date' to datetime and extracting Year, Month, and Day...
After extracting Year, Month, and Day: Index(['Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week', 'DR1_2Weeks',
       'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg', 'DR1_Vert_Avg',
       'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks', 'DR2_Prev_Entry',
       'DR2_Prev_Entry-2', 'DR2_Mov_Avg', 'DR2_Vert_Avg', 'Draw3',
       'DR3_Prev_Week', 'DR3_2Weeks', 'DR3_Prev_Entry', 'DR3_Prev_Entry-2',
       'DR3_Mov_Avg', 'DR3_Vert_Avg', 'Draw4', 'DR4_Prev_Week', 'DR4_2Weeks',
       'DR4_Prev_Entry', 'DR4_Prev_Entry-2', 'DR4_Mov_Avg', 'DR4_Vert_Avg',
       'Year', 'Month', 'Day'],
      dtype='object')
Converting 'Date' to datetime and extracting Year, Month, and Day...
Failed to convert 'Date' with format: %d-%m-%y
After extracting Year, Month, and Day: Index(['Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week', 'DR1_2Weeks',
       'D

In [6]:
# Cell 2.3: Create shifted columns for previous day's data

# Function to create shifted columns for previous day's data
def create_shifted_columns(data):
    """Add 'Prev_*' columns carrying each draw's value from the preceding row.

    ``data`` is modified in place and nothing is returned. The first row has
    no predecessor, so its shifted values are replaced by 0 before the four
    new columns are cast to integers.
    """
    prev_to_draw = {
        'Prev_Morning': 'Draw1',
        'Prev_Afternoon': 'Draw2',
        'Prev_Evening': 'Draw3',
        'Prev_Night': 'Draw4',
    }

    # Shift each draw column down by one row into its 'Prev_*' counterpart.
    for prev_name, draw_name in prev_to_draw.items():
        data[prev_name] = data[draw_name].shift(1)

    # Fill the gap left by the shift and cast all four columns at once.
    prev_names = list(prev_to_draw)
    shifted_block = data[prev_names]
    data[prev_names] = shifted_block.fillna(0).astype(int)

# Build the file locations from the shared dataset directory (Cell 1.2)
# instead of repeating the absolute Drive path in every call.
date_extracted_train_test_path = os.path.join(drive_dataset_directory, 'E_Date_Extracted_Train_Test_Data.csv')
date_extracted_unseen_path = os.path.join(drive_dataset_directory, 'F_Date_Extracted_Unseen_Data.csv')
shifted_train_test_path = os.path.join(drive_dataset_directory, 'G_Shifted_Train_Test_Data.csv')
shifted_unseen_path = os.path.join(drive_dataset_directory, 'H_Shifted_Unseen_Data.csv')

# Load the date extracted training/testing data
date_extracted_train_test_data = load_dataset(date_extracted_train_test_path)

# Add the previous-row draw columns (in place)
create_shifted_columns(date_extracted_train_test_data)

# Save the updated training/testing data with shifted columns
date_extracted_train_test_data.to_csv(shifted_train_test_path, index=False)

# Load the date extracted unseen data
date_extracted_unseen_data = load_dataset(date_extracted_unseen_path)

# Add the previous-row draw columns (in place)
create_shifted_columns(date_extracted_unseen_data)

# Save the updated unseen data with shifted columns
date_extracted_unseen_data.to_csv(shifted_unseen_path, index=False)

# Print the first few rows of both datasets for inspection
print("First few rows of date extracted training/testing data:")
print(date_extracted_train_test_data.head())

print("\nFirst few rows of date extracted unseen data:")
print(date_extracted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of date extracted training/testing data:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training   19.0            7.0        27.0            23.0   
1           2  Training   31.0           11.0         1.0             9.0   
2           3  Training   15.0           19.0        21.0            12.0   
3           4  Training   31.0           35.0        18.0            35.0   
4           5         0    0.0            0.0         0.0             0.0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  DR4_Prev_Entry-2  \
0              32.0         27.5          17.0   14.0  ...              14.0   
1              33.0         21.0           6.0    3.0  ...               3.0   
2              35.0         23.5          20.0    9.0  ...               9.0   
3              23.0         29.0          26.5   21.0  ...              2

In [7]:
# Cell 2.4: Handle NaN values for previous day's data

# Build the file locations from the shared dataset directory (Cell 1.2)
# instead of repeating the absolute Drive path in every call.
shifted_train_test_path = os.path.join(drive_dataset_directory, 'G_Shifted_Train_Test_Data.csv')
shifted_unseen_path = os.path.join(drive_dataset_directory, 'H_Shifted_Unseen_Data.csv')
handled_shifted_train_test_path = os.path.join(drive_dataset_directory, 'I_Handled_Shifted_Train_Test_Data.csv')
handled_shifted_unseen_path = os.path.join(drive_dataset_directory, 'J_Handled_Shifted_Unseen_Data.csv')

# The first row of each dataset has no predecessor, so its 'Prev_*' values are
# supplied by hand.
# NOTE(review): presumably these are the real draws of the day before each
# dataset starts -- confirm the source of the numbers.
first_row_prev_train_test = {'Prev_Morning': 13, 'Prev_Afternoon': 34, 'Prev_Evening': 32, 'Prev_Night': 23}
first_row_prev_unseen = {'Prev_Morning': 25, 'Prev_Afternoon': 9, 'Prev_Evening': 7, 'Prev_Night': 5}

# Load the shifted training/testing data
shifted_train_test_data = load_dataset(shifted_train_test_path)

# Manually set values for the first row of training/testing set
for prev_column, prev_value in first_row_prev_train_test.items():
    shifted_train_test_data.at[0, prev_column] = prev_value

# Save the updated training/testing data
shifted_train_test_data.to_csv(handled_shifted_train_test_path, index=False)

# Load the shifted unseen data
shifted_unseen_data = load_dataset(shifted_unseen_path)

# Manually set values for the first row of unseen set
for prev_column, prev_value in first_row_prev_unseen.items():
    shifted_unseen_data.at[0, prev_column] = prev_value

# Save the updated unseen data
shifted_unseen_data.to_csv(handled_shifted_unseen_path, index=False)

# Print the first few rows of both datasets for inspection
print("First few rows of handled shifted training/testing data:")
print(shifted_train_test_data.head())

print("\nFirst few rows of handled shifted unseen data:")
print(shifted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of handled shifted training/testing data:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training   19.0            7.0        27.0            23.0   
1           2  Training   31.0           11.0         1.0             9.0   
2           3  Training   15.0           19.0        21.0            12.0   
3           4  Training   31.0           35.0        18.0            35.0   
4           5         0    0.0            0.0         0.0             0.0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  DR4_Prev_Entry-2  \
0              32.0         27.5          17.0   14.0  ...              14.0   
1              33.0         21.0           6.0    3.0  ...               3.0   
2              35.0         23.5          20.0    9.0  ...               9.0   
3              23.0         29.0          26.5   21.0  ...              

In [8]:
# Cell 3.1: # Initialize TARGET VARIABLE 'Prediction1' column

# Build the file locations from the shared dataset directory (Cell 1.2)
# instead of repeating the absolute Drive path in every call.
handled_shifted_train_test_path = os.path.join(drive_dataset_directory, 'I_Handled_Shifted_Train_Test_Data.csv')
handled_shifted_unseen_path = os.path.join(drive_dataset_directory, 'J_Handled_Shifted_Unseen_Data.csv')
prediction1_train_test_path = os.path.join(drive_dataset_directory, 'K_Handled_Prediction1_Train_Test_Data.csv')
prediction1_unseen_path = os.path.join(drive_dataset_directory, 'L_Handled_Prediction1_Unseen_Data.csv')

# Load the handled shifted training/testing data
handled_shifted_train_test_data = load_dataset(handled_shifted_train_test_path)

# Target column for training/testing data: a copy of 'Draw1'.
# NOTE(review): 'Draw1' itself stays in the frame, so it must be excluded from
# the model features or the target leaks into the inputs -- confirm downstream.
handled_shifted_train_test_data['Prediction1'] = handled_shifted_train_test_data['Draw1']

# Save the updated training/testing data
handled_shifted_train_test_data.to_csv(prediction1_train_test_path, index=False)

# Load the handled shifted unseen data
handled_shifted_unseen_data = load_dataset(handled_shifted_unseen_path)

# Initialize 'Prediction1' column with zero (0) for the unseen data
handled_shifted_unseen_data['Prediction1'] = 0

# Save the updated unseen data
handled_shifted_unseen_data.to_csv(prediction1_unseen_path, index=False)

# Print the first few rows of both datasets for inspection
print("First few rows of handled Prediction1 training/testing data:")
print(handled_shifted_train_test_data.head())

print("\nFirst few rows of handled Prediction1 unseen data:")
print(handled_shifted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of handled Prediction1 training/testing data:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training   19.0            7.0        27.0            23.0   
1           2  Training   31.0           11.0         1.0             9.0   
2           3  Training   15.0           19.0        21.0            12.0   
3           4  Training   31.0           35.0        18.0            35.0   
4           5         0    0.0            0.0         0.0             0.0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  DR4_Mov_Avg  \
0              32.0         27.5          17.0   14.0  ...         23.5   
1              33.0         21.0           6.0    3.0  ...         19.0   
2              35.0         23.5          20.0    9.0  ...         16.0   
3              23.0         29.0          26.5   21.0  ...         25.0   
4            

In [9]:
# Cell 3.2: # Converting the columns to integer in both datasets

# Build the file locations from the shared dataset directory (Cell 1.2)
# instead of repeating the absolute Drive path in every call.
# NOTE(review): this cell reads and then overwrites the same K/L files, so the
# pre-conversion versions are not kept.
prediction1_train_test_path = os.path.join(drive_dataset_directory, 'K_Handled_Prediction1_Train_Test_Data.csv')
prediction1_unseen_path = os.path.join(drive_dataset_directory, 'L_Handled_Prediction1_Unseen_Data.csv')

# Load the most recent CSVs
train_test_data = pd.read_csv(prediction1_train_test_path)
unseen_data = pd.read_csv(prediction1_unseen_path)

# List of columns to convert to integer (excluding 'Data_Type')
columns_to_convert_train_test = [col for col in train_test_data.columns if col != 'Data_Type']
columns_to_convert_unseen = [col for col in unseen_data.columns if col != 'Data_Type']

# Convert columns to integer.
# NOTE(review): astype(int) truncates fractions, so averaged columns lose
# their decimals (e.g. a moving average of 27.5 becomes 27).
train_test_data[columns_to_convert_train_test] = train_test_data[columns_to_convert_train_test].astype(int)
unseen_data[columns_to_convert_unseen] = unseen_data[columns_to_convert_unseen].astype(int)

# Save the updated datasets
train_test_data.to_csv(prediction1_train_test_path, index=False)
unseen_data.to_csv(prediction1_unseen_path, index=False)

# Display the data types of the columns after conversion
print("Data types of columns in train/test data after conversion:")
print(train_test_data.dtypes)

print("\nData types of columns in unseen data after conversion:")
print(unseen_data.dtypes)


Data types of columns in train/test data after conversion:
Row Number           int64
Data_Type           object
Draw1                int64
DR1_Prev_Week        int64
DR1_2Weeks           int64
DR1_Prev_Entry       int64
DR1_Prev_Entry-2     int64
DR1_Mov_Avg          int64
DR1_Vert_Avg         int64
Draw2                int64
DR2_Prev_Week        int64
DR2_2Weeks           int64
DR2_Prev_Entry       int64
DR2_Prev_Entry-2     int64
DR2_Mov_Avg          int64
DR2_Vert_Avg         int64
Draw3                int64
DR3_Prev_Week        int64
DR3_2Weeks           int64
DR3_Prev_Entry       int64
DR3_Prev_Entry-2     int64
DR3_Mov_Avg          int64
DR3_Vert_Avg         int64
Draw4                int64
DR4_Prev_Week        int64
DR4_2Weeks           int64
DR4_Prev_Entry       int64
DR4_Prev_Entry-2     int64
DR4_Mov_Avg          int64
DR4_Vert_Avg         int64
Year                 int64
Month                int64
Day                  int64
Prev_Morning         int64
Prev_Afternoon       in

In [19]:
# Cell 4.1: # Introducing "Lines" as a new feature in both datasets

# Load the most recent CSVs, locating them through the shared dataset
# directory (Cell 1.2) rather than a repeated absolute Drive path.
train_test_data = pd.read_csv(os.path.join(drive_dataset_directory, 'K_Handled_Prediction1_Train_Test_Data.csv'))
unseen_data = pd.read_csv(os.path.join(drive_dataset_directory, 'L_Handled_Prediction1_Unseen_Data.csv'))

# Function to assign "Lines" based on the sum of digits
def assign_lines(data, column_name):
    """Add a 'Lines_<column_name>' column with the repeated digit sum of each value.

    The digits of a value are summed, and the sum is summed again until a
    single digit remains (e.g. 19 -> 10 -> 1). Zero maps to 0. ``data`` is
    modified in place and nothing is returned.

    Args:
        data: DataFrame containing ``column_name``.
        column_name: name of the column whose values are reduced.
    """
    def get_lines(x):
        try:
            # Whole-number floats (e.g. 27.0, as columns load before the
            # integer-conversion step) stringify as '27.0'; the '.' used to
            # make every such value fall through to None. Normalise them first.
            if isinstance(x, float) and x.is_integer():
                x = int(x)
            # Calculate the sum of digits
            sum_of_digits = sum(map(int, str(x)))
            # Collapse multi-digit sums until a single digit (0-9) remains
            while sum_of_digits > 9:
                sum_of_digits = sum(map(int, str(sum_of_digits)))
            return sum_of_digits
        except (ValueError, TypeError):
            # Anything that is not a plain non-negative whole number
            # (text, NaN, negatives, fractions) has no digit sum here.
            return None

    data[f'Lines_{column_name}'] = data[column_name].apply(get_lines)

# List of columns to assign "Lines"
columns_to_assign_lines = ['DR1_Prev_Week', 'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2']

# Assign "Lines" to columns in train/test data
for column in columns_to_assign_lines:
    assign_lines(train_test_data, column)

# Assign "Lines" to columns in unseen data
for column in columns_to_assign_lines:
    assign_lines(unseen_data, column)

# Save the updated datasets, locating them through the shared dataset
# directory (Cell 1.2) rather than a repeated absolute Drive path.
train_test_data.to_csv(os.path.join(drive_dataset_directory, 'M_Lines_Train_Test_Data.csv'), index=False)
unseen_data.to_csv(os.path.join(drive_dataset_directory, 'N_Lines_Unseen_Data.csv'), index=False)

# Display the first few rows of both datasets to verify the "Lines" assignment
print("First few rows of train/test data with 'Lines' assigned:")
print(train_test_data.head())

print("\nFirst few rows of unseen data with 'Lines' assigned:")
print(unseen_data.head())


First few rows of train/test data with 'Lines' assigned:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training     19              7          27              23   
1           2  Training     31             11           1               9   
2           3  Training     15             19          21              12   
3           4  Training     31             35          18              35   
4           5         0      0              0           0               0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  Day  Prev_Morning  \
0                32           27            17     14  ...    1            13   
1                33           21             6      3  ...    2            19   
2                35           23            20      9  ...    3            31   
3                23           29            26     21  ...    4            15   
4                 0            0             0      0  ...    5            

In [20]:
# Cell 4.2: # Introducing "Special Groups" as a new feature in both datasets

# Load the most recent CSVs, locating them through the shared dataset
# directory (Cell 1.2) rather than a repeated absolute Drive path.
train_test_data = pd.read_csv(os.path.join(drive_dataset_directory, 'M_Lines_Train_Test_Data.csv'))
unseen_data = pd.read_csv(os.path.join(drive_dataset_directory, 'N_Lines_Unseen_Data.csv'))

# Mapping from draw number (1-36, each listed exactly once) to the numeric
# code of its "Special Group"; the group label is given at the end of each row.
special_groups_mapping = {
    2: 1, 15: 1, 16: 1, 24: 1, 31: 1,  # "Ladies"
    4: 2, 5: 2, 12: 2, 29: 2, 34: 2,  # "Men"
    11: 3, 17: 3, 26: 3,  # "Birds"
    7: 4, 9: 4, 19: 4, 20: 4, 22: 4, 30: 4, 36: 4,  # "Domestic Animals"
    8: 5, 10: 5, 13: 5, 25: 5,  # "Wild Animals"
    18: 6, 28: 6, 32: 6,  # "Ocean"
    1: 7, 27: 7, 33: 7, 35: 7,  # "Snakes & Insects"
    3: 8, 6: 8, 14: 8, 21: 8, 23: 8  # "Home"
}

# Function to assign "Special Groups" based on the mapping
def assign_special_groups(data, column_name, special_groups_mapping):
    """Add a 'Special_Groups_<column_name>' column looked up from the mapping.

    Each value of ``data[column_name]`` is translated through
    ``special_groups_mapping``; values without an entry get group 0.
    ``data`` is modified in place and nothing is returned.
    """
    group_codes = data[column_name].map(special_groups_mapping)
    group_codes = group_codes.fillna(0)
    target_column = f'Special_Groups_{column_name}'
    data[target_column] = group_codes.astype(int)

# List of columns to assign "Special Groups"
columns_to_assign_special_groups = ['DR1_Prev_Week', 'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2']

# Assign "Special Groups" for specified columns in train/test data
for column in columns_to_assign_special_groups:
    assign_special_groups(train_test_data, column, special_groups_mapping)

# Assign "Special Groups" for specified columns in unseen data
for column in columns_to_assign_special_groups:
    assign_special_groups(unseen_data, column, special_groups_mapping)

# Save the updated datasets, locating them through the shared dataset
# directory (Cell 1.2) rather than a repeated absolute Drive path.
train_test_data.to_csv(os.path.join(drive_dataset_directory, 'O_Special_Groups_Train_Test_Data.csv'), index=False)
unseen_data.to_csv(os.path.join(drive_dataset_directory, 'P_Special_Groups_Unseen_Data.csv'), index=False)

# Display the first few rows of both datasets to verify the "Special Groups" assignment
print("First few rows of train/test data with 'Special Groups' assigned:")
print(train_test_data.head())

print("\nFirst few rows of unseen data with 'Special Groups' assigned:")
print(unseen_data.head())


First few rows of train/test data with 'Special Groups' assigned:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training     19              7          27              23   
1           2  Training     31             11           1               9   
2           3  Training     15             19          21              12   
3           4  Training     31             35          18              35   
4           5         0      0              0           0               0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  Prev_Night  \
0                32           27            17     14  ...          23   
1                33           21             6      3  ...           9   
2                35           23            20      9  ...          12   
3                23           29            26     21  ...          35   
4                 0            0             0      0  ...          16   

   Prediction1  Lines_DR1_

In [21]:
# Cell 4.3: # Introducing "Spirits" as a new feature in both datasets

import pandas as pd

# Load the most recent CSVs, locating them through the shared dataset
# directory (Cell 1.2) rather than a repeated absolute Drive path.
train_test_data = pd.read_csv(os.path.join(drive_dataset_directory, 'O_Special_Groups_Train_Test_Data.csv'))
unseen_data = pd.read_csv(os.path.join(drive_dataset_directory, 'P_Special_Groups_Unseen_Data.csv'))

# Mapping from each draw number (1-36) to its "Spirits" partner number.
# The pairing is symmetric: every partner maps back to the original number.
spirits_mapping = {
    1: 5,
    2: 24,
    3: 19,
    4: 35,
    5: 1,
    6: 15,
    7: 13,
    8: 29,
    9: 33,
    10: 28,
    11: 36,
    12: 32,
    13: 7,
    14: 25,
    15: 6,
    16: 17,
    17: 16,
    18: 30,
    19: 3,
    20: 22,
    21: 23,
    22: 20,
    23: 21,
    24: 2,
    25: 14,
    26: 27,
    27: 26,
    28: 10,
    29: 8,
    30: 18,
    31: 34,
    32: 12,
    33: 9,
    34: 31,
    35: 4,
    36: 11
}

# Function to assign "Spirits" based on the mapping
def assign_spirits(data, column_name, spirits_mapping):
    """Add a 'Spirits_<column_name>' column looked up from the mapping.

    Each value of ``data[column_name]`` is translated through
    ``spirits_mapping``; values without an entry become 0. ``data`` is
    modified in place and nothing is returned.
    """
    partner_numbers = data[column_name].map(spirits_mapping)
    partner_numbers = partner_numbers.fillna(0)
    target_column = f'Spirits_{column_name}'
    data[target_column] = partner_numbers.astype(int)

# List of columns to assign "Spirits"
columns_to_assign_spirits = ['DR1_Prev_Week', 'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2']

# Assign "Spirits" for specified columns in train/test data
for column in columns_to_assign_spirits:
    assign_spirits(train_test_data, column, spirits_mapping)

# Assign "Spirits" for specified columns in unseen data
for column in columns_to_assign_spirits:
    assign_spirits(unseen_data, column, spirits_mapping)

# Save the updated datasets, locating them through the shared dataset
# directory (Cell 1.2) rather than a repeated absolute Drive path.
train_test_data.to_csv(os.path.join(drive_dataset_directory, 'Q_Spirits_Train_Test_Data.csv'), index=False)
unseen_data.to_csv(os.path.join(drive_dataset_directory, 'R_Spirits_Unseen_Data.csv'), index=False)

# Display the first few rows of both datasets to verify the "Spirits" assignment
print("First few rows of train/test data with 'Spirits' assigned:")
print(train_test_data.head())

print("\nFirst few rows of unseen data with 'Spirits' assigned:")
print(unseen_data.head())


First few rows of train/test data with 'Spirits' assigned:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training     19              7          27              23   
1           2  Training     31             11           1               9   
2           3  Training     15             19          21              12   
3           4  Training     31             35          18              35   
4           5         0      0              0           0               0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  \
0                32           27            17     14  ...   
1                33           21             6      3  ...   
2                35           23            20      9  ...   
3                23           29            26     21  ...   
4                 0            0             0      0  ...   

   Lines_DR1_Prev_Entry  Lines_DR1_Prev_Entry-2  Special_Groups_DR1_Prev_Week  \
0                     5 

In [13]:
# Cell 4.4: Introducing "Rakes" as a new feature in both datasets
#
# This cell only defines the "Rakes" lookup table and pickles it; the table is
# applied to the datasets in the following cell.

# NOTE: pandas is imported here but not used anywhere in this cell.
import pandas as pd

# Define a mapping dictionary for "Rakes" associations with lists as values
#
# Keys are the integers 1-36. Each value is a list of label strings of the form
# "<number> <name>". The leading number in a label is presumably the paired
# draw number - TODO confirm with the data owner.
# Several keys share the same leading number in their labels (e.g. "14 ..."
# appears under keys 4, 6, 8, 14, 20 and 27), but every full label string is
# distinct text, so each label is its own category.
rakes_mapping = {
    1: ['36 First and Last', '7 Hog and Knife'],
    2: ['16 Old Jamette', '24 Old Fowl'],
    3: ['34 Three Blind Mice', '22 Three Blind Mice2', '35 Carriage On Road', '19 Horse and Carriage'],
    4: ['32 Dead Wood', '21 Death Announcement', '14 Dead Money', '26 Crowd from dead', '7 Sunset Drive', '11 Sunset Drive2'],
    5: ['31 Parson Man, Parson Wife'],
    6: ['25 Back and Belly', '32 Wood in Belly', '14 Bag of Money'],
    7: ['21 Hog Mouth'],
    8: ['3 Tiger In Cage', '33 Lion In Net', '7 Tiger Hunting', '14 Blood Money'],
    9: ['32 Bull Pistle', '11 Sept 11th'],
    10: ['32 Monkey Shrimps', '27 Monkey On Vine'],
    11: ['17 Black and White'],
    12: ['10 King Kong', '9 Clear or dirty water', '1 King and I'],
    13: ['10 Girl Child, Boy Child'],
    14: ['6 Money In Pocket', '23 Money In Bank', '33 Big Money, Small Money'],
    15: ['20 Sick like a Dog'],
    16: ['23 Jamette In Hotel', '35 Jamette Wining'],
    17: ['29 Young Drunk'],
    18: ['29 Rock D Boat', '28 Fish In the Boat'],
    19: ['35 Horse on Track', '32 Horse Wood', '36 Horse and Ass'],
    20: ['5 Worm On Fog', '24 Dog Food', '14 Dog Money', '30 Dog and Cat'],
    # NOTE: the label below contains a typographic apostrophe (’), not an ASCII
    # quote; any code matching this label must use the same character.
    21: ['19 Straight from the Horse’s Mouth'],
    22: ['24 Rat Looking For Goods', '30 Tom and Jerry', '32 Rat Wood'],
    23: ['31 House Wife'],
    24: ['21 Food In Mouth'],
    25: ['13 Hard Back, Soft Back'],
    26: ['15 Fowl Sickness'],
    27: ['35 Little Snake, Big Snake', '19 Horse Whip', '14 Coil of Money'],
    28: ['36 Fish in Sea', '12 King Fish', '6 Fish Guts', '33 Fish in Net'],
    29: ['6 Rum Belly', '13 Drunk and Spread Out', '1 Rum Bottle', '16 Drunk like Jamette'],
    30: ['35 Golden Cobra', '6 Gold Sack', '6 Cat in Bag'],
    31: ['16 Big & Small Jamette', '5 Parson Man and Wife'],
    32: ['1 Bottle & Spoon'],
    33: ['9 Cow eating Grass', '5 Spider-man', '10 Spider Monkey'],
    34: ['1 Cemetery & Lights'],
    35: ['12 King Cobra'],
    36: ['18 Bridge & Water']
}

# Save the mapping dictionary
# The pickle is written to a relative path, i.e. the kernel's current working
# directory, not to the Drive folder used for the CSV outputs.
import pickle

with open('rakes_mapping.pkl', 'wb') as f:
    pickle.dump(rakes_mapping, f)

In [23]:
# Cell 4.5: Applying "Rakes" MultiLabelBinarizer Function


import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Read back the two datasets written by the "Spirits" step
train_test_data = pd.read_csv(
    '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/Q_Spirits_Train_Test_Data.csv'
)
unseen_data = pd.read_csv(
    '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/R_Spirits_Unseen_Data.csv'
)

# Columns whose values are looked up in the "Rakes" mapping
columns_to_assign_rakes = [
    'DR1_Prev_Week',
    'DR1_2Weeks',
    'DR1_Prev_Entry',
    'DR1_Prev_Entry-2',
]

def consolidate_rakes(row, rakes_mapping, columns_to_assign_rakes):
    """Gather the "Rakes" labels for one row across the selected columns.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row).
        rakes_mapping: Dict mapping a number to its list of "Rakes" labels.
        columns_to_assign_rakes: Names of the columns whose values are looked up.

    Returns:
        A new flat list of labels, ordered by column. A value with no entry in
        ``rakes_mapping`` contributes nothing; repeated labels are kept.
    """
    return [
        rake
        for column in columns_to_assign_rakes
        for rake in rakes_mapping.get(row[column], [])
    ]

# Attach a temporary per-row list of "Rakes" labels to each dataset
rakes_kwargs = {
    'rakes_mapping': rakes_mapping,
    'columns_to_assign_rakes': columns_to_assign_rakes,
}
train_test_data['Consolidated_Rakes'] = train_test_data.apply(consolidate_rakes, axis=1, **rakes_kwargs)
unseen_data['Consolidated_Rakes'] = unseen_data.apply(consolidate_rakes, axis=1, **rakes_kwargs)

# The binarizer learns its label set from the train/test rows only; the
# fitted instance is then reused for the unseen rows so both frames end up
# with the same indicator columns.
mlb = MultiLabelBinarizer()
train_test_rakes_binary = mlb.fit_transform(train_test_data['Consolidated_Rakes'])
unseen_rakes_binary = mlb.transform(unseen_data['Consolidated_Rakes'])

# Wrap the 0/1 arrays in DataFrames, one column per learned label
train_test_rakes_binary_df = pd.DataFrame(train_test_rakes_binary, columns=mlb.classes_)
unseen_rakes_binary_df = pd.DataFrame(unseen_rakes_binary, columns=mlb.classes_)

# Append the indicator columns, then discard the temporary list column
train_test_data = pd.concat(
    [train_test_data, train_test_rakes_binary_df], axis=1
).drop(columns=['Consolidated_Rakes'])
unseen_data = pd.concat(
    [unseen_data, unseen_rakes_binary_df], axis=1
).drop(columns=['Consolidated_Rakes'])

# Write the updated datasets
rakes_outputs = [
    (train_test_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/S_Rakes_Train_Test_Data.csv'),
    (unseen_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/T_Rakes_Unseen_Data.csv'),
]
for rakes_frame, rakes_path in rakes_outputs:
    rakes_frame.to_csv(rakes_path, index=False)

# Preview the head of each dataset to check the new "Rakes" columns
print("First few rows of train/test data with new 'Rakes' columns:")
print(train_test_data.head())

print("\nFirst few rows of unseen data with new 'Rakes' columns:")
print(unseen_data.head())


First few rows of train/test data with new 'Rakes' columns:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training     19              7          27              23   
1           2  Training     31             11           1               9   
2           3  Training     15             19          21              12   
3           4  Training     31             35          18              35   
4           5         0      0              0           0               0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  6 Cat in Bag  \
0                32           27            17     14  ...             0   
1                33           21             6      3  ...             0   
2                35           23            20      9  ...             0   
3                23           29            26     21  ...             0   
4                 0            0             0      0  ...             0   

   6 Fish Guts  6 Go

In [24]:
# Cell 5.1: Changing Data_Type column from categorical to numerical

import pandas as pd

# Load the most recent CSVs
train_test_data = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/S_Rakes_Train_Test_Data.csv')
unseen_data = pd.read_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/T_Rakes_Unseen_Data.csv')

# Numeric codes for the known 'Data_Type' labels. 0 is deliberately NOT a key:
# any value that is not one of these labels (e.g. the placeholder rows whose
# Data_Type is 0) fails the lookup and is encoded as 0 by encode_data_type.
data_type_mapping = {'Training': 1, 'Testing': 2, 'Unseen': 3}


def encode_data_type(frame, mapping):
    """Return the frame's 'Data_Type' column encoded as integers.

    Args:
        frame: DataFrame that has a 'Data_Type' column.
        mapping: Dict from 'Data_Type' label to its integer code.

    Returns:
        A new integer Series aligned with ``frame``; labels missing from
        ``mapping`` become 0. ``frame`` itself is not modified.
    """
    # map() yields NaN for unmapped labels; fill those with 0 before the int
    # cast. Chaining (rather than fillna(inplace=True) on a column selection)
    # avoids pandas' chained-assignment pitfalls.
    return frame['Data_Type'].map(mapping).fillna(0).astype(int)


# Each dataset is encoded from its OWN 'Data_Type' column. Previously the
# unseen column was derived from train_test_data, which copied the train/test
# codes onto the unseen rows by index instead of encoding 'Unseen' as 3.
train_test_data['Data_Type'] = encode_data_type(train_test_data, data_type_mapping)
unseen_data['Data_Type'] = encode_data_type(unseen_data, data_type_mapping)

# Save the updated datasets
train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/U_Encoded_Train_Test_Data.csv', index=False)
unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/V_Encoded_Unseen_Data.csv', index=False)

print("First few rows of training/testing data:")
print(train_test_data.head())

print("First few rows of unseen data:")
print(unseen_data.head())


First few rows of training/testing data:
   Row Number  Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1          1     19              7          27              23   
1           2          1     31             11           1               9   
2           3          1     15             19          21              12   
3           4          1     31             35          18              35   
4           5          0      0              0           0               0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  6 Cat in Bag  \
0                32           27            17     14  ...             0   
1                33           21             6      3  ...             0   
2                35           23            20      9  ...             0   
3                23           29            26     21  ...             0   
4                 0            0             0      0  ...             0   

   6 Fish Guts  6 Gold Sack  6 Mo

In [25]:
# Sanity report: per-column NaN counts and dtypes, train/test first, then unseen
sanity_reports = [
    ("NaN check for train/test data:", train_test_data.isnull().sum()),
    ("\nData types in train/test data:", train_test_data.dtypes),
    ("\nNaN check for unseen data:", unseen_data.isnull().sum()),
    ("\nData types in unseen data:", unseen_data.dtypes),
]
for heading, report in sanity_reports:
    print(heading)
    print(report)

# Write the datasets to the "Encoded" CSV files
encoded_outputs = [
    (train_test_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/U_Encoded_Train_Test_Data.csv'),
    (unseen_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/V_Encoded_Unseen_Data.csv'),
]
for encoded_frame, encoded_path in encoded_outputs:
    encoded_frame.to_csv(encoded_path, index=False)


NaN check for train/test data:
Row Number                0
Data_Type                 0
Draw1                     0
DR1_Prev_Week             0
DR1_2Weeks                0
                         ..
7 Hog and Knife           0
7 Sunset Drive            0
7 Tiger Hunting           0
9 Clear or dirty water    0
9 Cow eating Grass        0
Length: 129, dtype: int64

Data types in train/test data:
Row Number                int64
Data_Type                 int64
Draw1                     int64
DR1_Prev_Week             int64
DR1_2Weeks                int64
                          ...  
7 Hog and Knife           int64
7 Sunset Drive            int64
7 Tiger Hunting           int64
9 Clear or dirty water    int64
9 Cow eating Grass        int64
Length: 129, dtype: object

NaN check for unseen data:
Row Number                0
Data_Type                 0
Draw1                     0
DR1_Prev_Week             0
DR1_2Weeks                0
                         ..
7 Hog and Knife           0


In [26]:
# Each dataset is written twice: once as the final output of this data-prep
# notebook, and once AS INITIAL DATASETS FOR DRAW 1 PREDICTIVE SCRIPT.
final_outputs = [
    # First copy: Initial_Data_Prep directory
    (train_test_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/W_Final_Train_Test_Data.csv'),
    (unseen_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Initial_Data_Prep/X_Final_Unseen_Data.csv'),
    # Second copy: Draw1_Predictive_Model directory
    (train_test_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Draw1_Predictive_Model/A_Initial_Train_Test_Data.csv'),
    (unseen_data, '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model/Draw1_Predictive_Model/B_Initial_Unseen_Data.csv'),
]
for final_frame, final_path in final_outputs:
    final_frame.to_csv(final_path, index=False)


:::::**THE END** *Thank You*:::::