<a href="https://colab.research.google.com/github/alvinfranklyndavis/Project2023_v3/blob/main/Data_Prep_GPT_4_Bard_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1.1: Package Installation and Library Import

# Check for existing libraries
!pip show pandas numpy

# Install or upgrade required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas==<desired_version> numpy==<desired_version>

# Import required libraries
import pandas as pd
import numpy as np
import logging
import os

# Set up logging to save logs in a file
log_file = 'project.log'
logging.basicConfig(filename=log_file, level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up virtual environment (optional but recommended)
# You can create a virtual environment with: !python -m venv myenv
# And activate it with: source myenv/bin/activate (Linux/macOS) or myenv\Scripts\activate (Windows)


Name: pandas
Version: 1.5.3
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: The Pandas Development Team
Author-email: pandas-dev@python.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, python-dateutil, pytz
Required-by: altair, arviz, bigframes, bokeh, bqplot, cmdstanpy, cufflinks, datascience, db-dtypes, dopamine-rl, fastai, geemap, geopandas, google-colab, gspread-dataframe, holoviews, ibis-framework, lida, mizani, mlxtend, pandas-datareader, pandas-gbq, panel, pins, plotnine, prophet, pymc, seaborn, sklearn-pandas, statsmodels, vega-datasets, xarray, yfinance
---
Name: numpy
Version: 1.23.5
Summary: NumPy is the fundamental package for array computing with Python.
Home-page: https://www.numpy.org
Author: Travis E. Oliphant et al.
Author-email: 
License: BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: albumentations, altair, arviz, as

In [2]:
# Cell 1.2: Data Loading from Google Drive Training / Testing  and Unseen datasets

import pandas as pd
import logging
import os
from google.colab import drive

# Set up logging
logger = logging.getLogger(__name__)

# Mount Google Drive
drive.mount('/content/drive')

# Define the directory for datasets in Google Drive
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/'

# Define the paths to the CSV files
csv_filename_train_test = 'A_Initial_Train_Test_Data.csv'
csv_filename_unseen = 'B_Initial_Unseen_Data.csv'

drive_csv_path_train_test = os.path.join(drive_dataset_directory, csv_filename_train_test)
drive_csv_path_unseen = os.path.join(drive_dataset_directory, csv_filename_unseen)

# Check and load the datasets
def load_dataset(file_path):
    if os.path.isfile(file_path):
        print("File found. Proceeding to load the dataset.")
        return pd.read_csv(file_path)
    else:
        print("File not found. Check the file path or the Google Drive mount.")
        return None

# Load training/testing data
train_test_data = load_dataset(drive_csv_path_train_test)

# Load unseen data
unseen_data = load_dataset(drive_csv_path_unseen)

# Print the first few rows of both datasets for inspection
print("First few rows of training/testing data:")
print(train_test_data.head())

print("\nFirst few rows of unseen data:")
print(unseen_data.head())


Mounted at /content/drive
File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of training/testing data:
       Date  Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  \
0  01-08-18           1  Training   19.0            7.0        27.0   
1  02-08-18           2  Training   31.0           11.0         1.0   
2  03-08-18           3  Training   15.0           19.0        21.0   
3  04-08-18           4  Training   31.0           35.0        18.0   
4  05-08-18           5       NaN    NaN            NaN         NaN   

   DR1_Prev_Entry  DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  ...  \
0            23.0              32.0         27.5          17.0  ...   
1             9.0              33.0         21.0           6.0  ...   
2            12.0              35.0         23.5          20.0  ...   
3            35.0              23.0         29.0          26.5  ...   
4             NaN               NaN          NaN           NaN 

In [3]:
# Cell 1.3: Surveillance Check for NaNs within both datasets

# Check for NaN values in training/testing data
print("NaN check for training/testing data:")
print(train_test_data.isna().sum())

# Check for NaN values in unseen data
print("\nNaN check for unseen data:")
print(unseen_data.isna().sum())


NaN check for training/testing data:
Date                  0
Row Number            0
Data_Type           239
Draw1               239
DR1_Prev_Week       239
DR1_2Weeks          239
DR1_Prev_Entry      239
DR1_Prev_Entry-2    239
DR1_Mov_Avg         239
DR1_Vert_Avg        239
Draw2               239
DR2_Prev_Week       239
DR2_2Weeks          239
DR2_Prev_Entry      239
DR2_Prev_Entry-2    239
DR2_Mov_Avg         239
DR2_Vert_Avg        239
Draw3               239
DR3_Prev_Week       239
DR3_2Weeks          239
DR3_Prev_Entry      239
DR3_Prev_Entry-2    239
DR3_Mov_Avg         239
DR3_Vert_Avg        239
Draw4               239
DR4_Prev_Week       239
DR4_2Weeks          239
DR4_Prev_Entry      239
DR4_Prev_Entry-2    239
DR4_Mov_Avg         239
DR4_Vert_Avg        239
dtype: int64

NaN check for unseen data:
Date                0
Row Number          0
Data_Type           4
Draw1               4
DR1_Prev_Week       4
DR1_2Weeks          4
DR1_Prev_Entry      4
DR1_Prev_Entry-2    4
DR

In [5]:
# Cell 2.1: NaN handling and new CSV saving for Training / Testing  and Unseen datasets


# Impute NaN values with zeros in training/testing data
train_test_data = train_test_data.fillna(0)

# Impute NaN values with zeros in unseen data
unseen_data = unseen_data.fillna(0)

# Define new CSV file names
new_csv_filename_train_test = 'C_NaN_Handled_Train_Test_Data.csv'
new_csv_filename_unseen = 'D_NaN_Handled_Unseen_Data.csv'

# Define the paths for saving the new CSV files
new_csv_path_train_test = os.path.join(drive_dataset_directory, new_csv_filename_train_test)
new_csv_path_unseen = os.path.join(drive_dataset_directory, new_csv_filename_unseen)

# Save the preprocessed training/testing data as a new CSV file
train_test_data.to_csv(new_csv_path_train_test, index=False)

# Save the preprocessed unseen data as a new CSV file
unseen_data.to_csv(new_csv_path_unseen, index=False)

# Print a message to confirm that the preprocessing and saving is complete
print("Preprocessing and saving of datasets is complete.")

# Check for NaN values in the preprocessed training/testing data
print("\nNaN check for preprocessed training/testing data:")
print(train_test_data.isna().sum())

# Check for NaN values in the preprocessed unseen data
print("\nNaN check for preprocessed unseen data:")
print(unseen_data.isna().sum())


Preprocessing and saving of datasets is complete.

NaN check for preprocessed training/testing data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_Entry-2    0
DR1_Mov_Avg         0
DR1_Vert_Avg        0
Draw2               0
DR2_Prev_Week       0
DR2_2Weeks          0
DR2_Prev_Entry      0
DR2_Prev_Entry-2    0
DR2_Mov_Avg         0
DR2_Vert_Avg        0
Draw3               0
DR3_Prev_Week       0
DR3_2Weeks          0
DR3_Prev_Entry      0
DR3_Prev_Entry-2    0
DR3_Mov_Avg         0
DR3_Vert_Avg        0
Draw4               0
DR4_Prev_Week       0
DR4_2Weeks          0
DR4_Prev_Entry      0
DR4_Prev_Entry-2    0
DR4_Mov_Avg         0
DR4_Vert_Avg        0
dtype: int64

NaN check for preprocessed unseen data:
Date                0
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_

In [7]:
# Cell 2.2: Extract Y/M/D from Date and new CSV saving for Training / Testing  and Unseen datasets

# Load the NaN-handled training/testing data
nan_handled_train_test_data = load_dataset(new_csv_path_train_test)

# Load the NaN-handled unseen data
nan_handled_unseen_data = load_dataset(new_csv_path_unseen)

# Check and load the datasets
def load_dataset(file_path):
    if os.path.isfile(file_path):
        print("File found. Proceeding to load the dataset.")
        return pd.read_csv(file_path)
    else:
        print("File not found. Check the file path.")
        return None

# Function to extract 'Year', 'Month', and 'Day' from the 'Date' column
def extract_date_features(data):
    if 'Date' in data.columns:
        print("Converting 'Date' to datetime and extracting Year, Month, and Day...")
        date_formats = ['%d-%m-%y', '%d/%m/%Y']
        for date_format in date_formats:
            try:
                data['Date'] = pd.to_datetime(data['Date'], format=date_format)
                data['Year'] = data['Date'].dt.year.fillna(0).astype(int)
                data['Month'] = data['Date'].dt.month.fillna(0).astype(int)
                data['Day'] = data['Date'].dt.day.fillna(0).astype(int)
                data.drop(columns=['Date'], inplace=True)
                print("After extracting Year, Month, and Day:", data.columns)
                break  # Break the loop if successful date conversion
            except ValueError:
                print(f"Failed to convert 'Date' with format: {date_format}")
    else:
        print("'Date' column not found in the dataset.")

# Extract 'Year', 'Month', and 'Day' from the 'Date' column in training/testing data
extract_date_features(nan_handled_train_test_data)

# Extract 'Year', 'Month', and 'Day' from the 'Date' column in unseen data
extract_date_features(nan_handled_unseen_data)

# Define new CSV file names
new_csv_filename_train_test_date = 'E_Date_Extracted_Train_Test_Data.csv'
new_csv_filename_unseen_date = 'F_Date_Extracted_Unseen_Data.csv'

# Define the paths for saving the new CSV files
new_csv_path_train_test_date = os.path.join(drive_dataset_directory, new_csv_filename_train_test_date)
new_csv_path_unseen_date = os.path.join(drive_dataset_directory, new_csv_filename_unseen_date)

# Save the datasets with extracted date features as new CSV files
nan_handled_train_test_data.to_csv(new_csv_path_train_test_date, index=False)
nan_handled_unseen_data.to_csv(new_csv_path_unseen_date, index=False)

# Print a message to confirm that the date extraction and saving is complete
print("Date extraction and saving of datasets is complete.")

# Check for NaN values in the datasets with extracted date features
print("\nNaN check for training/testing data with extracted date features:")
print(nan_handled_train_test_data.isna().sum())

print("\nNaN check for unseen data with extracted date features:")
print(nan_handled_unseen_data.isna().sum())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
Converting 'Date' to datetime and extracting Year, Month, and Day...
After extracting Year, Month, and Day: Index(['Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week', 'DR1_2Weeks',
       'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg', 'DR1_Vert_Avg',
       'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks', 'DR2_Prev_Entry',
       'DR2_Prev_Entry-2', 'DR2_Mov_Avg', 'DR2_Vert_Avg', 'Draw3',
       'DR3_Prev_Week', 'DR3_2Weeks', 'DR3_Prev_Entry', 'DR3_Prev_Entry-2',
       'DR3_Mov_Avg', 'DR3_Vert_Avg', 'Draw4', 'DR4_Prev_Week', 'DR4_2Weeks',
       'DR4_Prev_Entry', 'DR4_Prev_Entry-2', 'DR4_Mov_Avg', 'DR4_Vert_Avg',
       'Year', 'Month', 'Day'],
      dtype='object')
Converting 'Date' to datetime and extracting Year, Month, and Day...
Failed to convert 'Date' with format: %d-%m-%y
After extracting Year, Month, and Day: Index(['Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week', 'DR1_2Weeks',
       'D

In [8]:
# Cell 2.3: Create shifted columns for previous day's data

# Function to create shifted columns for previous day's data
def create_shifted_columns(data):
    data['Prev_Morning'] = data['Draw1'].shift(1)
    data['Prev_Afternoon'] = data['Draw2'].shift(1)
    data['Prev_Evening'] = data['Draw3'].shift(1)
    data['Prev_Night'] = data['Draw4'].shift(1)
    data[['Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prev_Night']] = data[['Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prev_Night']].fillna(0).astype(int)

# Load the date extracted training/testing data
date_extracted_train_test_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/E_Date_Extracted_Train_Test_Data.csv')

# Apply the function to create shifted columns
create_shifted_columns(date_extracted_train_test_data)

# Save the updated training/testing data with shifted columns
date_extracted_train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/G_Shifted_Train_Test_Data.csv', index=False)

# Load the date extracted unseen data
date_extracted_unseen_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/F_Date_Extracted_Unseen_Data.csv')

# Apply the function to create shifted columns
create_shifted_columns(date_extracted_unseen_data)

# Save the updated unseen data with shifted columns
date_extracted_unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/H_Shifted_Unseen_Data.csv', index=False)

# Print the first few rows of both datasets for inspection
print("First few rows of date extracted training/testing data:")
print(date_extracted_train_test_data.head())

print("\nFirst few rows of date extracted unseen data:")
print(date_extracted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of date extracted training/testing data:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training   19.0            7.0        27.0            23.0   
1           2  Training   31.0           11.0         1.0             9.0   
2           3  Training   15.0           19.0        21.0            12.0   
3           4  Training   31.0           35.0        18.0            35.0   
4           5         0    0.0            0.0         0.0             0.0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  DR4_Prev_Entry-2  \
0              32.0         27.5          17.0   14.0  ...              14.0   
1              33.0         21.0           6.0    3.0  ...               3.0   
2              35.0         23.5          20.0    9.0  ...               9.0   
3              23.0         29.0          26.5   21.0  ...              2

In [12]:
# Cell 2.4: Handle NaN values for previous day's data

# Load the shifted training/testing data
shifted_train_test_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/G_Shifted_Train_Test_Data.csv')

# Manually set values for the first row of training/testing set
shifted_train_test_data.at[0, 'Prev_Morning'] = 13
shifted_train_test_data.at[0, 'Prev_Afternoon'] = 34
shifted_train_test_data.at[0, 'Prev_Evening'] = 32
shifted_train_test_data.at[0, 'Prev_Night'] = 23

# Save the updated training/testing data
shifted_train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/I_Handled_Shifted_Train_Test_Data.csv', index=False)

# Load the shifted unseen data
shifted_unseen_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/H_Shifted_Unseen_Data.csv')

# Manually set values for the first row of unseen set
shifted_unseen_data.at[0, 'Prev_Morning'] = 25
shifted_unseen_data.at[0, 'Prev_Afternoon'] = 9
shifted_unseen_data.at[0, 'Prev_Evening'] = 7
shifted_unseen_data.at[0, 'Prev_Night'] = 5

# Save the updated unseen data
shifted_unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/J_Handled_Shifted_Unseen_Data.csv', index=False)

# Print the first few rows of both datasets for inspection
print("First few rows of handled shifted training/testing data:")
print(shifted_train_test_data.head())

print("\nFirst few rows of handled shifted unseen data:")
print(shifted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of handled shifted training/testing data:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training   19.0            7.0        27.0            23.0   
1           2  Training   31.0           11.0         1.0             9.0   
2           3  Training   15.0           19.0        21.0            12.0   
3           4  Training   31.0           35.0        18.0            35.0   
4           5         0    0.0            0.0         0.0             0.0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  DR4_Prev_Entry-2  \
0              32.0         27.5          17.0   14.0  ...              14.0   
1              33.0         21.0           6.0    3.0  ...               3.0   
2              35.0         23.5          20.0    9.0  ...               9.0   
3              23.0         29.0          26.5   21.0  ...              

In [14]:
# Cell 3.1: # Initialize TARGET VARIABLE 'Prediction1' column

# Load the handled shifted training/testing data
handled_shifted_train_test_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/I_Handled_Shifted_Train_Test_Data.csv')

# Set 'Prediction1' column equal to 'Draw1' for training/testing data
handled_shifted_train_test_data['Prediction1'] = handled_shifted_train_test_data['Draw1']

# Save the updated training/testing data
handled_shifted_train_test_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/K_Handled_Prediction1_Train_Test_Data.csv', index=False)

# Load the handled shifted unseen data
handled_shifted_unseen_data = load_dataset('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/J_Handled_Shifted_Unseen_Data.csv')

# Initialize 'Prediction1' column with NaN values for the unseen data
handled_shifted_unseen_data['Prediction1'] = np.nan

# Save the updated unseen data
handled_shifted_unseen_data.to_csv('/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/L_Handled_Prediction1_Unseen_Data.csv', index=False)

# Print the first few rows of both datasets for inspection
print("First few rows of handled Prediction1 training/testing data:")
print(handled_shifted_train_test_data.head())

print("\nFirst few rows of handled Prediction1 unseen data:")
print(handled_shifted_unseen_data.head())


File found. Proceeding to load the dataset.
File found. Proceeding to load the dataset.
First few rows of handled Prediction1 training/testing data:
   Row Number Data_Type  Draw1  DR1_Prev_Week  DR1_2Weeks  DR1_Prev_Entry  \
0           1  Training   19.0            7.0        27.0            23.0   
1           2  Training   31.0           11.0         1.0             9.0   
2           3  Training   15.0           19.0        21.0            12.0   
3           4  Training   31.0           35.0        18.0            35.0   
4           5         0    0.0            0.0         0.0             0.0   

   DR1_Prev_Entry-2  DR1_Mov_Avg  DR1_Vert_Avg  Draw2  ...  DR4_Mov_Avg  \
0              32.0         27.5          17.0   14.0  ...         23.5   
1              33.0         21.0           6.0    3.0  ...         19.0   
2              35.0         23.5          20.0    9.0  ...         16.0   
3              23.0         29.0          26.5   21.0  ...         25.0   
4            

::::

In [69]:
# Cell 1.3: Preprocessing Training/Testing Data

import shutil

# Function to preprocess training/testing data
def preprocess_train_test_data(data):
    print("Initial data columns:", data.columns)

    # Remove rows where 'Draw1' is NaN
    data = data.dropna(subset=['Draw1'])

    # Convert 'Date' to datetime with the correct format
    if 'Date' in data.columns:
        print("Converting 'Date' to datetime...")
        data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%y')  # Specify the correct format here
        data['Year'] = data['Date'].dt.year
        data['Month'] = data['Date'].dt.month
        data['Day'] = data['Date'].dt.day
        print("After extracting Year, Month, Day:", data.columns)
        data.drop(columns=['Date'], inplace=True)
        print("After dropping 'Date':", data.columns)
    else:
        print("Date column not found in the given dataset.")

    # Initialize TARGET VARIABLE 'Prediction1' column with 'Draw1' values
    data['Prediction1'] = data['Draw1']

    # Create shifted columns for previous day's data
    data['Prev_Morning'] = data['Draw1'].shift(1)
    data['Prev_Afternoon'] = data['Draw2'].shift(1)
    data['Prev_Evening'] = data['Draw3'].shift(1)
    data['Prev_Night'] = data['Draw4'].shift(1)

    # Handle NaN values
    data['Prev_Morning'].fillna(13, inplace=True)
    data['Prev_Afternoon'].fillna(34, inplace=True)
    data['Prev_Evening'].fillna(32, inplace=True)
    data['Prev_Night'].fillna(23, inplace=True)

    # Select relevant columns, including 'Prediction1'
    selected_columns = ['Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week', 'DR1_2Weeks',
    'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg', 'DR1_Vert_Avg', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prev_Night', 'Prediction1', 'Year', 'Month', 'Day']
    data = data[selected_columns]

    return data

# Apply preprocessing to the training/testing dataset
train_test_data = preprocess_train_test_data(train_test_data)

# Save the preprocessed model training/testing data directly to your Google Drive folder
save_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/C_Preprocessed_Train_Test_Data.csv'
train_test_data.to_csv(save_path, index=False)

# Display the first few rows of the preprocessed data for verification
print("First few rows of preprocessed training/testing data:")
print(train_test_data.head())


Initial data columns: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Vert_Avg', 'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks',
       'DR2_Prev_Entry', 'DR2_Prev_Entry-2', 'DR2_Mov_Avg', 'DR2_Vert_Avg',
       'Draw3', 'DR3_Prev_Week', 'DR3_2Weeks', 'DR3_Prev_Entry',
       'DR3_Prev_Entry-2', 'DR3_Mov_Avg', 'DR3_Vert_Avg', 'Draw4',
       'DR4_Prev_Week', 'DR4_2Weeks', 'DR4_Prev_Entry', 'DR4_Prev_Entry-2',
       'DR4_Mov_Avg', 'DR4_Vert_Avg'],
      dtype='object')
Converting 'Date' to datetime...
After extracting Year, Month, Day: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Vert_Avg', 'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks',
       'DR2_Prev_Entry', 'DR2_Prev_Entry-2', 'DR2_Mov_Avg', 'DR2_Vert_Avg',
       'Draw3', 'DR3_Prev_Week', 'DR3_2Weeks', 'DR3_Prev_Entry',
       'DR3_Pr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%y')  # Specify the correct format here
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Year'] = data['Date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Month'] = data['Date'].dt.month
A value is trying to be set on a 

In [70]:
# Cell 1.4: Preprocessing Model Unseen Data

# Define the path to the CSV file for model unseen data
csv_filename_unseen = 'B_Initial_Unseen_Data.csv'
drive_csv_path_unseen = os.path.join(drive_dataset_directory, csv_filename_unseen)

# Load the model unseen data
model_unseen_data = load_dataset(drive_csv_path_unseen)

# Function to preprocess unseen data
def preprocess_unseen_data(data):
    print("Initial data columns:", data.columns)

    # Remove rows where 'Draw1' is NaN
    data = data.dropna(subset=['Draw1'])

    # Define possible date formats
    date_formats = ['%d-%m-%y', '%d/%m/%Y']

    # Try to convert 'Date' to datetime with different formats
    for date_format in date_formats:
        try:
            if 'Date' in data.columns:
                print("Converting 'Date' to datetime...")
                data['Date'] = pd.to_datetime(data['Date'], format=date_format)
                data['Year'] = data['Date'].dt.year
                data['Month'] = data['Date'].dt.month
                data['Day'] = data['Date'].dt.day
                print("After extracting Year, Month, Day:", data.columns)
                data.drop(columns=['Date'], inplace=True)
                print("After dropping 'Date':", data.columns)

            # Initialize TARGET VARIABLE 'Prediction1' column with NaN values
            data['Prediction1'] = np.nan

            # Create shifted columns for previous day's data
            data['Prev_Morning'] = data['Draw1'].shift(1)
            data['Prev_Afternoon'] = data['Draw2'].shift(1)
            data['Prev_Evening'] = data['Draw3'].shift(1)
            data['Prev_Night'] = data['Draw4'].shift(1)

            # Handle NaN values
            data['Prev_Morning'].fillna(25, inplace=True)
            data['Prev_Afternoon'].fillna(9, inplace=True)
            data['Prev_Evening'].fillna(7, inplace=True)
            data['Prev_Night'].fillna(5, inplace=True)

            # Select relevant columns, including 'Prediction1'
            selected_columns = ['Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week', 'DR1_2Weeks',
            'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg', 'DR1_Vert_Avg', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prev_Night', 'Prediction1', 'Year', 'Month', 'Day']
            data = data[selected_columns]

            # Save the preprocessed model unseen data directly to your Google Drive folder
            save_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/D_Preprocessed_Unseen_Data.csv'
            model_unseen_data.to_csv(save_path, index=False)

            # Display the first few rows of the preprocessed data for verification
            print("First few rows of preprocessed model unseen data:")
            print(model_unseen_data.head())


            break  # Break the loop if successful date conversion

        except ValueError:
            print("Failed to convert 'Date' with format:", date_format)

# Apply preprocessing to the model unseen data
preprocess_unseen_data(model_unseen_data)


File found. Proceeding to load the dataset.
Initial data columns: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Vert_Avg', 'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks',
       'DR2_Prev_Entry', 'DR2_Prev_Entry-2', 'DR2_Mov_Avg', 'DR2_Vert_Avg',
       'Draw3', 'DR3_Prev_Week', 'DR3_2Weeks', 'DR3_Prev_Entry',
       'DR3_Prev_Entry-2', 'DR3_Mov_Avg', 'DR3_Vert_Avg', 'Draw4',
       'DR4_Prev_Week', 'DR4_2Weeks', 'DR4_Prev_Entry', 'DR4_Prev_Entry-2',
       'DR4_Mov_Avg', 'DR4_Vert_Avg'],
      dtype='object')
Converting 'Date' to datetime...
Failed to convert 'Date' with format: %d-%m-%y
Converting 'Date' to datetime...
After extracting Year, Month, Day: Index(['Date', 'Row Number', 'Data_Type', 'Draw1', 'DR1_Prev_Week',
       'DR1_2Weeks', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg',
       'DR1_Vert_Avg', 'Draw2', 'DR2_Prev_Week', 'DR2_2Weeks',
       'DR2_Prev_Entry', 'DR2_P

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Date'] = pd.to_datetime(data['Date'], format=date_format)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Year'] = data['Date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Month'] = data['Date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.


In [71]:
# Cell 1.5: Print and investigate presence of NaNs in Model Unseen Data

# Check for NaN values in training/testing data
print("NaN check for training/testing data:")
print(train_test_data.isna().sum())

# Check for NaN values in model unseen data
print("\nNaN check for model unseen data:")
print(model_unseen_data.isna().sum())


NaN check for training/testing data:
Row Number          0
Data_Type           0
Draw1               0
DR1_Prev_Week       0
DR1_2Weeks          0
DR1_Prev_Entry      0
DR1_Prev_Entry-2    0
DR1_Mov_Avg         0
DR1_Vert_Avg        0
Prev_Morning        0
Prev_Afternoon      0
Prev_Evening        0
Prev_Night          0
Prediction1         0
Year                0
Month               0
Day                 0
dtype: int64

NaN check for model unseen data:
Date                0
Row Number          0
Data_Type           4
Draw1               4
DR1_Prev_Week       4
DR1_2Weeks          4
DR1_Prev_Entry      4
DR1_Prev_Entry-2    4
DR1_Mov_Avg         4
DR1_Vert_Avg        4
Draw2               4
DR2_Prev_Week       4
DR2_2Weeks          4
DR2_Prev_Entry      4
DR2_Prev_Entry-2    4
DR2_Mov_Avg         4
DR2_Vert_Avg        4
Draw3               4
DR3_Prev_Week       4
DR3_2Weeks          4
DR3_Prev_Entry      4
DR3_Prev_Entry-2    4
DR3_Mov_Avg         4
DR3_Vert_Avg        4
Draw4         

In [72]:
# Cell 2.1: Creating "LINES" feature for Training/Testing and Unseen Datasets

import pandas as pd
import os
import shutil

# Define the path to the CSV file for preprocessed training/testing data
preprocessed_train_test_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/C_Preprocessed_Train_Test_Data.csv'

# Define the path to the CSV file for preprocessed model unseen data
preprocessed_unseen_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/D_Preprocessed_Unseen_Data.csv'

# Load preprocessed training/testing data
preprocessed_train_test_data = pd.read_csv(preprocessed_train_test_data_path)

# Load preprocessed unseen data
preprocessed_unseen_data = pd.read_csv(preprocessed_unseen_data_path)

# List of columns to convert to integers
int_columns = ['Draw1', 'DR1_Prev_Week', 'DR1_Prev_Entry', 'DR1_Prev_Entry-2', 'DR1_Mov_Avg', 'DR1_Vert_Avg',
               'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prev_Night']

# Convert specified columns to integers for both datasets
preprocessed_train_test_data[int_columns] = preprocessed_train_test_data[int_columns].astype(int)
preprocessed_unseen_data[int_columns] = preprocessed_unseen_data[int_columns].astype(int)

# Function to assign "Lines" based on the sum of digits
def assign_lines(data, column_name):
    def get_lines(x):
        try:
            # Calculate the sum of digits
            sum_of_digits = sum(map(int, str(x)))
            # Ensure the sum is between 1 and 9
            while sum_of_digits > 9:
                sum_of_digits = sum(map(int, str(sum_of_digits)))
            return sum_of_digits
        except (ValueError, TypeError):
            return None  # Handle non-convertible values by returning None

    data[f'Lines_{column_name}'] = data[column_name].apply(get_lines)

# Handle NaN values in the 'Prediction1' column for unseen data by filling them with 0
preprocessed_unseen_data['Prediction1'].fillna(0, inplace=True)

# Assign "Lines" for specified columns in both datasets
columns_to_assign_lines = ['Draw1', 'DR1_Prev_Week', 'DR1_Prev_Entry']
for column in columns_to_assign_lines:
    assign_lines(preprocessed_train_test_data, column)
    assign_lines(preprocessed_unseen_data, column)

# Define file paths for the new CSVs with "Lines"
lines_train_test_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/E_Lines_Train_Test_Data.csv'
lines_unseen_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/F_Lines_Unseen_Data.csv'

# Save the datasets with "Lines" to new CSVs
preprocessed_train_test_data.to_csv(lines_train_test_data_path, index=False)
preprocessed_unseen_data.to_csv(lines_unseen_data_path, index=False)

# Display a sample of the processed data for verification
print("Sample of preprocessed training/testing data with 'Lines_Draw1':")
print(preprocessed_train_test_data[['Row Number', 'Draw1', 'Lines_Draw1']].head())

print("\nSample of preprocessed unseen data with 'Lines_Draw1':")
print(preprocessed_unseen_data[['Row Number', 'Draw1', 'Lines_Draw1']].head())


KeyError: "['Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prev_Night'] not in index"

In [None]:
# Cell 2.2: Loading Datasets for "Special Groups"

import pandas as pd

# Define file paths for the preprocessed training/testing and unseen data
preprocessed_train_test_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/C_Preprocessed_Train_Test_Data.csv'
preprocessed_unseen_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/D_Preprocessed_Unseen_Data.csv'

# Load preprocessed training/testing data
preprocessed_train_test_data = pd.read_csv(preprocessed_train_test_data_path)

# Load preprocessed unseen data
preprocessed_unseen_data = pd.read_csv(preprocessed_unseen_data_path)

# Display the first few rows of the loaded data to verify
preprocessed_train_test_data.head()
preprocessed_unseen_data.head()


In [None]:
# Cell 2.3: Creating "Special Groups" feature for Training/Testing and Unseen Datasets

# Define the mapping for "Special Groups"
special_groups_mapping = {
    2: 1, 15: 1, 16: 1, 24: 1, 31: 1,  # "Ladies"
    4: 2, 5: 2, 12: 2, 29: 2, 34: 2,  # "Men"
    11: 3, 17: 3, 26: 3,  # "Birds"
    7: 4, 9: 4, 19: 4, 20: 4, 22: 4, 30: 4, 36: 4,  # "Domestic Animals"
    8: 5, 10: 5, 13: 5, 25: 5,  # "Wild Animals"
    18: 6, 28: 6, 32: 6,  # "Ocean"
    1: 7, 27: 7, 33: 7, 35: 7,  # "Snakes & Insects"
    3: 8, 6: 8, 14: 8, 21: 8, 23: 8  # "Home"
}

# Function to assign "Special Groups" based on the mapping
def assign_special_groups(data, column_name, special_groups_mapping):
    data[f'Special_Groups_{column_name}'] = data[column_name].map(special_groups_mapping).fillna(0).astype(int)

# Assign "Special Groups" for specified columns in both datasets
columns_to_assign_special_groups = ['Draw1', 'DR1_Prev_Week', 'DR1_Prev_Entry']
for column in columns_to_assign_special_groups:
    assign_special_groups(train_test_data, column, special_groups_mapping)
    assign_special_groups(unseen_data, column, special_groups_mapping)

# Save the datasets with "Special Groups" to new CSVs
special_groups_train_test_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/G_Special_Groups_Train_Test_Data.csv'
special_groups_unseen_data_path = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/Initial_Data_Prep/H_Special_Groups_Unseen_Data.csv'

train_test_data.to_csv(special_groups_train_test_data_path, index=False)
unseen_data.to_csv(special_groups_unseen_data_path, index=False)

# Load the datasets with "Special Groups"
special_groups_train_test_data = pd.read_csv(special_groups_train_test_data_path)
special_groups_unseen_data = pd.read_csv(special_groups_unseen_data_path)

# Display the first few rows of the datasets to investigate the issue
print("First few rows of special_groups_train_test_data:")
print(special_groups_train_test_data.head())

print("\nFirst few rows of special_groups_unseen_data:")
print(special_groups_unseen_data.head())

In [None]:
# Cell 1.3: Data Loading from Google Drive and Preprocessing Unseen Dataset

import pandas as pd
import logging
import os
from google.colab import drive

# Set up logging
logger = logging.getLogger(__name__)

# Mount Google Drive
drive.mount('/content/drive')

# Define the directory for datasets in Google Drive
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'

# Define the path to the CSV file for unseen data
csv_filename_unseen = 'Model_Unseen_Data.csv'
drive_csv_path_unseen = os.path.join(drive_dataset_directory, csv_filename_unseen)

# Check and load the dataset
def load_dataset(file_path):
    if os.path.isfile(file_path):
        print("File found. Proceeding to load the dataset.")
        return pd.read_csv(file_path)
    else:
        print("File not found. Check the file path or the Google Drive mount.")
        return None

unseen_data = load_dataset(drive_csv_path_unseen)

# Function to preprocess unseen data
def preprocess_unseen_data(data):
    print("Initial data columns:", data.columns)

    # Convert 'Date' to datetime and extract 'Year', 'Month', and 'Day'
    if 'Date' in data.columns:
        print("Converting 'Date' to datetime...")
        data['Date'] = pd.to_datetime(data['Date'])
        data['Year'] = data['Date'].dt.year
        data['Month'] = data['Date'].dt.month
        data['Day'] = data['Date'].dt.day
        print("After extracting Year, Month, Day:", data.columns)
        data.drop(columns=['Date'], inplace=True)
        print("After dropping 'Date':", data.columns)
    else:
        print("Date column not found in the given dataset.")

    # Initialize 'Prediction1' column with NaNs for unseen data
    data['Prediction1'] = np.nan

    # Create shifted columns for previous day's data
    data['Prev_Morning'] = data['Morning'].shift(1)
    data['Prev_Afternoon'] = data['Afternoon'].shift(1)
    data['Prev_Evening'] = data['Evening'].shift(1)

    # Calculate moving averages excluding current row
    initial_window_size = 3  # Increased by 1 to exclude the current row
    columns_to_average = ['Morning', 'Afternoon', 'Evening', 'Night']
    target_columns = ['Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve', 'Mov_Avg_Nig']

    for col, target_col in zip(columns_to_average, target_columns):
    # Roll over an additional row and then shift to exclude the current row
        data[target_col] = data[col].rolling(window=initial_window_size, min_periods=1).mean().shift(1)
    # Manually set the value for the first row
    unseen_data.at[0, 'Mov_Avg_Mor'] = 6
    unseen_data.at[1, 'Mov_Avg_Mor'] = 22
    unseen_data.at[2, 'Mov_Avg_Mor'] = 17.5
    unseen_data.at[3, 'Mov_Avg_Mor'] = 2
    unseen_data.at[4, 'Mov_Avg_Mor'] = 17

    # Calculate vertical averages excluding current row
    vertical_target_columns = ['Vert_Avg_Mor', 'Vert_Avg_Aft', 'Vert_Avg_Eve', 'Vert_Avg_Nig']
    for col, target_col in zip(columns_to_average, vertical_target_columns):
        data[target_col] = data[col].rolling(window=3, min_periods=1).mean().shift(1)

    # Handle NaN values
    data['Prev_Morning'].fillna(25, inplace=True)
    data['Prev_Afternoon'].fillna(9, inplace=True)
    data['Prev_Evening'].fillna(7, inplace=True)

    # Select relevant columns, including 'Prediction1'
    selected_columns = ['Row Number', 'Data_Type', 'Year', 'Month', 'Day', 'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor', 'Vert_Avg_Mor', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prediction1']
    data[selected_columns]

    return data

# Apply preprocessing to the unseen dataset
unseen_data = preprocess_unseen_data(unseen_data)

# Display the preprocessed unseen data
print("First few rows of preprocessed unseen data:")
print(unseen_data.head())

In [None]:
# Cell 1.4: # Save the preprocessed training/testing dataset
preprocessed_train_test_path = os.path.join(drive_dataset_directory, '1_preprocessed_train_test_data.csv')
train_test_data.to_csv(preprocessed_train_test_path, index=False)
print("Preprocessed training/testing data saved to Google Drive.")

# Display the first few rows of the preprocessed training/testing data
print("First few rows of preprocessed training/testing data:")
print(train_test_data.head())

# Check for NaN values in the entire dataset
nan_counts = train_test_data.isnull().sum()
print("Count of NaN values in training/testing data:")
print(nan_counts)

# Save the preprocessed unseen dataset
preprocessed_unseen_path = os.path.join(drive_dataset_directory, '2_preprocessed_unseen_data.csv')
unseen_data.to_csv(preprocessed_unseen_path, index=False)
print("Preprocessed unseen data saved to Google Drive.")

# Display the first few rows of the preprocessed unseen data
print("First few rows of preprocessed unseen data:")
print(unseen_data.head())

# Check for NaN values in the entire dataset
nan_counts = unseen_data.isnull().sum()
print("Count of NaN values in unseen data:")
print(nan_counts)


In [None]:
import pandas as pd
import logging
import os

# Set up logging
logger = logging.getLogger(__name__)

# Define the directory for datasets in Google Drive
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'

# Define the path to the preprocessed unseen data
preprocessed_unseen_path = os.path.join(drive_dataset_directory, '2_preprocessed_unseen_data.csv')

# Load the preprocessed unseen data
unseen_data = pd.read_csv(preprocessed_unseen_path)
logger.info("Preprocessed unseen data loaded successfully.")

# Define the provided data for imputation
provided_data = [
    {
        'Row Number': 1410,
        'Morning': 13,
        'Prev_Week': 27,
        '2WeeksM': 25,
        'Prev_Entry': 5,
        'Prev_Entry-2': 7,
        'Mov_Avg_Mor': 6,
        'Vert_Avg_Mor': 26,
        'Afternoon': 20,
        'Prev_Week': 7,
        '2WeeksA': 34,
        'Prev_Entry': 13,
        'Prev_Entry-2': 5,
        'Mov_Avg_Aft': 9,
        'Vert_Avg_Aft': 20.5,
        'Evening': 26,
        'Prev_Week': 26,
        '2WeeksE': 24,
        'Prev_Entry': 20,
        'Prev_Entry-2': 13,
        'Mov_Avg_Eve': 16.5,
        'Vert_Avg_Eve': 25,
        'Night': 18,
        'Prev_Week': 26,
        '2WeeksN': 3,
        'Prev_Entry': 26,
        'Prev_Entry-2': 20,
        'Mov_Avg_Nig': 23,
        'Vert_Avg_Nig': 14.5
    },
    {
        'Row Number': 1411,
        'Morning': 21,
        'Prev_Week': 33,
        '2WeeksM': 12,
        'Prev_Entry': 18,
        'Prev_Entry-2': 26,
        'Mov_Avg_Mor': 22,
        'Vert_Avg_Mor': 22.5,
        'Afternoon': 31,
        'Prev_Week': 18,
        '2WeeksA': 36,
        'Prev_Entry': 21,
        'Prev_Entry-2': 18,
        'Mov_Avg_Aft': 19.5,
        'Vert_Avg_Aft': 27,
        'Evening': 7,
        'Prev_Week': 9,
        '2WeeksE': 3,
        'Prev_Entry': 31,
        'Prev_Entry-2': 21,
        'Mov_Avg_Eve': 26,
        'Vert_Avg_Eve': 6,
        'Night': 28,
        'Prev_Week': 8,
        '2WeeksN': 5,
        'Prev_Entry': 7,
        'Prev_Entry-2': 31,
        'Mov_Avg_Nig': 19,
        'Vert_Avg_Nig': 6.5
    },
    {
        'Row Number': 1412,
        'Morning': 15,
        'Prev_Week': 27,
        '2WeeksM': 3,
        'Prev_Entry': 28,
        'Prev_Entry-2': 7,
        'Mov_Avg_Mor': 17.5,
        'Vert_Avg_Mor': 15,
        'Afternoon': 5,
        'Prev_Week': 22,
        '2WeeksA': 10,
        'Prev_Entry': 15,
        'Prev_Entry-2': 28,
        'Mov_Avg_Aft': 21.5,
        'Vert_Avg_Aft': 16,
        'Evening': 2,
        'Prev_Week': 32,
        '2WeeksE': 4,
        'Prev_Entry': 5,
        'Prev_Entry-2': 15,
        'Mov_Avg_Eve': 10,
        'Vert_Avg_Eve': 18,
        'Night': 2,
        'Prev_Week': 30,
        '2WeeksN': 6,
        'Prev_Entry': 2,
        'Prev_Entry-2': 5,
        'Mov_Avg_Nig': 3.5,
        'Vert_Avg_Nig': 18
    },
    {
        'Row Number': 1413,
        'Morning': 13,
        'Prev_Week': 20,
        '2WeeksM': 11,
        'Prev_Entry': 2,
        'Prev_Entry-2': 2,
        'Mov_Avg_Mor': 2,
        'Vert_Avg_Mor': 15.5,
        'Row Number': 1413,
        'Afternoon': 28,
        'Prev_Week': 29,
        '2WeeksA': 19,
        'Prev_Entry': 13,
        'Prev_Entry-2': 2,
        'Mov_Avg_Aft': 7.5,
        'Vert_Avg_Aft': 24,
        'Evening': 22,
        'Prev_Week': 23,
        '2WeeksE': 29,
        'Prev_Entry': 28,
        'Prev_Entry-2': 13,
        'Mov_Avg_Eve': 20.5,
        'Vert_Avg_Eve': 26,
        'Night': 12,
        'Prev_Week': 2,
        '2WeeksN': 7,
        'Prev_Entry': 22,
        'Prev_Entry-2': 28,
        'Mov_Avg_Nig': 25,
        'Vert_Avg_Nig': 4.5
    },
    {
        'Row Number': 1414,
        'Morning': 12,
        'Prev_Week': 29,
        '2WeeksM': 14,
        'Prev_Entry': 12,
        'Prev_Entry-2': 22,
        'Mov_Avg_Mor': 17,
        'Vert_Avg_Mor': 21.5,
        'Row Number': 1414,
        'Afternoon': 35,
        'Prev_Week': 7,
        '2WeeksA': 31,
        'Prev_Entry': 12,
        'Prev_Entry-2': 12,
        'Mov_Avg_Aft': 12,
        'Vert_Avg_Aft': 19,
        'Evening': 31,
        'Prev_Week': 5,
        '2WeeksE': 32,
        'Prev_Entry': 35,
        'Prev_Entry-2': 12,
        'Mov_Avg_Eve': 23.5,
        'Vert_Avg_Eve': 18.5,
        'Night': 11,
        'Prev_Week': 3,
        '2WeeksN': 18,
        'Prev_Entry': 31,
        'Prev_Entry-2': 35,
        'Mov_Avg_Nig': 33,
        'Vert_Avg_Nig': 10.5
    },
    {
        'Row Number': 1415,
        'Morning': 14,
        'Prev_Week': 25,
        '2WeeksM': 5,
        'Prev_Entry': 11,
        'Prev_Entry-2': 31,
        'Mov_Avg_Mor': 21,
        'Vert_Avg_Mor': 15,
        'Row Number': 1415,
        'Afternoon': 2,
        'Prev_Week': 9,
        '2WeeksA': 14,
        'Prev_Entry': 14,
        'Prev_Entry-2': 11,
        'Mov_Avg_Aft': 12.5,
        'Vert_Avg_Aft': 11.5,
        'Evening': 23,
        'Prev_Week': 7,
        '2WeeksE': 30,
        'Prev_Entry': 2,
        'Prev_Entry-2': 14,
        'Mov_Avg_Eve': 8,
        'Vert_Avg_Eve': 18.5,
        'Row Number': 1415,
        'Night': 25,
        'Prev_Week': 5,
        '2WeeksN': 22,
        'Prev_Entry': 23,
        'Prev_Entry-2': 2,
        'Mov_Avg_Nig': 12.5,
        'Vert_Avg_Nig': 13.5
    }
]

# Iterate through the provided data to update the corresponding columns in the DataFrame
for data in provided_data:
    row_number = data['Row Number']
    for column in data.keys():
        if column != 'Row Number':
            unseen_data.loc[unseen_data['Row Number'] == row_number, column] = data[column]

# Display the dataset after NaN handling
print("First few rows of unseen data after NaN handling:")
print(unseen_data.head())

# Check for NaN values in the entire dataset
nan_counts = unseen_data.isnull().sum()
print("Count of NaN values in unseen data:")
print(nan_counts)

# Save the updated unseen data
updated_unseen_path = os.path.join(drive_dataset_directory, '2_preprocessed_unseen_data.csv')
unseen_data.to_csv(updated_unseen_path, index=False)
logger.info("Updated unseen data saved successfully.")
