<a href="https://colab.research.google.com/github/alvinfranklyndavis/Project2023_v3/blob/main/Data_Prep_GPT_4_Bard_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CELL 1.1: Package Installation and Library Import

# Upgrade pip and install required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas numpy

# Import required libraries
import pandas as pd
import numpy as np
import logging
import os

# Set up logging.
# basicConfig() silently does nothing when the root logger already has
# handlers, which hosted notebook kernels commonly pre-install; force=True
# replaces any existing root handlers so the INFO level actually takes effect.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)


Collecting pip
  Downloading pip-23.3.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.2
Collecting pandas
  Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting numpy
  Downloading numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.4-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
# Cell 1.2: Data Loading from Google Drive and preprocessing Training / Testing dataset

import pandas as pd
import logging
import os
from google.colab import drive

# Set up logging
# Module-level logger; nothing in this cell emits through it (status messages
# below use print()).
logger = logging.getLogger(__name__)

# Mount Google Drive
# Exposes Drive contents under /content/drive. Re-running is harmless: when
# the drive is already mounted, mount() just prints a notice.
drive.mount('/content/drive')

# Define the directory for datasets in Google Drive
# This directory is also where the save cell writes the preprocessed CSVs.
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'

# Define the path to the comprehensive CSV file for training and testing
csv_filename_train_test = 'Model_Train_Test_Data.csv'
drive_csv_path_train_test = os.path.join(drive_dataset_directory, csv_filename_train_test)

# Check and load the datasets
def load_dataset(file_path):
    """Read the CSV stored at ``file_path`` into a DataFrame.

    A status message is printed in either case.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` when ``file_path`` is an
        existing regular file; otherwise ``None``.
    """
    # Guard clause: anything that is not an existing file yields None.
    if not os.path.isfile(file_path):
        print("File not found. Check the file path or the Google Drive mount.")
        return None

    print("File found. Proceeding to load the dataset.")
    return pd.read_csv(file_path)

train_test_data = load_dataset(drive_csv_path_train_test)
# load_dataset() reports a missing file by returning None. Stop here with a
# clear error instead of letting preprocessing fail later on `None.columns`.
if train_test_data is None:
    raise FileNotFoundError(
        f"Training/testing dataset not found at: {drive_csv_path_train_test}"
    )

# Function to preprocess training/testing data
def preprocess_train_test_data(data):
    """Derive the model features for the training/testing dataset.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Processing steps:
      1. If a 'Date' column is present, parse it with ``pd.to_datetime``, add
         'Year', 'Month' and 'Day' columns, then drop 'Date'.
      2. Copy 'Morning' into 'Prediction1'.
      3. Add 'Prev_Morning', 'Prev_Afternoon' and 'Prev_Evening' holding the
         previous row's value; missing values (notably the first row) become 18.
      4. Set 'Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve' and 'Mov_Avg_Nig' to
         the mean of up to three preceding rows (current row excluded),
         overwriting any columns of the same name already in the input.
      5. Add 'Vert_Avg_Mor', computed from 'Morning' in the same way.

    Args:
        data: DataFrame containing at least 'Morning', 'Afternoon', 'Evening',
            'Night', 'Row Number', 'Data_Type', 'Prev_Week' and 'Prev_Entry',
            plus either 'Date' or ready-made 'Year', 'Month' and 'Day'.

    Returns:
        A new DataFrame with every input column except 'Date' plus the
        derived columns listed above.

    Raises:
        KeyError: If a required column is missing.
    """
    # Work on a copy so the caller's DataFrame is never mutated.
    data = data.copy()
    print("Initial data columns:", data.columns)

    # Convert 'Date' to datetime and extract 'Year', 'Month', and 'Day'
    if 'Date' in data.columns:
        print("Converting 'Date' to datetime...")
        data['Date'] = pd.to_datetime(data['Date'])
        data['Year'] = data['Date'].dt.year
        data['Month'] = data['Date'].dt.month
        data['Day'] = data['Date'].dt.day
        print("After extracting Year, Month, Day:", data.columns)
        data = data.drop(columns=['Date'])
        print("After dropping 'Date':", data.columns)
    else:
        print("Date column not found in the given dataset.")

    # Initialize 'Prediction1' column with 'Morning' values
    data['Prediction1'] = data['Morning']

    # Previous-row values. The fill is done by assignment: calling
    # fillna(inplace=True) on `data[col]` is chained assignment and has no
    # effect on `data` under pandas Copy-on-Write.
    prev_fill_value = 18
    for source_col, prev_col in (
        ('Morning', 'Prev_Morning'),
        ('Afternoon', 'Prev_Afternoon'),
        ('Evening', 'Prev_Evening'),
    ):
        data[prev_col] = data[source_col].shift(1).fillna(prev_fill_value)

    # Moving averages over the three preceding rows. rolling() includes the
    # current row, so shift(1) is what moves the window back to exclude it.
    initial_window_size = 3
    columns_to_average = ['Morning', 'Afternoon', 'Evening', 'Night']
    target_columns = ['Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve', 'Mov_Avg_Nig']

    for col, target_col in zip(columns_to_average, target_columns):
        data[target_col] = (
            data[col].rolling(window=initial_window_size, min_periods=1).mean().shift(1)
        )

    # Vertical average for 'Morning' excluding the current row (same window
    # and shift as the moving averages above).
    data['Vert_Avg_Mor'] = data['Morning'].rolling(window=3, min_periods=1).mean().shift(1)

    # Columns the model is expected to consume, including 'Prediction1'.
    # NOTE(review): the original code evaluated `data[selected_columns]` and
    # discarded the result, so the full frame has always been returned (and
    # saved). That is preserved here; only the implicit "all columns exist"
    # check is kept, made explicit. Confirm whether the selection should
    # actually be applied.
    selected_columns = ['Row Number', 'Data_Type', 'Year', 'Month', 'Day', 'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor', 'Vert_Avg_Mor', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prediction1']
    missing_columns = [col for col in selected_columns if col not in data.columns]
    if missing_columns:
        raise KeyError(f"Missing expected columns: {missing_columns}")

    return data

# Run the training/testing frame through its preprocessing function
train_test_data = preprocess_train_test_data(train_test_data)

# Preview: a heading line followed by the first rows of the result
print(
    "First few rows of preprocessed training/testing data:",
    train_test_data.head(),
    sep="\n",
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File found. Proceeding to load the dataset.
Initial data columns: Index(['Date', 'Row Number', 'Data_Type', 'Morning', 'Prev_Week',
       'Rep_Prev_Week', 'Prev_Entry', 'Rep_Prev_Entry', 'Mov_Avg_Mor',
       'Afternoon', 'Prev_Week.1', 'Rep_Prev_Week.1', 'Prev_Entry.1',
       'Rep_Prev_Entry.1', 'Mov_Avg_Aft', 'Evening', 'Prev_Week.2',
       'Rep_Prev_Week.2', 'Prev_Entry.2', 'Rep_Prev_Entry.2', 'Mov_Avg_Eve',
       'Night', 'Prev_Week.3', 'Rep_Prev_Week.3', 'Prev_Entry.3',
       'Rep_Prev_Entry.3', 'Mov_Avg_Nig'],
      dtype='object')
Converting 'Date' to datetime...
After extracting Year, Month, Day: Index(['Date', 'Row Number', 'Data_Type', 'Morning', 'Prev_Week',
       'Rep_Prev_Week', 'Prev_Entry', 'Rep_Prev_Entry', 'Mov_Avg_Mor',
       'Afternoon', 'Prev_Week.1', 'Rep_Prev_Week.1', 'Prev_Entry.1',
       'Rep_Prev_Entry.1', 'Mov_Avg_Aft', 'Even

In [11]:
# Cell 1.3: Data Loading from Google Drive and Preprocessing Unseen Dataset

import pandas as pd
import logging
import os
from google.colab import drive

# Set up logging
# Module-level logger; nothing in this cell emits through it (status messages
# below use print()).
logger = logging.getLogger(__name__)

# Mount Google Drive
# Exposes Drive contents under /content/drive. Re-running is harmless: when
# the drive is already mounted, mount() just prints a notice.
drive.mount('/content/drive')

# Define the directory for datasets in Google Drive
# Same directory as the training/testing cell; the save cell writes here too.
drive_dataset_directory = '/content/drive/My Drive/Predictive_Modeling_Four_Draws/Morning_Draw_Model_Docs/'

# Define the path to the CSV file for unseen data
csv_filename_unseen = 'Model_Unseen_Data.csv'
drive_csv_path_unseen = os.path.join(drive_dataset_directory, csv_filename_unseen)

# Check and load the dataset
def load_dataset(file_path):
    """Load a CSV file into a DataFrame, reporting the outcome on stdout.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The result of ``pd.read_csv(file_path)`` if ``file_path`` points at an
        existing regular file, else ``None``.
    """
    dataset = None
    file_exists = os.path.isfile(file_path)

    if file_exists:
        print("File found. Proceeding to load the dataset.")
        dataset = pd.read_csv(file_path)
    else:
        print("File not found. Check the file path or the Google Drive mount.")

    return dataset

unseen_data = load_dataset(drive_csv_path_unseen)
# load_dataset() reports a missing file by returning None. Stop here with a
# clear error instead of letting preprocessing fail later on `None.columns`.
if unseen_data is None:
    raise FileNotFoundError(
        f"Unseen dataset not found at: {drive_csv_path_unseen}"
    )

# Function to preprocess unseen data
def preprocess_unseen_data(data):
    """Derive the model features for the unseen dataset.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Processing steps:
      1. If a 'Date' column is present, parse it with ``pd.to_datetime``, add
         'Year', 'Month' and 'Day' columns, then drop 'Date'.
      2. Set 'Prediction1' to NaN for every row.
      3. Add 'Prev_Morning', 'Prev_Afternoon' and 'Prev_Evening' holding the
         previous row's value; missing values (notably the first row) become
         25, 9 and 7 respectively.
      4. Set 'Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve' and 'Mov_Avg_Nig' to
         the mean of up to three preceding rows (current row excluded),
         overwriting any columns of the same name already in the input.
      5. Overwrite 'Mov_Avg_Mor' for index labels 0-4 with hand-entered values.
      6. Add 'Vert_Avg_Mor', computed from 'Morning' like the moving averages
         (it does not receive the manual overrides).

    Args:
        data: DataFrame containing at least 'Morning', 'Afternoon', 'Evening',
            'Night', 'Row Number', 'Data_Type', 'Prev_Week' and 'Prev_Entry',
            plus either 'Date' or ready-made 'Year', 'Month' and 'Day'.

    Returns:
        A new DataFrame with every input column except 'Date' plus the
        derived columns listed above.

    Raises:
        KeyError: If a required column is missing.
    """
    # Work on a copy so the caller's DataFrame is never mutated.
    data = data.copy()
    print("Initial data columns:", data.columns)

    # Convert 'Date' to datetime and extract 'Year', 'Month', and 'Day'
    if 'Date' in data.columns:
        print("Converting 'Date' to datetime...")
        data['Date'] = pd.to_datetime(data['Date'])
        data['Year'] = data['Date'].dt.year
        data['Month'] = data['Date'].dt.month
        data['Day'] = data['Date'].dt.day
        print("After extracting Year, Month, Day:", data.columns)
        data = data.drop(columns=['Date'])
        print("After dropping 'Date':", data.columns)
    else:
        print("Date column not found in the given dataset.")

    # Initialize 'Prediction1' column with NaNs for unseen data
    data['Prediction1'] = np.nan

    # Create shifted columns for previous row's data (filled further below)
    data['Prev_Morning'] = data['Morning'].shift(1)
    data['Prev_Afternoon'] = data['Afternoon'].shift(1)
    data['Prev_Evening'] = data['Evening'].shift(1)

    # Moving averages over the three preceding rows. rolling() includes the
    # current row, so shift(1) is what moves the window back to exclude it.
    initial_window_size = 3
    columns_to_average = ['Morning', 'Afternoon', 'Evening', 'Night']
    target_columns = ['Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve', 'Mov_Avg_Nig']

    for col, target_col in zip(columns_to_average, target_columns):
        data[target_col] = (
            data[col].rolling(window=initial_window_size, min_periods=1).mean().shift(1)
        )

    # Hand-entered 'Mov_Avg_Mor' values for index labels 0-4.
    # NOTE(review): presumably these stand in for averages that depend on
    # draws preceding this file - confirm the source of the numbers.
    # They are written to `data` (the original wrote to the global
    # `unseen_data`, which only worked because it was the same object), and
    # only for labels that exist, because `.at` on a missing label would
    # silently append a new row.
    manual_mov_avg_mor = {0: 6, 1: 22, 2: 17.5, 3: 2, 4: 17}
    for row_label, value in manual_mov_avg_mor.items():
        if row_label in data.index:
            data.at[row_label, 'Mov_Avg_Mor'] = value

    # Vertical average for 'Morning' excluding the current row
    data['Vert_Avg_Mor'] = data['Morning'].rolling(window=3, min_periods=1).mean().shift(1)

    # Handle NaN values. Assign the result back: calling fillna(inplace=True)
    # on `data[col]` is chained assignment and has no effect on `data` under
    # pandas Copy-on-Write.
    data['Prev_Morning'] = data['Prev_Morning'].fillna(25)
    data['Prev_Afternoon'] = data['Prev_Afternoon'].fillna(9)
    data['Prev_Evening'] = data['Prev_Evening'].fillna(7)

    # Columns the model is expected to consume, including 'Prediction1'.
    # NOTE(review): the original code evaluated `data[selected_columns]` and
    # discarded the result, so the full frame has always been returned (and
    # saved). That is preserved here; only the implicit "all columns exist"
    # check is kept, made explicit. Confirm whether the selection should
    # actually be applied.
    selected_columns = ['Row Number', 'Data_Type', 'Year', 'Month', 'Day', 'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor', 'Vert_Avg_Mor', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening', 'Prediction1']
    missing_columns = [col for col in selected_columns if col not in data.columns]
    if missing_columns:
        raise KeyError(f"Missing expected columns: {missing_columns}")

    return data

# Run the unseen frame through its preprocessing function
unseen_data = preprocess_unseen_data(unseen_data)

# Preview: a heading line followed by the first rows of the result
print(
    "First few rows of preprocessed unseen data:",
    unseen_data.head(),
    sep="\n",
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File found. Proceeding to load the dataset.
Initial data columns: Index(['Date', 'Row Number', 'Data_Type', 'Morning', 'Prev_Week',
       'Rep_Prev_Week', 'Prev_Entry', '2WeeksM', 'Mov_Avg_Mor', 'Afternoon',
       'Prev_Week.1', 'Rep_Prev_Week.1', 'Prev_Entry.1', '2WeeksA',
       'Mov_Avg_Aft', 'Evening', 'Prev_Week.2', 'Rep_Prev_Week.2',
       'Prev_Entry.2', '2WeeksE', 'Mov_Avg_Eve', 'Night', 'Prev_Week.3',
       'Rep_Prev_Week.3', 'Prev_Entry.3', '2WeeksN', 'Mov_Avg_Nig'],
      dtype='object')
Converting 'Date' to datetime...
After extracting Year, Month, Day: Index(['Date', 'Row Number', 'Data_Type', 'Morning', 'Prev_Week',
       'Rep_Prev_Week', 'Prev_Entry', '2WeeksM', 'Mov_Avg_Mor', 'Afternoon',
       'Prev_Week.1', 'Rep_Prev_Week.1', 'Prev_Entry.1', '2WeeksA',
       'Mov_Avg_Aft', 'Evening', 'Prev_Week.2', 'Rep_Prev_Week.2',
       'Prev_Entr

In [4]:
def _save_preprocessed(frame, filename, confirmation):
    """Write ``frame`` as CSV (no index) into the Drive dataset directory.

    Prints ``confirmation`` after the write and returns the destination path.
    """
    destination = os.path.join(drive_dataset_directory, filename)
    frame.to_csv(destination, index=False)
    print(confirmation)
    return destination


# Save the preprocessed training/testing dataset
preprocessed_train_test_path = _save_preprocessed(
    train_test_data,
    '15_preprocessed_train_test_data.csv',
    "Preprocessed training/testing data saved to Google Drive.",
)

# Save the preprocessed unseen dataset
preprocessed_unseen_path = _save_preprocessed(
    unseen_data,
    '16_preprocessed_unseen_data.csv',
    "Preprocessed unseen data saved to Google Drive.",
)



Preprocessed training/testing data saved to Google Drive.
Preprocessed unseen data saved to Google Drive.
