<a href="https://colab.research.google.com/github/alvinfranklyndavis/Project2023_v3/blob/main/Project2023_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import gdown
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler  # For feature scaling if needed
from sklearn.pipeline import make_pipeline  # For creating a pipeline if needed

# Source (Google Drive direct-download link) and local destination of the CSV.
csv_url = 'https://drive.google.com/uc?id=1o87z4evvCLwBtqX8ocZl3I2nIDYS8mtH'
csv_path = 'Training_Testing_Hybrid_Mod.csv'

# Fetch the file to csv_path; quiet=False keeps gdown's progress output visible.
gdown.download(url=csv_url, output=csv_path, quiet=False)

# Read the downloaded CSV into the DataFrame used by the cells below.
df = pd.read_csv(csv_path)

# Quick sanity check: the first rows, then the inferred column dtypes.
for preview in (df.head(), df.dtypes):
    print(preview)

Downloading...
From: https://drive.google.com/uc?id=1o87z4evvCLwBtqX8ocZl3I2nIDYS8mtH
To: /content/Training_Testing_Hybrid_Mod.csv
100%|██████████| 71.9k/71.9k [00:00<00:00, 66.4MB/s]

       Date  Day of the Week  Morning  Prev_Week  Rep_Prev_Week  \
0  8/1/2018                3       19          7              0   
1  8/2/2018                4       31         11              0   
2  8/3/2018                5       15         19              0   
3  8/4/2018                6       31         35              0   
4  8/6/2018                1       31         18              0   

   Rep_Prev_Entry  Afternoon  Prev_Week.1  Rep_Prev_Week.1  Rep_Prev_Entry.1  \
0               0         14           13                0                 0   
1               0          3           21                0                 0   
2               0          9           19                0                 0   
3               0         21           20                0                 0   
4               0         31           30                0                 1   

   Evening  Prev_Week.2  Rep_Prev_Week.2  Rep_Prev_Entry.2  Night  \
0       33           28                0       




In [10]:
# Derive calendar features from 'Date' and add the (initially empty) target columns.
#
# The cell is written to be safely re-runnable: the original version dropped
# 'Date' in place, so executing it a second time raised KeyError: 'Date'.
if 'Date' in df.columns:
    # Convert 'Date' column to datetime type
    parsed_dates = pd.to_datetime(df['Date'])

    # Extract relevant date features
    df['Year'] = parsed_dates.dt.year
    df['Month'] = parsed_dates.dt.month
    df['Day'] = parsed_dates.dt.day

    # Drop the original 'Date' column
    df = df.drop(columns=['Date'])

# Create Target Variable columns (filled with NaN until they are populated).
# Existing target columns are left alone so a re-run does not wipe values
# that a later cell has already filled in.
for target_col in ('Prediction1', 'Prediction2', 'Prediction3', 'Prediction4'):
    if target_col not in df.columns:
        df[target_col] = np.nan

# Display the updated DataFrame with target variables
print(df.head())
print(df.dtypes)

   Day of the Week  Morning  Prev_Week  Rep_Prev_Week  Rep_Prev_Entry  \
0                3       19          7              0               0   
1                4       31         11              0               0   
2                5       15         19              0               0   
3                6       31         35              0               0   
4                1       31         18              0               0   

   Afternoon  Prev_Week.1  Rep_Prev_Week.1  Rep_Prev_Entry.1  Evening  ...  \
0         14           13                0                 0       33  ...   
1          3           21                0                 0       35  ...   
2          9           19                0                 0       23  ...   
3         21           20                0                 0       29  ...   
4         31           30                0                 1       15  ...   

   Prev_Week.3  Rep_Prev_Week.3  Rep_Prev_Entry.3  Year  Month  Day  \
0            7       

In [16]:
# Report missing values, then back-fill the target columns from the same-row
# historical entries.

# Check for missing values
missing_values = df.isnull().sum()

# Display the count of missing values for each column
print("Missing Values:\n", missing_values)

# Each target column and the column whose same-row value is used to fill it.
target_sources = {
    'Prediction1': 'Morning',
    'Prediction2': 'Afternoon',
    'Prediction3': 'Evening',
    'Prediction4': 'Night',
}
target_cols = list(target_sources)

# Identify missing values in target variables
missing_values_targets = df[target_cols].isnull().sum()
print("Missing Values in Target Variables:\n", missing_values_targets)

# Rows in which at least one target is missing. As in the row-by-row version
# this replaces, ALL four targets of such a row are overwritten, not only the
# missing ones.
rows_to_fill = df[target_cols].isnull().any(axis=1)

# Vectorised fill: keep the current value where the row needs no fill,
# otherwise take the corresponding historical entry. The result stays float,
# matching what element-wise assignment into the NaN-initialised columns gave.
for target_col, source_col in target_sources.items():
    df[target_col] = df[target_col].where(~rows_to_fill, df[source_col])

# Display the updated DataFrame
print(df.head())


Missing Values:
 Day of the Week     0
Morning             0
Prev_Week           0
Rep_Prev_Week       0
Rep_Prev_Entry      0
Afternoon           0
Prev_Week.1         0
Rep_Prev_Week.1     0
Rep_Prev_Entry.1    0
Evening             0
Prev_Week.2         0
Rep_Prev_Week.2     0
Rep_Prev_Entry.2    0
Night               0
Prev_Week.3         0
Rep_Prev_Week.3     0
Rep_Prev_Entry.3    0
Year                0
Month               0
Day                 0
Prediction1         0
Prediction2         0
Prediction3         0
Prediction4         0
dtype: int64
Missing Values in Target Variables:
 Prediction1    0
Prediction2    0
Prediction3    0
Prediction4    0
dtype: int64
   Day of the Week  Morning  Prev_Week  Rep_Prev_Week  Rep_Prev_Entry  \
0                3       19          7              0               0   
1                4       31         11              0               0   
2                5       15         19              0               0   
3                6       31     

In [17]:
# Column names making up the feature matrix and the multi-output target.
feature_columns = [
    'Day of the Week',
    'Morning', 'Prev_Week', 'Rep_Prev_Week', 'Rep_Prev_Entry',
    'Afternoon', 'Prev_Week.1', 'Rep_Prev_Week.1', 'Rep_Prev_Entry.1',
    'Evening', 'Prev_Week.2', 'Rep_Prev_Week.2', 'Rep_Prev_Entry.2',
    'Night', 'Prev_Week.3', 'Rep_Prev_Week.3', 'Rep_Prev_Entry.3',
    'Year', 'Month', 'Day',
]
target_columns = ['Prediction1', 'Prediction2', 'Prediction3', 'Prediction4']

# NOTE(review): the targets are filled from 'Morning', 'Afternoon', 'Evening'
# and 'Night' in an earlier cell, and those same columns are part of the
# features here -- this looks like target leakage; confirm it is intended.
X = df[feature_columns]
y = df[target_columns]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition as a sanity check.
for label, partition in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, partition.shape)


X_train shape: (1127, 20)
X_test shape: (282, 20)
y_train shape: (1127, 4)
y_test shape: (282, 4)


In [20]:
# Per-column count of missing entries across the whole DataFrame.
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)

# The same per-column count, restricted to the training features and targets.
for heading, training_part in (
    ("Missing Values in X_train:\n", X_train),
    ("\nMissing Values in y_train:\n", y_train),
):
    print(heading, training_part.isna().sum())


Missing Values:
 Day of the Week     0
Morning             0
Prev_Week           0
Rep_Prev_Week       0
Rep_Prev_Entry      0
Afternoon           0
Prev_Week.1         0
Rep_Prev_Week.1     0
Rep_Prev_Entry.1    0
Evening             0
Prev_Week.2         0
Rep_Prev_Week.2     0
Rep_Prev_Entry.2    0
Night               0
Prev_Week.3         0
Rep_Prev_Week.3     0
Rep_Prev_Entry.3    0
Year                0
Month               0
Day                 0
Prediction1         0
Prediction2         0
Prediction3         0
Prediction4         0
dtype: int64
Missing Values in X_train:
 Day of the Week     0
Morning             0
Prev_Week           0
Rep_Prev_Week       0
Rep_Prev_Entry      0
Afternoon           0
Prev_Week.1         0
Rep_Prev_Week.1     0
Rep_Prev_Entry.1    0
Evening             0
Prev_Week.2         0
Rep_Prev_Week.2     0
Rep_Prev_Entry.2    0
Night               0
Prev_Week.3         0
Rep_Prev_Week.3     0
Rep_Prev_Entry.3    0
Year                0
Month             