<a href="https://colab.research.google.com/github/alvinfranklyndavis/Project2023_v3/blob/main/GPT_4_Bard_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1.1: Package Installation

# Upgrade pip and install required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas gdown numpy matplotlib scikit-learn xgboost shap
!pip install -U scikit-learn
!pip install -U imbalanced-learn
!pip install black  # Install Black for code formatting


[0m

In [6]:
# Cell 1.2: Import Libraries and Set Up Logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import logging
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer
import shap

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [7]:
# Cell 1.3: Load Data from Google Drive

# Mount Google Drive
drive.mount('/content/drive')

# Define the directory for datasets in Google Drive (root directory of Google Drive)
drive_dataset_directory = '/content/drive/My Drive/'

# Define the path to the CSV file
drive_csv_path = os.path.join(drive_dataset_directory, 'initial_data.csv')

# Log the start of dataset loading
logger.info("Reading the dataset from Google Drive...")

# Load the dataset
data = pd.read_csv(drive_csv_path)
logger.info("Dataset loaded successfully from Google Drive.")

# Basic Data Exploration
logger.info("Performing basic data exploration...")
logger.info(f"Dataset Size: {data.shape}")
logger.info(f"First 5 Rows:\n{data.head()}")
logger.info(f"Missing Values:\n{data.isnull().sum()}")

# Proceed with further data processing...

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Cell 2.1: Date Feature Processing and Prediction1 Setup

# Import necessary libraries
import logging
import pandas as pd

# Set up logging
logger = logging.getLogger(__name__)

logger.info("Processing date features...")

# Assuming 'data' is your DataFrame
# Convert 'Date' to datetime and extract 'Year', 'Month', and 'Day'
data['Date'] = pd.to_datetime(data['Date'])
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

# Display data types and check for missing values
logger.info("Data types:\n%s", data.dtypes)
logger.info("Missing values:\n%s", data.isnull().sum())

# Calculate Moving Averages for specified columns
window_size = 3
columns_to_average = ['Morning', 'Afternoon', 'Evening', 'Night']
target_columns = ['Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve', 'Mov_Avg_Nig']

# Initialize moving average columns with default values (e.g., 0)
for col in target_columns:
    data[col] = 0

try:
    for col, target_col in zip(columns_to_average, target_columns):
        data[target_col] = data[col].rolling(window=window_size, min_periods=1).mean()
except Exception as e:
    logger.error("Error in moving average calculation: %s", e)

# Adjust entries to use previous day's data
data['Prev_Morning'] = data['Morning'].shift(1)
data['Prev_Afternoon'] = data['Afternoon'].shift(1)
data['Prev_Evening'] = data['Evening'].shift(1)

# Keep only relevant columns for Prediction1
selected_columns = ['Year', 'Month', 'Day', 'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening']
X = data[selected_columns]
y = data['Morning']  # Using 'Morning' as the target variable

logger.info("Saving the Prediction1 data to CSV...")
X.to_csv('/content/processed_date_features_for_prediction1.csv', index=False)
y.to_csv('/content/target_variable_for_prediction1.csv', index=False)
logger.info("Date features processed and saved successfully.")
logger.info("First few rows of feature data:\n%s", X.head())
logger.info("First few rows of target data:\n%s", y.head())


In [9]:
# Cell 2.2: Additional Data Insights

logger.info("Exploring additional data insights...")

# Print the number of missing values in each column
print("Missing values in each column:")
print(data.isnull().sum())

# Print the percentage of missing values in each column
print("\nPercentage of missing values in each column:")
print(data.isnull().mean() * 100)

# Print summary statistics of the features (X)
print("\nSummary statistics of features (X):")
print(X.describe())

# Print summary statistics of the target variable (y)
print("\nSummary statistics of target variable (y):")
print(y.describe())

# Print first few rows of the features (X)
print("\nFirst few rows of feature data (X):")
print(X.head())

# Print first few rows of the target variable (y)
print("\nFirst few rows of target data (y):")
print(y.head())


Missing values in each column:
Date                0
Day of the Week     0
Morning             0
Prev_Week           0
Rep_Prev_Week       0
Prev_Entry          0
Rep_Prev_Entry      0
Mov_Avg_Mor         0
Afternoon           0
Prev_Week.1         0
Rep_Prev_Week.1     0
Prev_Entry.1        0
Rep_Prev_Entry.1    0
Mov_Avg_Aft         0
Evening             0
Prev_Week.2         0
Rep_Prev_Week.2     0
Prev_Entry.2        0
Rep_Prev_Entry.2    0
Mov_Avg_Eve         0
Night               0
Prev_Week.3         0
Rep_Prev_Week.3     0
Prev_Entry.3        0
Rep_Prev_Entry.3    0
Mov_Avg_Nig         0
Year                0
Month               0
Day                 0
Prev_Morning        1
Prev_Afternoon      1
Prev_Evening        1
dtype: int64

Percentage of missing values in each column:
Date                0.000000
Day of the Week     0.000000
Morning             0.000000
Prev_Week           0.000000
Rep_Prev_Week       0.000000
Prev_Entry          0.000000
Rep_Prev_Entry      0.000000
Mov

In [10]:
# Cell 2.3: Setting Bounds for Numerical Range

logger.info("Enforcing numerical bounds...")

# Define the columns that should have values in the range of 1 to 36
columns_to_check = ['Morning', 'Afternoon', 'Evening', 'Night', 'Prev_Morning', 'Prev_Afternoon', 'Prev_Evening']

# Loop through these columns and enforce the range
for col in columns_to_check:
    # Find values outside the range
    outliers = data[(data[col] < 1) | (data[col] > 36)]

    # Report if any outliers are found
    if not outliers.empty:
        print(f"Outliers found in {col}:")
        print(outliers)

    # Enforce the range by clipping values
    data[col] = data[col].clip(lower=1, upper=36)

# Ensure changes are reflected
print(data[columns_to_check].describe())

# Prepare the current data with NaNs in 'Prediction1' for testing
current_data = data[selected_columns].copy()  # Use .copy() to create an independent copy
current_data['Prediction1'] = np.nan  # Initialize 'Prediction1' with NaN

# Save 'current_data' as a CSV file for loading in Step 3.1
current_data.to_csv('/content/current_data_for_prediction1.csv', index=False)
logger.info("Current data with 'Prediction1' as NaN saved as 'current_data_for_prediction1.csv'")


           Morning    Afternoon      Evening        Night  Prev_Morning  \
count  1409.000000  1409.000000  1409.000000  1409.000000   1408.000000   
mean     18.766501    18.613911    18.527324    18.109297     18.762074   
std      10.276234    10.356362    10.240544    10.375588     10.278541   
min       1.000000     1.000000     1.000000     1.000000      1.000000   
25%      10.000000     9.000000    10.000000     9.000000     10.000000   
50%      19.000000    19.000000    19.000000    18.000000     19.000000   
75%      28.000000    27.000000    27.000000    27.000000     28.000000   
max      36.000000    36.000000    36.000000    36.000000     36.000000   

       Prev_Afternoon  Prev_Evening  
count     1408.000000   1408.000000  
mean        18.620739     18.535511  
std         10.356869     10.239569  
min          1.000000      1.000000  
25%          9.000000     10.000000  
50%         19.000000     19.000000  
75%         27.000000     27.000000  
max         36.00000

In [20]:
# Cell 3.1: Data Preparation for Prediction

import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split
import joblib

logger = logging.getLogger(__name__)

logger.info("Loading historical dataset for data preparation...")

# Load the historical dataset
historical_data = pd.read_csv('/content/processed_date_features_for_prediction1.csv')
y_hist = pd.read_csv('/content/target_variable_for_prediction1.csv')
logger.info("Historical dataset loaded successfully.")

# Fill NaN values and exclude specific rows in the entire dataset
historical_data['Prev_Morning'].fillna(18, inplace=True)
historical_data['Prev_Afternoon'].fillna(18, inplace=True)
historical_data['Prev_Evening'].fillna(18, inplace=True)

# Exclude the first row and row 518 from the entire dataset
historical_data = historical_data.iloc[1:].drop(index=518).reset_index(drop=True)
y_hist = y_hist.iloc[1:].drop(index=518).reset_index(drop=True)

# Split historical data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(historical_data, y_hist, test_size=0.20, random_state=42)
logger.info(f"Data split into training and validation sets. Training set size: {X_train.shape}, Validation set size: {X_val.shape}")

# Save the split datasets to CSV files
X_train.to_csv('/content/inspected_X_train.csv', index=False)
y_train.to_csv('/content/y_train_split_for_training.csv', index=False)
X_val.to_csv('/content/inspected_X_val.csv', index=False)
y_val.to_csv('/content/inspected_y_val.csv', index=False)

logger.info("Datasets saved for manual inspection and model training.")

# Save the cleaned X_train and X_val for inspection
X_train.to_csv('/content/inspected_X_train.csv', index=False)
X_val.to_csv('/content/inspected_X_val.csv', index=False)
logger.info("X_train and X_val saved for manual inspection.")

# Save the cleaned y_val for evaluation
y_val.to_csv('/content/inspected_y_val.csv', index=False)
logger.info("y_val saved for evaluation.")

# Load the current dataset with NaNs in 'Prediction1'
current_data = pd.read_csv('/content/current_data_for_prediction1.csv')
X_current = current_data.drop('Prediction1', axis=1)

# Fill NaN values in the first row of specific columns of X_current
X_current.at[0, 'Prev_Morning'] = 18
X_current.at[0, 'Prev_Afternoon'] = 18
X_current.at[0, 'Prev_Evening'] = 18

# Save the prepared current_data
current_data.to_csv('/content/prepared_current_data_for_prediction.csv', index=False)
logger.info("Prepared current data saved.")

# Check for remaining NaNs in X_train and X_current
logger.info("Checking for NaNs in X_train and X_current...")
logger.info(f"NaN values in X_train: {X_train.isnull().sum()}")
logger.info(f"NaN values in X_current: {X_current.isnull().sum()}")


In [22]:
# Cell 3.2: Model Training and Prediction

import pandas as pd
import numpy as np
import logging
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

logger = logging.getLogger(__name__)

logger.info("Loading training and validation data for model training...")

# Load the training data
X_train = pd.read_csv('/content/inspected_X_train.csv')
y_train = pd.read_csv('/content/y_train_split_for_training.csv')
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

# Initialize and train the Random Forest model
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train.values.ravel())
logger.info("Random Forest model trained on historical data.")

# Load the validation set for model evaluation
X_val = pd.read_csv('/content/inspected_X_val.csv')
y_val = pd.read_csv('/content/inspected_y_val.csv')

# Evaluation on the validation set
y_val_pred = random_forest_model.predict(X_val)
mse_val = mean_squared_error(y_val, y_val_pred)
r2_score_val = r2_score(y_val, y_val_pred)
logger.info(f"Validation MSE: {mse_val}, R2 Score: {r2_score_val}")

# Save the trained model
model_path = '/content/random_forest_prediction_model.pkl'
joblib.dump(random_forest_model, model_path)
logger.info("Trained RandomForest model saved.")

# Load the prepared current_data for prediction
current_data = pd.read_csv('/content/prepared_current_data_for_prediction.csv')
X_current_for_prediction = current_data.drop('Prediction1', axis=1)

# Generate predictions for the current data
y_pred_current = random_forest_model.predict(X_current_for_prediction)
current_data['Prediction1'] = y_pred_current

# Save the current data with predictions
current_data.to_csv('/content/current_data_with_predictions.csv', index=False)


Shape of X_train: (1125, 9)
Shape of y_train: (1125, 1)


ValueError: ignored

In [15]:
# Cell 3.3: NaN Check in 'Prediction1' (Optional)

import pandas as pd
import logging

logger = logging.getLogger(__name__)

logger.info("Loading current data with predictions for NaN check...")

# Load the current data with predictions
current_data_with_predictions = pd.read_csv('/content/current_data_with_predictions.csv')

# Check for NaN values in 'Prediction1'
nan_count_prediction1 = current_data_with_predictions['Prediction1'].isnull().sum()
logger.info(f"Number of NaN values in 'Prediction1': {nan_count_prediction1}")

# Perform analysis of any remaining NaNs
if nan_count_prediction1 > 0:
    logger.info("Rows with NaN in 'Prediction1':")
    logger.info(current_data_with_predictions[current_data_with_predictions['Prediction1'].isnull()])

    # Additional analysis or handling of NaNs can be added here


FileNotFoundError: ignored

In [None]:
# Quick Check for NaN Values in All Axes

# Check in X_train
print("NaN values in X_train:")
print(X_train.isnull().sum())

# Check in y_train
print("\nNaN values in y_train:")
print(y_train.isnull().sum())

# Check in X_current
print("\nNaN values in X_current:")
print(X_current.isnull().sum())

# Check in current_data
print("\nNaN values in current_data:")
print(current_data.isnull().sum())



In [None]:
# STEP 4.1. MODEL INTERPRETATION

# Check for NaN values in y_test_p1
nan_count = y_test_p1.isnull().sum()
logger.info(f"Number of NaN values in y_test_p1: {nan_count}")

# If NaN values exist, print some examples
if nan_count > 0:
    logger.info("Examples of NaN values in y_test_p1:")
    logger.info(y_test_p1[y_test_p1.isnull()])

import shap
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

logger = logging.getLogger(__name__)

# Load the finalized model
final_model = joblib.load('/content/random_forest_prediction_model_imputed.pkl')

# Apply the same imputation and feature modification to X_test_p1
imputed_test = imputer.transform(X_test_p1)
X_test_p1_imputed, missing_indicator_test = imputed_test[:, :-1], imputed_test[:, -1]
X_test_p1_imputed['Target_Missing'] = missing_indicator_test

# Using SHAP to interpret the model
explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(X_test_p1_imputed)
shap.summary_plot(shap_values, X_test_p1_imputed, plot_type="bar")

# STEP 4.2. FINAL MODEL SELECTION AND REPORTING

# Report the chosen model's evaluation metrics
y_pred_final = final_model.predict(X_test_p1_imputed)

# Evaluate the model
accuracy = accuracy_score(y_test_p1, y_pred_final)
precision = precision_score(y_test_p1, y_pred_final, average='macro')
recall = recall_score(y_test_p1, y_pred_final, average='macro')
f1 = f1_score(y_test_p1, y_pred_final, average='macro')

logger.info(f"Model Performance Metrics:\n Accuracy: {accuracy}\n Precision: {precision}\n Recall: {recall}\n F1 Score: {f1}")

# Add cross-validation implementation here if applicable
# ...code for cross-validation...

# Analysis of model performance
# ...code/logic for detailed analysis of errors, biases, etc...

# STEP 4.3. PREPARATION FOR DEPLOYMENT

# ...code for deployment preparation...
# Assuming the model will be deployed in a specific environment
# Include any necessary steps for preparing the model for deployment
# This might include serialization, testing the model in a deployment-like environment, etc.

# STEP 4.4. DOCUMENTATION AND REPORTING

# Prepare a comprehensive report
report = f"""
Model Selection Rationale:
- The chosen model (e.g., Random Forest) was selected due to its superior performance in terms of accuracy, precision, and recall.

Model Performance:
- Accuracy: {accuracy}
- Precision: {precision}
- Recall: {recall}
- F1-Score: {f1}

Additional Model Analysis:
- Detailed error analysis, biases, etc.
- Results from cross-validation (if performed).

Limitations and Recommendations:
- The model may have limitations in terms of scalability or real-time prediction.
- Future work could explore more advanced models or feature engineering techniques.

Deployment Steps:
- The model will be deployed in a cloud-based environment.
- Necessary steps for deployment include serialization and environment setup.
"""

logger.info("Model documentation and reporting completed.")

# Final Checks and Tests (if applicable)
# Include any additional code for final testing or checks before deployment

logger.info("Final checks and tests completed.")

logger.info("Cell 4 tasks completed successfully.")


In [None]:
# STEP 5. CROSS-VALIDATION AND ADDITIONAL METRICS ANALYSIS

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestRegressor

# Assuming you are using RandomForestRegressor as your model
model = RandomForestRegressor(random_state=random_seed)

# Define your scoring metrics
# Remove ROC AUC score if your task is regression
scoring_metrics = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='macro'),
    'recall': make_scorer(recall_score, average='macro'),
    'f1': make_scorer(f1_score, average='macro')
}

# Perform 10-fold cross-validation
k_folds = 10  # Number of folds
cv_results = {}
for metric_name, scorer in scoring_metrics.items():
    scores = cross_val_score(model, X_p1, y_p1.fillna(-999), scoring=scorer, cv=k_folds)
    cv_results[metric_name] = scores
    logger.info(f"{metric_name} scores for each fold: {scores}")
    logger.info(f"Average {metric_name} over {k_folds} folds: {scores.mean()}")

# Additional metrics analysis and error/bias exploration
# ... Add your code for detailed analysis of errors, biases, etc. ...
logger.info("Cross-validation and additional metrics analysis completed.")

# Feature Importance Analysis using SHAP
# Assuming 'final_prediction_model' is your trained RandomForestRegressor model
import shap

# Load the trained model (if not already loaded)
final_prediction_model = joblib.load('/content/final_prediction_model.pkl')

# Explain the model's predictions using SHAP
explainer = shap.TreeExplainer(final_prediction_model)
shap_values = explainer.shap_values(X_train_p1)

# Plot summary plot using SHAP values
shap.summary_plot(shap_values, X_train_p1)

logger.info("Feature importance analysis using SHAP completed.")

In [None]:
# STEP 6. DETAILED ERROR AND BIAS ANALYSIS

import pandas as pd
import matplotlib.pyplot as plt
import joblib
import os

# Set up logging
import logging
logger = logging.getLogger(__name__)

# Mount Google Drive to access the predictions_df.csv file
drive.mount('/content/drive')

# Define the path in Google Drive where the predictions DataFrame is saved
predictions_df_path = '/content/drive/My Drive/predictions_df.csv'

# Load the DataFrame from the CSV file if it exists, otherwise create it
if os.path.exists(predictions_df_path):
    predictions_df = pd.read_csv(predictions_df_path)
    logger.info("predictions_df loaded from Google Drive successfully.")
else:
    # Make sure final_model is loaded
    final_model = joblib.load('/content/final_prediction_model.pkl')

    # Predict on the test set
    y_pred_final = final_model.predict(X_test_p1)

    # Create the predictions DataFrame
    predictions_df = pd.DataFrame({'Actual': y_test_p1, 'Predicted': y_pred_final})

    # Save the new predictions_df to Google Drive for future use
    predictions_df.to_csv(predictions_df_path, index=False)
    logger.info("predictions_df saved to Google Drive successfully.")

# Merge 'Prev_Week' into predictions_df
predictions_df = predictions_df.merge(data[['Prev_Week']], left_index=True, right_index=True)
# Merge 'Prev_Entry' into predictions_df
predictions_df = predictions_df.merge(data[['Prev_Entry']], left_index=True, right_index=True)

# Proceed with error analysis only if predictions_df is loaded or created
if 'predictions_df' in locals():
    # Analyze error distribution
    predictions_df['Error'] = predictions_df['Predicted'] - predictions_df['Actual']
    predictions_df['Absolute_Error'] = predictions_df['Error'].abs()

    # Plotting error distribution
    plt.hist(predictions_df['Error'], bins=30)
    plt.title('Error Distribution')
    plt.xlabel('Prediction Error')
    plt.ylabel('Frequency')
    plt.show()

# Subgroup analysis based on 'Prev_Week'
prev_week_performance = predictions_df.groupby('Prev_Week').mean()['Absolute_Error']
plt.figure(figsize=(10, 6))
prev_week_performance.plot(kind='bar')
plt.title('Performance by Previous Week')
plt.xlabel('Previous Week Draw')
plt.ylabel('Average Absolute Error')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Subgroup analysis based on 'Prev_Entry'
prev_entry_performance = predictions_df.groupby('Prev_Entry').mean()['Absolute_Error']
plt.figure(figsize=(10, 6))
prev_entry_performance.plot(kind='bar')
plt.title('Performance by Previous Entry')
plt.xlabel('Previous Entry')
plt.ylabel('Average Absolute Error')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Document findings
error_bias_report = """
Detailed Error Analysis:
- Error Distribution Insights: {'Describe your findings from the error distribution here'}
- Largest Errors: {'Describe characteristics of instances with largest errors here'}

Bias Exploration:
- Performance by Previous Week: {'Describe performance variations based on the previous week here'}
- Performance by Previous Entry: {'Describe performance variations based on the previous entry here'}
"""

logger.info("Error and bias analysis completed.")
logger.info(error_bias_report)


In [None]:
# STEP 7. FINAL REVIEW, DEPLOYMENT PREPARATION, AND DOCUMENTATION

# Final Model Review and Refinement
# ... Code/comments for any last adjustments to the model ...

# Deployment Preparation
# Serialize the final model
joblib.dump(final_prediction_model, '/content/final_prediction_model_for_deployment.pkl')

# Comprehensive Documentation Update
# ... Update your comprehensive report with all final findings and methodologies ...

# Final Checks and Tests
# ... Code/comments for final tests and checks ...

# Planning for Future Improvements
future_improvement_plan = """
Future Improvement Plans:
- Areas for further research: {describe areas for future research}
- Methodologies to explore: {describe potential methodologies for future iterations}
"""

logger.info("Final review and deployment preparation completed.")
logger.info(future_improvement_plan)


# New Section

# New Section