<a href="https://colab.research.google.com/github/alvinfranklyndavis/Project2023_v3/blob/main/GPT_4_Bard_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# STEP 1. INSTALL PACKAGES AND IMPORT DATA

# Upgrade pip and install required packages
!pip install -U --upgrade-strategy eager pip
!pip install -U --upgrade-strategy eager pandas gdown numpy matplotlib scikit-learn xgboost shap
!pip install -U scikit-learn
!pip install -U imbalanced-learn

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import logging
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer
import shap

# Set up logging
# basicConfig() does nothing when the root logger already has handlers, which
# silently drops every INFO message below. `force=True` removes any existing
# root handlers first so the requested level/handler actually takes effect.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# Mount Google Drive
drive.mount('/content/drive')

# Define the directory for datasets in Google Drive (root directory of Google Drive)
drive_dataset_directory = '/content/drive/My Drive/'

# Define the local file path in Google Drive for the CSV
drive_csv_path = os.path.join(drive_dataset_directory, 'Training_Testing_Hybrid_MA.csv')

# Log the start of the dataset loading
logger.info("Reading the dataset from Google Drive...")

# Read the dataset from Google Drive (raises FileNotFoundError if the CSV is absent)
data = pd.read_csv(drive_csv_path)
logger.info("Dataset loaded successfully from Google Drive.")

# Basic Data Exploration
# Lazy %s formatting defers building the (potentially large) string reprs
# until a handler actually emits the record.
logger.info("Performing basic data exploration...")
logger.info("Dataset Size: %s", data.shape)
logger.info("First 5 Rows:\n%s", data.head())
logger.info("Missing Values:\n%s", data.isnull().sum())

# Proceed with further data processing...


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.10.1
    Uninstalling imbalanced-learn-0.10.1:
      Successfully uninstalled imbalanced-learn-0.10.1
Successfully installed imbalanced-learn-0.11.0
[0mDrive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# STEP 2. PROCESS DATE FEATURES AND SET UP PREDICTION1 COLUMN
#
# Recomputes the rolling-mean feature columns on `data`, adds an empty
# 'Prediction1' target column, and writes the Prediction1 feature subset to
# /content/train_data_prediction1.csv.

import logging
logger = logging.getLogger(__name__)

logger.info("Processing date features...")

# Display data types and check for missing values
logger.info("Data types:\n%s", data.dtypes)
logger.info("Missing values:\n%s", data.isnull().sum())

# Calculate Moving Averages for specified columns
window_size = 3
columns_to_average = ['Morning', 'Afternoon', 'Evening', 'Night']
target_columns = ['Mov_Avg_Mor', 'Mov_Avg_Aft', 'Mov_Avg_Eve', 'Mov_Avg_Nig']

# Fail fast with a clear message instead of silently leaving placeholder
# (all-zero) moving averages in the feature set when a source column is absent.
missing_columns = [col for col in columns_to_average if col not in data.columns]
if missing_columns:
    raise KeyError(f"Cannot compute moving averages; missing columns: {missing_columns}")

# min_periods=1 makes the first rows average over the values available so far
# rather than producing NaN. Any failure here (e.g. a non-numeric column) is
# allowed to propagate so it cannot go unnoticed.
for col, target_col in zip(columns_to_average, target_columns):
    data[target_col] = data[col].rolling(window=window_size, min_periods=1).mean()

# Create Target Variable Column for Prediction1
# NOTE(review): the target is created entirely as NaN here. Any later step that
# trains only on rows with a known 'Prediction1' will find no rows unless real
# target values are filled in first.
data['Prediction1'] = np.nan

# Keep only relevant columns for Prediction1
selected_columns_p1 = ['Morning', 'Prev_Week', 'Prev_Entry', 'Mov_Avg_Mor']
data_p1 = data[selected_columns_p1].assign(Prediction1=np.nan)

logger.info("Saving the Prediction1 data to CSV...")
data_p1.to_csv('/content/train_data_prediction1.csv', index=False)
logger.info("Date features processed successfully.")
logger.info("First few rows of Prediction1 data:\n%s", data_p1.head())


In [3]:
# Missing-value summary, shown with print so it appears in the cell output.
# The target column is excluded once and the null mask reused for both views.
null_mask = data.drop(columns=['Prediction1']).isnull()

print("Missing values in each column (excluding 'Prediction1'):")
print(null_mask.sum())

print("\nPercentage of missing values in each column (excluding 'Prediction1'):")
print(null_mask.mean() * 100)


Missing values in each column (excluding 'Prediction1'):
Date                0
Day of the Week     0
Morning             0
Prev_Week           0
Rep_Prev_Week       0
Prev_Entry          0
Rep_Prev_Entry      0
Mov_Avg_Mor         0
Afternoon           0
Prev_Week.1         0
Rep_Prev_Week.1     0
Prev_Entry.1        0
Rep_Prev_Entry.1    0
Mov_Avg_Aft         0
Evening             0
Prev_Week.2         0
Rep_Prev_Week.2     0
Prev_Entry.2        0
Rep_Prev_Entry.2    0
Mov_Avg_Eve         0
Night               0
Prev_Week.3         0
Rep_Prev_Week.3     0
Prev_Entry.3        0
Rep_Prev_Entry.3    0
Mov_Avg_Nig         0
dtype: int64

Percentage of missing values in each column (excluding 'Prediction1'):
Date                0.0
Day of the Week     0.0
Morning             0.0
Prev_Week           0.0
Rep_Prev_Week       0.0
Prev_Entry          0.0
Rep_Prev_Entry      0.0
Mov_Avg_Mor         0.0
Afternoon           0.0
Prev_Week.1         0.0
Rep_Prev_Week.1     0.0
Prev_Entry.1        0.

In [5]:
# STEP 3.1. LOAD TRAIN DATAFRAME FOR PREDICTION1

import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
import joblib
from imblearn.over_sampling import SMOTE  # For data augmentation

# Set up logging
logger = logging.getLogger(__name__)

# Load the train dataframe
logger.info("Loading train dataframe for Prediction1...")
train_data = pd.read_csv('/content/train_data_prediction1.csv')

# Separate features and the target variable for Prediction1
X_p1 = train_data.drop('Prediction1', axis=1)  # Features
y_p1 = train_data['Prediction1']  # Target variable

# Optional: Data Augmentation using SMOTE (if applicable)
# NOTE(review): SMOTE resamples by class label; confirm it is meaningful for
# this target before enabling.
# smote = SMOTE()
# X_p1, y_p1 = smote.fit_resample(X_p1, y_p1)

logger.info("Train dataframe for Prediction1 loaded successfully.")
logger.info(f"Data Shape - Features: {X_p1.shape}, Target: {y_p1.shape}")

# STEP 3.2. SPLIT DATA INTO TRAINING AND TESTING SETS FOR PREDICTION1

random_seed = 42
n_splits = 5

# Training on historical data with known 'Prediction1' values.
# This must be defined before it is reported below; relying on a value left in
# the kernel by an earlier run breaks on a fresh "Restart & Run All".
historical_data = train_data[train_data['Prediction1'].notna()]
X_hist, y_hist = historical_data.drop('Prediction1', axis=1), historical_data['Prediction1']

print(f"Number of samples in historical data: {len(historical_data)}")

# Cross-validation needs at least one sample per fold; stop with an explicit
# message rather than an opaque error from inside scikit-learn.
if len(historical_data) < n_splits:
    raise ValueError(
        f"Need at least {n_splits} rows with a known 'Prediction1' value to train; "
        f"found {len(historical_data)}. Populate 'Prediction1' before running this step."
    )

# Plain K-fold: stratification is defined on class labels and does not apply
# to a regression target.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

# Training and validation split
for fold_number, (train_index, val_index) in enumerate(kfold.split(X_hist), start=1):
    X_train_hist, X_val_hist = X_hist.iloc[train_index], X_hist.iloc[val_index]
    y_train_hist, y_val_hist = y_hist.iloc[train_index], y_hist.iloc[val_index]

    # A fresh model per fold, used only to estimate generalisation error
    fold_model = RandomForestRegressor(random_state=random_seed)
    fold_model.fit(X_train_hist, y_train_hist)

    # Evaluation on the validation set
    y_pred_val = fold_model.predict(X_val_hist)
    mse_val = mean_squared_error(y_val_hist, y_pred_val)
    r2_score_val = r2_score(y_val_hist, y_pred_val)

    logger.info(f"Fold {fold_number}/{n_splits} - Validation MSE: {mse_val}, R2 Score: {r2_score_val}")

# Fit the model that gets persisted on every historical row, instead of
# keeping whichever model the last fold happened to produce.
random_forest_model = RandomForestRegressor(random_state=random_seed)
random_forest_model.fit(X_hist, y_hist)

# Save the trained model
model_path = '/content/random_forest_prediction_model_hist.pkl'
joblib.dump(random_forest_model, model_path)
logger.info(f"RandomForest model trained on historical data saved to {model_path}.")

# STEP 3.4. APPLY MODEL TO CURRENT DATA

# Apply the trained model to current data (where 'Prediction1' is NaN).
# .copy() makes this an independent frame so the assignment below does not
# write into a slice of train_data (SettingWithCopyWarning).
current_data = train_data[train_data['Prediction1'].isna()].copy()
X_current = current_data.drop('Prediction1', axis=1)

if X_current.empty:
    # predict() rejects an empty feature matrix, so skip cleanly
    logger.info("No rows with missing 'Prediction1'; nothing to predict.")
else:
    # Generate predictions for the current data
    y_pred_current = random_forest_model.predict(X_current)
    current_data['Prediction1'] = y_pred_current
    logger.info("Predictions applied to current data.")

    # Optional: Save the current data with predictions
    current_data.to_csv('/content/current_data_with_predictions.csv', index=False)
    logger.info("Current data with predictions saved.")

logger.info("Model training and application to current data completed.")


Number of samples in historical data: 0


ValueError: ignored

In [None]:
# STEP 4.1. MODEL INTERPRETATION

import logging

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

logger = logging.getLogger(__name__)

# NOTE(review): `X_test_p1`, `y_test_p1` and `imputer` are not created in this
# cell; they must already exist in the kernel. Confirm which cell defines them
# so the notebook survives "Restart & Run All".

# Check for NaN values in y_test_p1
nan_count = y_test_p1.isnull().sum()
logger.info(f"Number of NaN values in y_test_p1: {nan_count}")

# If NaN values exist, print some examples
if nan_count > 0:
    logger.info("Examples of NaN values in y_test_p1:")
    logger.info(y_test_p1[y_test_p1.isnull()])

# Load the finalized model
# NOTE(review): unpickling can execute arbitrary code; only load files that
# this project produced.
final_model = joblib.load('/content/random_forest_prediction_model_imputed.pkl')

# Apply the same imputation and feature modification to X_test_p1.
# np.asarray() normalises the transformer output to a NumPy array whether it
# returns an ndarray or a DataFrame.
imputed_test = np.asarray(imputer.transform(X_test_p1))
# presumably the last imputed column is the missing-value indicator — confirm
# against the cell that fitted `imputer`
missing_indicator_test = imputed_test[:, -1]

# A NumPy array cannot take a string-keyed column assignment, so build a
# DataFrame instead. Prefer the feature names the model was fitted with; fall
# back to generic names with the indicator as the last column.
feature_names = getattr(final_model, "feature_names_in_", None)
if feature_names is None or len(feature_names) != imputed_test.shape[1]:
    feature_names = [f"feature_{i}" for i in range(imputed_test.shape[1] - 1)] + ['Target_Missing']
X_test_p1_imputed = pd.DataFrame(
    imputed_test,
    columns=list(feature_names),
    index=getattr(X_test_p1, "index", None),  # keep row labels when X_test_p1 is a DataFrame
)

# Using SHAP to interpret the model
explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(X_test_p1_imputed)
shap.summary_plot(shap_values, X_test_p1_imputed, plot_type="bar")

# STEP 4.2. FINAL MODEL SELECTION AND REPORTING

# Report the chosen model's evaluation metrics
y_pred_final = final_model.predict(X_test_p1_imputed)

# Score only rows whose true target is known; the metrics reject NaN.
known_target_mask = y_test_p1.notna().to_numpy()
if not known_target_mask.any():
    raise ValueError("y_test_p1 contains no known target values; cannot evaluate the model.")
y_true_eval = y_test_p1[known_target_mask]
y_pred_eval = y_pred_final[known_target_mask]

# Evaluate the model with regression metrics. Classification metrics
# (accuracy/precision/recall/F1) reject continuous predictions.
# NOTE(review): assumes the loaded model is a regressor, as elsewhere in this
# notebook — confirm.
mse = mean_squared_error(y_true_eval, y_pred_eval)
mae = mean_absolute_error(y_true_eval, y_pred_eval)
r2 = r2_score(y_true_eval, y_pred_eval)

logger.info(f"Model Performance Metrics:\n MSE: {mse}\n MAE: {mae}\n R2 Score: {r2}")

# Add cross-validation implementation here if applicable
# ...code for cross-validation...

# Analysis of model performance
# ...code/logic for detailed analysis of errors, biases, etc...

# STEP 4.3. PREPARATION FOR DEPLOYMENT

# ...code for deployment preparation...
# Assuming the model will be deployed in a specific environment
# Include any necessary steps for preparing the model for deployment
# This might include serialization, testing the model in a deployment-like environment, etc.

# STEP 4.4. DOCUMENTATION AND REPORTING

# Prepare a comprehensive report
report = f"""
Model Selection Rationale:
- The chosen model (e.g., Random Forest) was selected due to its performance in terms of prediction error (MSE, MAE) and R2.

Model Performance:
- MSE: {mse}
- MAE: {mae}
- R2 Score: {r2}

Additional Model Analysis:
- Detailed error analysis, biases, etc.
- Results from cross-validation (if performed).

Limitations and Recommendations:
- The model may have limitations in terms of scalability or real-time prediction.
- Future work could explore more advanced models or feature engineering techniques.

Deployment Steps:
- The model will be deployed in a cloud-based environment.
- Necessary steps for deployment include serialization and environment setup.
"""

logger.info("Model documentation and reporting completed.")

# Final Checks and Tests (if applicable)
# Include any additional code for final testing or checks before deployment

logger.info("Final checks and tests completed.")

logger.info("Cell 4 tasks completed successfully.")


In [None]:
# STEP 5. CROSS-VALIDATION AND ADDITIONAL METRICS ANALYSIS

import joblib
import shap
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestRegressor

# Assuming you are using RandomForestRegressor as your model
model = RandomForestRegressor(random_state=random_seed)

# Scoring metrics for a regression task, given as scikit-learn scorer names.
# The "neg_" scorers return negated errors (higher is better); they are
# flipped back to positive errors before being reported below.
scoring_metrics = {
    'mse': 'neg_mean_squared_error',
    'mae': 'neg_mean_absolute_error',
    'r2': 'r2',
}

# Perform 10-fold cross-validation
k_folds = 10  # Number of folds

# Only rows with a known target can be scored. Filling missing targets with a
# sentinel such as -999 would train and evaluate the model on invented values.
known_target_mask = y_p1.notna()
X_cv, y_cv = X_p1[known_target_mask], y_p1[known_target_mask]

cv_results = {}
if len(y_cv) < k_folds:
    logger.warning(
        f"Skipping cross-validation: only {len(y_cv)} rows have a known target, "
        f"but {k_folds} folds were requested."
    )
else:
    # One cross_validate call fits each fold once and scores every metric on
    # it, instead of refitting the model separately for every metric.
    raw_scores = cross_validate(model, X_cv, y_cv, scoring=scoring_metrics, cv=k_folds)
    for metric_name, scorer_name in scoring_metrics.items():
        scores = raw_scores[f"test_{metric_name}"]
        if scorer_name.startswith('neg_'):
            scores = -scores
        cv_results[metric_name] = scores
        logger.info(f"{metric_name} scores for each fold: {scores}")
        logger.info(f"Average {metric_name} over {k_folds} folds: {scores.mean()}")

# Additional metrics analysis and error/bias exploration
# ... Add your code for detailed analysis of errors, biases, etc. ...
logger.info("Cross-validation and additional metrics analysis completed.")

# Feature Importance Analysis using SHAP
# Assuming 'final_prediction_model' is your trained RandomForestRegressor model

# Load the trained model (if not already loaded)
# NOTE(review): unpickling can execute arbitrary code; only load files that
# this project produced.
final_prediction_model = joblib.load('/content/final_prediction_model.pkl')

# Explain the model's predictions using SHAP
# NOTE(review): `X_train_p1` is not created in this cell; confirm which cell
# defines it so the notebook survives "Restart & Run All".
explainer = shap.TreeExplainer(final_prediction_model)
shap_values = explainer.shap_values(X_train_p1)

# Plot summary plot using SHAP values
shap.summary_plot(shap_values, X_train_p1)

logger.info("Feature importance analysis using SHAP completed.")

In [None]:
# STEP 6. DETAILED ERROR AND BIAS ANALYSIS

import pandas as pd
import matplotlib.pyplot as plt
import joblib
import os

# Set up logging
import logging
logger = logging.getLogger(__name__)


def _plot_mean_abs_error_by(frame, group_col, title, xlabel):
    """Bar-plot the mean 'Absolute_Error' of `frame` per value of `group_col`.

    Args:
        frame: DataFrame containing `group_col` and an 'Absolute_Error' column.
        group_col: Name of the column to group by.
        title: Title of the figure.
        xlabel: Label of the x axis.

    Returns:
        Series of mean absolute error indexed by the values of `group_col`.
    """
    # Select the column before aggregating so unrelated (possibly non-numeric)
    # columns are never averaged.
    grouped_error = frame.groupby(group_col)['Absolute_Error'].mean()
    plt.figure(figsize=(10, 6))
    grouped_error.plot(kind='bar')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Average Absolute Error')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()
    return grouped_error


# Mount Google Drive to access the predictions_df.csv file
drive.mount('/content/drive')

# Define the path in Google Drive where the predictions DataFrame is saved
predictions_df_path = '/content/drive/My Drive/predictions_df.csv'

# Column under which the original row labels are persisted, so a reloaded
# frame can still be aligned with `data` by index.
row_index_column = 'row_index'

# Load the DataFrame from the CSV file if it exists, otherwise create it
if os.path.exists(predictions_df_path):
    predictions_df = pd.read_csv(predictions_df_path)
    if row_index_column in predictions_df.columns:
        predictions_df = predictions_df.set_index(row_index_column)
    else:
        # Files written without the row labels can only be aligned positionally.
        logger.warning(
            "predictions_df.csv has no '%s' column; rows are aligned with `data` by position, "
            "which may not match the original test rows.",
            row_index_column,
        )
    logger.info("predictions_df loaded from Google Drive successfully.")
else:
    # Make sure final_model is loaded
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that this project produced.
    final_model = joblib.load('/content/final_prediction_model.pkl')

    # Predict on the test set
    # NOTE(review): `X_test_p1` / `y_test_p1` are not created in this cell;
    # confirm which cell defines them.
    y_pred_final = final_model.predict(X_test_p1)

    # Create the predictions DataFrame
    predictions_df = pd.DataFrame({'Actual': y_test_p1, 'Predicted': y_pred_final})

    # Save the new predictions_df to Google Drive for future use, including
    # the row labels needed to re-align with `data` after a reload.
    predictions_df.to_csv(predictions_df_path, index=True, index_label=row_index_column)
    logger.info("predictions_df saved to Google Drive successfully.")

# Bring 'Prev_Week' and 'Prev_Entry' into predictions_df, matching rows by index
predictions_df = predictions_df.merge(
    data[['Prev_Week', 'Prev_Entry']], left_index=True, right_index=True
)

# Analyze error distribution
predictions_df['Error'] = predictions_df['Predicted'] - predictions_df['Actual']
predictions_df['Absolute_Error'] = predictions_df['Error'].abs()

# Plotting error distribution
plt.hist(predictions_df['Error'], bins=30)
plt.title('Error Distribution')
plt.xlabel('Prediction Error')
plt.ylabel('Frequency')
plt.show()

# Subgroup analysis based on 'Prev_Week'
prev_week_performance = _plot_mean_abs_error_by(
    predictions_df, 'Prev_Week', 'Performance by Previous Week', 'Previous Week Draw'
)

# Subgroup analysis based on 'Prev_Entry'
prev_entry_performance = _plot_mean_abs_error_by(
    predictions_df, 'Prev_Entry', 'Performance by Previous Entry', 'Previous Entry'
)

# Document findings
error_bias_report = """
Detailed Error Analysis:
- Error Distribution Insights: {'Describe your findings from the error distribution here'}
- Largest Errors: {'Describe characteristics of instances with largest errors here'}

Bias Exploration:
- Performance by Previous Week: {'Describe performance variations based on the previous week here'}
- Performance by Previous Entry: {'Describe performance variations based on the previous entry here'}
"""

logger.info("Error and bias analysis completed.")
logger.info(error_bias_report)


In [None]:
# STEP 7. FINAL REVIEW, DEPLOYMENT PREPARATION, AND DOCUMENTATION

# --- Final model review and refinement ---
# ... Code/comments for any last adjustments to the model ...

# --- Deployment preparation ---
# Persist the final model object to disk with joblib
deployment_model_path = '/content/final_prediction_model_for_deployment.pkl'
joblib.dump(final_prediction_model, deployment_model_path)

# --- Comprehensive documentation update ---
# ... Update your comprehensive report with all final findings and methodologies ...

# --- Final checks and tests ---
# ... Code/comments for final tests and checks ...

# --- Planning for future improvements ---
# Template text; the brace placeholders are literal and meant to be filled in by hand.
future_improvement_plan = (
    "\n"
    "Future Improvement Plans:\n"
    "- Areas for further research: {describe areas for future research}\n"
    "- Methodologies to explore: {describe potential methodologies for future iterations}\n"
)

# Emit the completion notice followed by the improvement-plan template
for closing_message in ("Final review and deployment preparation completed.", future_improvement_plan):
    logger.info(closing_message)


# New Section

# New Section