In [2]:
# Import packages
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler


In [3]:
# Load the preprocessed and the raw training data

# Both CSV files are looked up one level above the current working directory
csv_file_path_pre_processed = Path.cwd().parent.joinpath('preprocessed_data.csv')
csv_file_path_raw = Path.cwd().parent.joinpath('training_data_vt2025.csv')

# Parse each CSV file into a pandas DataFrame
pre_processed_data = pd.read_csv(csv_file_path_pre_processed)
raw_data = pd.read_csv(csv_file_path_raw)

In [4]:
# Separate the feature columns, X, from the target column, y
X = pre_processed_data.drop('increase_stock', axis=1)
y = pre_processed_data['increase_stock']

In [13]:
# Stratified K-Fold Cross Validation (better for imbalanced classification problems)

def model_training(X, y, k, model):
    """Evaluate a classifier with stratified k-fold cross-validation.

    For every fold the features are scaled with a ColumnTransformer that is
    fitted on the training part only, a fresh copy of ``model`` is fitted on
    the scaled training part, and accuracy, classification report and
    confusion matrix are computed on the scaled validation part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Must contain the columns listed in the scaling step
        below; any additional columns are passed through unscaled.
    y : pandas.Series
        Class labels, positionally aligned with ``X``.
    k : int
        Number of folds.
    model : estimator
        Classifier exposing ``fit`` and ``predict``. It is used as a template
        and copied for each fold, so the instance passed in is not fitted.

    Returns
    -------
    dict
        ``mean_accuracy`` and ``std_accuracy`` (population standard deviation)
        over the folds, plus the per-fold lists ``models`` (one independently
        fitted model per fold), ``classification_reports`` (dicts) and
        ``confusion_matrices``.
    """
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    # Column groups are loop-invariant, so they are defined once up front.
    standardized_columns = ['temp','dew', 'windspeed', 'day_of_week_sin', 'day_of_week_cos', 'hour_of_day_sin', 'hour_of_day_cos', 'month_sin', 'month_cos']
    untouched_columns = ['weekday', 'is_raining', 'is_snowing', 'is_summer']
    minmax_columns = ['humidity', 'cloudcover']

    accuracies = []
    models = []
    classification_reports = []
    confusion_matrices = []

    for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {fold+1}/{k}")

        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Scaling: fit on the training part only so that no statistics of the
        # validation part leak into the transformation.
        ct = ColumnTransformer([
            ('scaler', StandardScaler(), standardized_columns),
            ('passthrough', 'passthrough', untouched_columns),
            ('minmax', MinMaxScaler(), minmax_columns)
        ], remainder='passthrough')
        X_train = ct.fit_transform(X_train)
        X_val = ct.transform(X_val)

        # Train a fresh copy of the model. Fitting `model` itself would refit
        # the same object in every fold, leaving `models` with k references to
        # the last fit. safe=False falls back to a deep copy for objects that
        # are not scikit-learn estimators.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_train, y_train)

        # Make predictions on the validation set
        y_pred = fold_model.predict(X_val)

        # Evaluate the model
        accuracies.append(accuracy_score(y_val, y_pred))
        classification_reports.append(
            classification_report(y_val, y_pred, output_dict=True)
        )
        confusion_matrices.append(confusion_matrix(y_val, y_pred))

        models.append(fold_model)

    results = {
        'mean_accuracy': np.mean(accuracies),
        'std_accuracy': np.std(accuracies),
        'models': models,
        'classification_reports': classification_reports,
        'confusion_matrices': confusion_matrices
    }

    return results

In [21]:
# Evaluate Linear Discriminant Analysis with 5-fold stratified cross-validation
results = model_training(X, y, k=5, model=LinearDiscriminantAnalysis())

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5


In [22]:
# Report the cross-validated accuracy: mean and spread over the folds
print(
    f"Mean Accuracy: {results['mean_accuracy']:.4f}",
    f"Standard Deviation of Accuracy: {results['std_accuracy']:.4f}",
    sep="\n",
)

Mean Accuracy: 0.8606
Standard Deviation of Accuracy: 0.0073


In [None]:
# The per-fold reports are only available through the dict returned by
# model_training; index 1 selects the second fold.
results['classification_reports'][1]

{'0': {'precision': 0.8905109489051095,
  'recall': 0.9277566539923955,
  'f1-score': 0.9087523277467412,
  'support': 263.0},
 '1': {'precision': 0.5869565217391305,
  'recall': 0.47368421052631576,
  'f1-score': 0.5242718446601942,
  'support': 57.0},
 'accuracy': 0.846875,
 'macro avg': {'precision': 0.7387337353221199,
  'recall': 0.7007204322593557,
  'f1-score': 0.7165120862034677,
  'support': 320.0},
 'weighted avg': {'precision': 0.8364403165661696,
  'recall': 0.846875,
  'f1-score': 0.84026674169695,
  'support': 320.0}}