<a href="https://colab.research.google.com/github/azzedbenj007/AgentGPT/blob/main/Maintenance_predict_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# sarimax_report.py (version améliorée avec analyse complète)

import os
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller

# === 1. Configuration ===
MODEL_DIR = 'trained_models'
DATA_FILE = 'datac.csv'
TIME_COLUMN = 'timestamp'
REPORT_DIR = 'sarimax_reports'

ALL_NUMERIC_COLUMNS = [
    'temperature_oil', 'temperature_winding', 'current',
    'voltage', 'humidity', 'dissolved_gas'
]

# === 2. Préparation ===

# Créer un dossier pour les rapports si nécessaire
os.makedirs(REPORT_DIR, exist_ok=True)

# Chargement et préparation des données
print(f"📥 Chargement des données depuis {DATA_FILE}...")
try:
    df = pd.read_csv(DATA_FILE, parse_dates=[TIME_COLUMN])
    df.set_index(TIME_COLUMN, inplace=True)
    df.sort_index(inplace=True)
    # Remplissage robuste des valeurs manquantes
    if df.isnull().values.any():
        print("   -> Remplissage des valeurs manquantes (ffill, bfill)...")
        df.fillna(method='ffill', inplace=True)
        df.fillna(method='bfill', inplace=True)
except FileNotFoundError:
    print(f"❌ Fichier de données non trouvé : {DATA_FILE}")
    exit()

print("✅ Données prêtes.")

# === 3. Génération des Rapports (un par variable) ===

for col in ALL_NUMERIC_COLUMNS:
    print(f"\n{'='*20} RAPPORT COMPLET POUR : {col.upper()} {'='*20}")

    # Initialiser le contenu du rapport texte
    report_content = [f"RAPPORT D'ANALYSE COMPLET POUR LA VARIABLE : {col}\n", "="*50, "\n"]

    # --- PARTIE 1 : ANALYSE DE LA SÉRIE TEMPORELLE BRUTE ---
    report_content.append("PARTIE 1 : Analyse de la série temporelle brute\n--------------------------------------------\n")

    # 1a. Statistiques descriptives
    desc_stats = df[col].describe().to_string()
    report_content.append("1a. Statistiques Descriptives:\n")
    report_content.append(desc_stats + "\n")

    # 1b. Test de Stationnarité (Augmented Dickey-Fuller)
    report_content.append("\n1b. Test de Stationnarité (Augmented Dickey-Fuller):\n")
    adf_result = adfuller(df[col])
    report_content.append(f'   - ADF Statistic: {adf_result[0]:.4f}')
    report_content.append(f'   - p-value: {adf_result[1]:.4f}')
    report_content.append('   - Critical Values:')
    for key, value in adf_result[4].items():
        report_content.append(f'      - {key}: {value:.4f}')
    if adf_result[1] <= 0.05:
        report_content.append("   - Conclusion : p-value <= 0.05. La série est probablement stationnaire.\n")
    else:
        report_content.append("   - Conclusion : p-value > 0.05. La série n'est probablement pas stationnaire.\n")

    # 1c. Visualisation de la série temporelle
    plt.figure(figsize=(12, 6))
    plt.plot(df.index, df[col], label=f'Valeurs de {col}')
    plt.title(f"Évolution temporelle de : {col}")
    plt.xlabel("Date")
    plt.ylabel("Valeur")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(REPORT_DIR, f"raw_timeseries_{col}.png"))
    plt.close()

    # 1d. Visualisation de la distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], kde=True)
    plt.title(f"Distribution des valeurs de : {col}")
    plt.xlabel("Valeur")
    plt.ylabel("Fréquence")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(REPORT_DIR, f"raw_distribution_{col}.png"))
    plt.close()

    # 1e. Graphes d'autocorrélation (ACF & PACF)
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
    plot_acf(df[col], ax=ax1, lags=40, title=f"Autocorrélation (ACF) - {col}")
    plot_pacf(df[col], ax=ax2, lags=40, title=f"Autocorrélation Partielle (PACF) - {col}")
    plt.tight_layout()
    plt.savefig(os.path.join(REPORT_DIR, f"raw_acf_pacf_{col}.png"))
    plt.close()

    print(f"   -> Analyse de la série brute pour '{col}' terminée.")

    # --- PARTIE 2 : ANALYSE DU MODÈLE SARIMAX ---
    report_content.append("\nPARTIE 2 : Analyse du modèle SARIMAX entraîné\n---------------------------------------------\n")

    model_path = os.path.join(MODEL_DIR, f'model_{col}.pkl')
    if not os.path.exists(model_path):
        warning_msg = f"❌ MODÈLE MANQUANT pour '{col}'. Analyse du modèle ignorée."
        print(f"   -> {warning_msg}")
        report_content.append(warning_msg)
    else:
        with open(model_path, 'rb') as f:
            model_fit = pickle.load(f)

        # 2a. Résumé du modèle
        summary_text = model_fit.summary().as_text()
        report_content.append("2a. Résumé statistique du modèle:\n")
        report_content.append(summary_text + "\n")

        # 2b. Critères d'information
        report_content.append(f"\n2b. Critères d'information:\n   - AIC: {model_fit.aic:.2f}\n   - BIC: {model_fit.bic:.2f}\n")

        # 2c. Diagnostic des résidus (4 graphiques en 1)
        fig = model_fit.plot_diagnostics(figsize=(15, 12))
        fig.suptitle(f"Diagnostic des résidus du modèle pour : {col}", fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.savefig(os.path.join(REPORT_DIR, f"model_diagnostics_{col}.png"))
        plt.close()
        print(f"   -> Analyse du modèle pour '{col}' terminée.")

    # --- Écriture du fichier rapport texte consolidé ---
    report_file_path = os.path.join(REPORT_DIR, f'report_{col}.txt')
    with open(report_file_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(report_content))
    print(f"   -> Rapport texte sauvegardé : {report_file_path}")


print(f"\n✅ Tous les rapports ont été générés avec succès dans le dossier '{REPORT_DIR}'")

📥 Chargement des données depuis datac.csv...
✅ Données prêtes.

   -> Analyse de la série brute pour 'temperature_oil' terminée.
   -> Analyse du modèle pour 'temperature_oil' terminée.
   -> Rapport texte sauvegardé : sarimax_reports/report_temperature_oil.txt

   -> Analyse de la série brute pour 'temperature_winding' terminée.
   -> Analyse du modèle pour 'temperature_winding' terminée.
   -> Rapport texte sauvegardé : sarimax_reports/report_temperature_winding.txt

   -> Analyse de la série brute pour 'current' terminée.
   -> Analyse du modèle pour 'current' terminée.
   -> Rapport texte sauvegardé : sarimax_reports/report_current.txt

   -> Analyse de la série brute pour 'voltage' terminée.
   -> Analyse du modèle pour 'voltage' terminée.
   -> Rapport texte sauvegardé : sarimax_reports/report_voltage.txt

   -> Analyse de la série brute pour 'humidity' terminée.
   -> Analyse du modèle pour 'humidity' terminée.
   -> Rapport texte sauvegardé : sarimax_reports/report_humidity.txt

In [2]:
# train_models.py

import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pickle
import warnings
import os

warnings.filterwarnings("ignore")

print("--- Starting Dynamic Model Training ---")

# --- Configuration ---
file_path = 'datac.csv'

# Define ALL columns that could potentially be a target or a feature.
# We will exclude the 'fault' column as it seems categorical.
all_numeric_columns = [
    'temperature_oil',
    'temperature_winding',
    'current',
    'voltage',
    'humidity',
    'dissolved_gas'
]
time_column = 'timestamp'

# The directory where models will be saved
model_dir = 'trained_models'
# --------------------

# Create the model directory if it doesn't exist
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    print(f"Created directory: '{model_dir}'")

# 1. Load and prepare the entire dataset
print(f"Loading data from '{file_path}'...")
try:
    df = pd.read_csv(file_path, usecols=[time_column] + all_numeric_columns)
except (FileNotFoundError, ValueError) as e:
    print(f"Error loading data: {e}")
    exit()

df[time_column] = pd.to_datetime(df[time_column])
df = df.set_index(time_column).sort_index()

# Handle missing values
if df.isnull().values.any():
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)
print("Data preparation complete.")


# 2. Loop through each target column to train and save a model
for target_column in all_numeric_columns:
    print(f"\n--- Training model for: {target_column} ---")

    # The target variable (y) is the current column in the loop
    y_train = df[target_column]

    # The exogenous variables (exog) are all OTHER numeric columns
    exog_columns = [col for col in all_numeric_columns if col != target_column]
    exog_train = df[exog_columns]

    print(f"Target: {target_column}")
    print(f"Features: {exog_columns}")

    # Define SARIMAX model parameters
    my_order = (1, 1, 1)
    my_seasonal_order = (1, 1, 1, 24)

    # Train the SARIMAX model
    print(f"Training SARIMAX for {target_column}... (This may take a few minutes)")
    model = SARIMAX(y_train, exog=exog_train, order=my_order, seasonal_order=my_seasonal_order)
    model_fit = model.fit(disp=False)
    print("Training complete.")

    # 3. Save the trained model to a unique file
    model_save_path = os.path.join(model_dir, f'model_{target_column}.pkl')
    print(f"Saving model to '{model_save_path}'...")
    with open(model_save_path, 'wb') as pkl_file:
        pickle.dump(model_fit, pkl_file)

print("\n--- All models have been successfully trained and saved! ---")

--- Starting Dynamic Model Training ---
Loading data from 'datac.csv'...
Data preparation complete.

--- Training model for: temperature_oil ---
Target: temperature_oil
Features: ['temperature_winding', 'current', 'voltage', 'humidity', 'dissolved_gas']
Training SARIMAX for temperature_oil... (This may take a few minutes)
Training complete.
Saving model to 'trained_models/model_temperature_oil.pkl'...

--- Training model for: temperature_winding ---
Target: temperature_winding
Features: ['temperature_oil', 'current', 'voltage', 'humidity', 'dissolved_gas']
Training SARIMAX for temperature_winding... (This may take a few minutes)
Training complete.
Saving model to 'trained_models/model_temperature_winding.pkl'...

--- Training model for: current ---
Target: current
Features: ['temperature_oil', 'temperature_winding', 'voltage', 'humidity', 'dissolved_gas']
Training SARIMAX for current... (This may take a few minutes)
Training complete.
Saving model to 'trained_models/model_current.pkl'.

In [4]:
# predict_from_model.py

import pandas as pd
import pickle
import warnings
import os

warnings.filterwarnings("ignore")

# --- Configuration ---
file_path = 'datac.csv'
model_dir = 'trained_models'
all_numeric_columns = [
    'temperature_oil',
    'temperature_winding',
    'current',
    'voltage',
    'humidity',
    'dissolved_gas'
]
time_column = 'timestamp'
# --------------------

# --- 1. SETUP: LOAD HISTORICAL DATA AND PRE-LOAD MODELS (Optional but efficient) ---
print("--- Initializing Dynamic Prediction Service ---")

# We can pre-load all models into a dictionary for faster access
loaded_models = {}
for column_name in all_numeric_columns:
    model_path = os.path.join(model_dir, f'model_{column_name}.pkl')
    try:
        with open(model_path, 'rb') as f:
            loaded_models[column_name] = pickle.load(f)
        print(f"Successfully loaded model for '{column_name}'")
    except FileNotFoundError:
        print(f"Warning: Model file not found at '{model_path}'. Cannot predict for this column.")

# Load historical data for context (last values, past data checks)
try:
    df_history = pd.read_csv(file_path, usecols=[time_column] + all_numeric_columns)
    df_history[time_column] = pd.to_datetime(df_history[time_column])
    df_history = df_history.set_index(time_column).sort_index()
    last_known_timestamp = df_history.index[-1]
    print("\nHistorical data loaded for context.")
except FileNotFoundError:
    print(f"FATAL ERROR: Historical data file '{file_path}' not found. Cannot proceed.")
    exit()

print("--- Service is ready for predictions. ---\n")


# --- 2. DYNAMIC PREDICTION FUNCTION ---
def predict_value_at(target_column: str, timestamp_str: str) -> float:
    """Predicts a value for a specific column and timestamp using the correct pre-loaded model."""

    # Validate the requested target column
    if target_column not in loaded_models:
        print(f"Error: No model available for '{target_column}'. Please train it first.")
        return None

    model = loaded_models[target_column]

    try:
        target_timestamp = pd.to_datetime(timestamp_str)
    except ValueError:
        print(f"Error: Invalid date format for '{timestamp_str}'. Use 'YYYY-MM-DD HH:MM:SS'.")
        return None

    # Handle requests for dates that are in our historical data
    if target_timestamp <= last_known_timestamp:
        print(f"Info: '{timestamp_str}' is in the past. Returning known value for {target_column}.")
        return df_history.asof(target_timestamp)[target_column]

    # For future predictions, we need the exogenous variables (all columns EXCEPT the target)
    exog_cols = [col for col in all_numeric_columns if col != target_column]
    last_exog_values = df_history[exog_cols].iloc[-1:]

    # Forecast into the future
    forecast_result = model.get_forecast(until=target_timestamp, exog=last_exog_values)
    predicted_value = forecast_result.predicted_mean.iloc[-1]

    return predicted_value


# --- 3. TEST THE FUNCTION ---
if __name__ == "__main__":
    print("--- Testing the Dynamic Prediction Function ---\n")

    # Define test cases: (column_to_predict, time_to_predict)
    test_cases = [
        ('temperature_oil', (last_known_timestamp + pd.Timedelta(hours=6)).strftime('%Y-%m-%d %H:%M:%S')),
        ('current', (last_known_timestamp + pd.Timedelta(hours=24)).strftime('%Y-%m-%d %H:%M:%S')),
        ('voltage', (last_known_timestamp + pd.Timedelta(hours=48)).strftime('%Y-%m-%d %H:%M:%S')),
        ('humidity', (last_known_timestamp + pd.Timedelta(hours=72)).strftime('%Y-%m-%d %H:%M:%S')),
        ('temperature_winding', (last_known_timestamp + pd.Timedelta(hours=1)).strftime('%Y-%m-%d %H:%M:%S')),
    ]

    for column, ts_str in test_cases:
        predicted_val = predict_value_at(column, ts_str)
        if predicted_val is not None:
            # Get the units for better print statements
            units = "V" if "volt" in column else "A" if "curr" in column else "%" if "hum" in column else "C" if "temp" in column else ""
            print(f"Prediction for '{column}' at '{ts_str}': {predicted_val:.2f} {units}")
        print("-" * 20)

--- Initializing Dynamic Prediction Service ---
Successfully loaded model for 'temperature_oil'
Successfully loaded model for 'temperature_winding'
Successfully loaded model for 'current'
Successfully loaded model for 'voltage'
Successfully loaded model for 'humidity'
Successfully loaded model for 'dissolved_gas'

Historical data loaded for context.
--- Service is ready for predictions. ---

--- Testing the Dynamic Prediction Function ---

Prediction for 'temperature_oil' at '2023-03-25 13:00:00': 59.34 C
--------------------
Prediction for 'current' at '2023-03-26 07:00:00': 249.55 A
--------------------
Prediction for 'voltage' at '2023-03-27 07:00:00': 21934.17 V
--------------------
Prediction for 'humidity' at '2023-03-28 07:00:00': 36.91 %
--------------------
Prediction for 'temperature_winding' at '2023-03-25 08:00:00': 67.27 C
--------------------


In [5]:
# train_classifier.py (version robuste pour prédiction à J+1)

import os
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# === 1. Configuration ===

FEATURES_COLS = [
    'temperature_oil', 'temperature_winding', 'current',
    'voltage', 'humidity', 'dissolved_gas'
]
TIMESTAMP_COL = 'timestamp'
FAULT_COL = 'fault'
TARGET_COL = 'Panne_Le_Lendemain'

DATA_FILE = 'datac.csv'  # Assure-toi d'utiliser le fichier nettoyé
OUTPUT_MODEL_PATH = 'trained_models/classifier_model_j1.pkl'
DETECTION_HORIZON_DAYS = 1

# === 2. Chargement des données ===

print(f"📥 Chargement des données depuis : {DATA_FILE}")
try:
    df = pd.read_csv(DATA_FILE, parse_dates=[TIMESTAMP_COL])
except FileNotFoundError:
    print(f"❌ Fichier non trouvé : {DATA_FILE}")
    exit()
except Exception as e:
    print(f"❌ Erreur de chargement : {e}")
    exit()

# Vérifier la présence des colonnes nécessaires
required_columns = FEATURES_COLS + [FAULT_COL, TIMESTAMP_COL]
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
    print(f"❌ Colonnes manquantes dans le fichier : {missing_cols}")
    exit()

# === 3. Préparation de la cible à J+1 ===

print(f"🛠️ Création de la variable cible '{TARGET_COL}' (décalage -{DETECTION_HORIZON_DAYS})...")
df[TARGET_COL] = df[FAULT_COL].shift(-DETECTION_HORIZON_DAYS, fill_value=0).astype(int)

X = df[FEATURES_COLS]
y = df[TARGET_COL]

# === 4. Affichage de la distribution des classes ===

print("\n📊 Distribution des classes (0 = pas de panne, 1 = panne demain) :")
print(y.value_counts(normalize=True).rename(lambda x: f"{x} - {'Panne' if x == 1 else 'OK'}"))

# === 5. Split entraînement / test (80% - 20%) ===

split_index = int(len(df) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

print(f"\n📚 Entraînement sur {len(X_train)} exemples, test sur {len(X_test)}.")

# === 6. Entraînement du modèle ===

model = RandomForestClassifier(
    n_estimators=150,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

print("\n⚙️ Entraînement du modèle RandomForest...")
model.fit(X_train, y_train)
print("✅ Entraînement terminé.")

# === 7. Évaluation ===

print("\n📈 Évaluation sur l'ensemble de test :")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Pas de Panne (0)', 'Panne (1)']))

# === 8. Sauvegarde ===

# Crée le dossier s'il n'existe pas
os.makedirs(os.path.dirname(OUTPUT_MODEL_PATH), exist_ok=True)

print(f"\n💾 Sauvegarde du modèle vers : {OUTPUT_MODEL_PATH}")
joblib.dump(model, OUTPUT_MODEL_PATH)

print("\n✅ Script terminé avec succès ! Modèle J+1 prêt à être utilisé.")


📥 Chargement des données depuis : datac.csv
🛠️ Création de la variable cible 'Panne_Le_Lendemain' (décalage -1)...

📊 Distribution des classes (0 = pas de panne, 1 = panne demain) :
Panne_Le_Lendemain
0 - OK       0.953
1 - Panne    0.047
Name: proportion, dtype: float64

📚 Entraînement sur 1600 exemples, test sur 400.

⚙️ Entraînement du modèle RandomForest...
✅ Entraînement terminé.

📈 Évaluation sur l'ensemble de test :
                  precision    recall  f1-score   support

Pas de Panne (0)       0.94      1.00      0.97       378
       Panne (1)       0.00      0.00      0.00        22

        accuracy                           0.94       400
       macro avg       0.47      0.50      0.49       400
    weighted avg       0.89      0.94      0.92       400


💾 Sauvegarde du modèle vers : trained_models/classifier_model_j1.pkl

✅ Script terminé avec succès ! Modèle J+1 prêt à être utilisé.


In [14]:
#pour telecharger un fichier dans google colab
from google.colab import files
uploaded = files.upload()


Saving predict.py to predict.py


In [16]:
# predict_failure_pipeline.py (Nouveau script qui remplace predict_classifier.py et predict.py)

import pandas as pd
import joblib
from predict import predict_value_at # On importe la fonction du script SARIMAX

# === 1. Configuration ===
CLASSIFIER_MODEL_PATH = 'trained_models/classifier_model_j1.pkl'
FEATURES_COLS = [
    'temperature_oil', 'temperature_winding', 'current',
    'voltage', 'humidity', 'dissolved_gas'
]
# Horizon de prédiction : 24 heures dans le futur
prediction_horizon = pd.Timedelta(hours=24)
last_known_timestamp = pd.to_datetime(pd.read_csv('datac.csv')['timestamp'].iloc[-1])
target_timestamp = last_known_timestamp + prediction_horizon
target_timestamp_str = target_timestamp.strftime('%Y-%m-%d %H:%M:%S')

# === 2. Chargement du classifieur ===
print(f"📥 Chargement du modèle classifieur depuis : {CLASSIFIER_MODEL_PATH}")
classifier_model = joblib.load(CLASSIFIER_MODEL_PATH)

# === 3. Construction du vecteur de caractéristiques du futur via SARIMAX ===
print(f"\n🔮 Construction des données pour le futur ({target_timestamp_str}) en utilisant les modèles SARIMAX...")
future_data = {}
for feature in FEATURES_COLS:
    print(f"  -> Prédiction de '{feature}'...")
    predicted_value = predict_value_at(feature, target_timestamp_str)
    if predicted_value is None:
        print(f"❌ Impossible de prédire '{feature}', arrêt du processus.")
        exit()
    future_data[feature] = predicted_value

# Convertir le dictionnaire en DataFrame d'une ligne
X_future = pd.DataFrame([future_data])

print("\n📊 Données futures prédites pour le classifieur :")
print(X_future)

# === 4. Prédiction finale de la panne ===
prediction = classifier_model.predict(X_future)[0]
proba = classifier_model.predict_proba(X_future)[0][prediction]

# === 5. Résultat ===
if prediction == 1:
    print(f"\n🚨 Prédiction FINALE : **Panne probable à {target_timestamp_str}** avec une confiance de {proba:.2%}")
else:
    print(f"\n✅ Prédiction FINALE : **Pas de panne prévue à {target_timestamp_str}** avec une confiance de {proba:.2%}")

--- Initializing Dynamic Prediction Service ---
Successfully loaded model for 'temperature_oil'
Successfully loaded model for 'temperature_winding'
Successfully loaded model for 'current'
Successfully loaded model for 'voltage'
Successfully loaded model for 'humidity'
Successfully loaded model for 'dissolved_gas'

Historical data loaded for context.
--- Service is ready for predictions. ---

📥 Chargement du modèle classifieur depuis : trained_models/classifier_model_j1.pkl

🔮 Construction des données pour le futur (2023-03-26 07:00:00) en utilisant les modèles SARIMAX...
  -> Prédiction de 'temperature_oil'...
  -> Prédiction de 'temperature_winding'...
  -> Prédiction de 'current'...
  -> Prédiction de 'voltage'...
  -> Prédiction de 'humidity'...
  -> Prédiction de 'dissolved_gas'...

📊 Données futures prédites pour le classifieur :
   temperature_oil  temperature_winding     current       voltage   humidity  \
0        59.343341            67.265554  249.549189  21934.171041  36.914

In [17]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
import numpy as np

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# --- 1. SETUP PHASE: TRAIN THE MODEL ONCE ---

print("--- Starting Setup Phase ---")

# Configuration
file_path = 'datac.csv'
time_column = 'timestamp'
target_column = 'temperature_oil'
exog_columns = ['current', 'voltage', 'humidity', 'dissolved_gas']

# Load and prepare the entire dataset
print(f"Loading data from '{file_path}'...")
try:
    columns_to_load = [time_column, target_column] + exog_columns
    df = pd.read_csv(file_path, usecols=columns_to_load)
except (FileNotFoundError, ValueError) as e:
    print(f"Error loading data: {e}")
    exit()

df[time_column] = pd.to_datetime(df[time_column])
df = df.set_index(time_column).sort_index()

# Handle missing values
if df.isnull().values.any():
    print(f"Found {df.isnull().values.sum()} missing values. Filling them...")
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)

print("Data preparation complete.")

# Separate the target (y) and exogenous (exog) variables
y_full = df[target_column]
exog_full = df[exog_columns]

# Define SARIMAX model parameters
my_order = (1, 1, 1)
my_seasonal_order = (1, 1, 1, 24)

# Train the SARIMAX model on the full dataset
print("Training the predictive model on all available data... (This may take a few minutes)")
model_full = SARIMAX(y_full, exog=exog_full, order=my_order, seasonal_order=my_seasonal_order)
model_fit_full = model_full.fit(disp=False)
print("Model training complete.")
print("--- Setup Phase Finished. Ready for predictions. ---\n")


# --- 2. PREDICTION FUNCTION ---

def predict_temperature_at(timestamp_str: str) -> float:
    """
    Predicts the temperature for a specific future timestamp.

    Args:
        timestamp_str (str): The date and time for the prediction,
                             in a format like 'YYYY-MM-DD HH:MM:SS'.

    Returns:
        float: The predicted temperature value.
    """
    global model_fit_full, df, exog_full

    # Convert input string to a datetime object
    try:
        target_timestamp = pd.to_datetime(timestamp_str)
    except ValueError:
        print(f"Error: Invalid date format for '{timestamp_str}'. Please use 'YYYY-MM-DD HH:MM:SS'.")
        return None

    last_known_timestamp = df.index[-1]

    # Check if the requested time is in the past (already has data)
    if target_timestamp <= last_known_timestamp:
        print(f"Info: '{timestamp_str}' is in the past. Returning actual known value.")
        # Return the closest known value if it exists
        try:
            return df.loc[target_timestamp, target_column]
        except KeyError:
            # Use asof for approximate match if exact timestamp is not in index
            return df[target_column].asof(target_timestamp)


    # If the time is in the future, we need to forecast
    # Determine the number of steps to forecast
    # The frequency of the data is assumed to be hourly based on the seasonal order (24) and prior context
    freq = pd.infer_freq(df.index)
    if freq is None:
        print("Warning: Could not infer frequency of the time series. Assuming hourly ('H').")
        freq = 'H'

    time_difference = target_timestamp - last_known_timestamp
    # Calculate the number of steps based on the inferred frequency
    steps = int(time_difference / pd.Timedelta(freq))

    if steps <= 0:
         print(f"Info: Target timestamp '{target_timestamp}' is not in the future relative to the last known timestamp '{last_known_timestamp}'.")
         return df[target_column].iloc[-1]


    # We need to provide future values for the helper variables (exog)
    # A common assumption is that they remain constant at their last known value.
    last_exog_values = exog_full.iloc[-1:].values

    # Repeat the last known exogenous values for the number of forecast steps
    exog_forecast = np.tile(last_exog_values, (steps, 1))


    # Generate a forecast from the end of our data up to the target time
    forecast_result = model_fit_full.get_forecast(
        steps=steps, # Use steps instead of until
        exog=exog_forecast # Provide exog for all forecast steps
    )

    # The result contains predictions for the whole interval. We just need the one at our target time.
    # The forecast result index will be the future timestamps
    predicted_value = forecast_result.predicted_mean.iloc[-1] # Get the last predicted value

    return predicted_value


# --- 3. TEST THE FUNCTION ---

print("--- Testing the Prediction Function ---")

# Get the last timestamp from our data to generate future test dates
last_date_in_data = df.index[-1]

# Generate 10 random future timestamps within the next 7 days for testing
# (1 to 168 hours ahead)
test_dates = [last_date_in_data + pd.Timedelta(hours=h) for h in sorted(pd.Series(range(1, 169)).sample(10))]
test_date_strings = [dt.strftime('%Y-%m-%d %H:%M:%S') for dt in test_dates]

# Add one test date from the past to show that functionality
test_date_strings.append(df.index[100].strftime('%Y-%m-%d %H:%M:%S')) # Add a date from the past that is in the index

for ts_str in test_date_strings:
    predicted_temp = predict_temperature_at(ts_str)
    if predicted_temp is not None:
        print(f"Prediction for '{ts_str}': {predicted_temp:.2f} degrees")

--- Starting Setup Phase ---
Loading data from 'datac.csv'...
Data preparation complete.
Training the predictive model on all available data... (This may take a few minutes)
Model training complete.
--- Setup Phase Finished. Ready for predictions. ---

--- Testing the Prediction Function ---


ValueError: unit abbreviation w/o a number