# Partie 1. Visualisation et étude de corrélation

## Non normalisés

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Folder scanned for the per-well CSV files.
csv_files_path = './Data/'
# os.listdir returns entries in arbitrary order; sorting makes the choice of
# example files identical on every run and on every machine.
csv_files = sorted(f for f in os.listdir(csv_files_path) if f.endswith('.csv'))
# The first three files serve as examples for the plots and correlations below.
example_files = csv_files[:3]
example_data = {file: pd.read_csv(os.path.join(csv_files_path, file)) for file in example_files}
# Preview the third example file (last expression of the notebook cell).
example_data[example_files[2]].head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

def affichage_plot(data, title, directory):
    """Plot each variable of ``data`` against its ``date`` column, one PNG per variable.

    Args:
        data: DataFrame with a ``date`` column parseable by ``pd.to_datetime``.
            Its first column is skipped; every other column is plotted.
            The frame is left unmodified.
        title: Label used in the figure title and in the output file name.
        directory: Output folder; created when it does not exist.
    """
    os.makedirs(directory, exist_ok=True)

    # Derive the tick information from local objects instead of adding
    # 'year' / 'year_str' helper columns to the caller's DataFrame.
    years = pd.to_datetime(data['date']).dt.year
    first_of_year = (~years.duplicated()).to_numpy()
    tick_positions = data.loc[first_of_year, 'date']
    tick_labels = years[first_of_year].astype(str)

    # Skip the first column; also ignore helper columns that a previous call
    # of the older implementation may have left on the frame.
    value_columns = [c for c in data.columns[1:] if c not in ('year', 'year_str')]

    for column in value_columns:
        plt.figure(figsize=(12, 6))
        plt.plot(data['date'], data[column], label=column, marker='.', linestyle='-')
        plt.title(f'{column} over time for {title}')
        plt.xlabel('Year')
        plt.ylabel(column)
        plt.grid(True)

        # One tick per year, placed on the first row of that year.
        plt.xticks(tick_positions, tick_labels, rotation=45)
        plt.legend()
        plt.tight_layout()

        file_name = os.path.join(directory, f"{title}_{column}.png")
        plt.savefig(file_name)

        # Close the figure so figures do not accumulate across the loop.
        plt.close()

# Save one plot per variable for every example file under "plots".
output_directory = "plots"
for file, data in example_data.items():
    affichage_plot(data, file, output_directory)


## Normalisé

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def affichage_plot(data, title, directory):
    """Plot each variable of normalized ``data`` against ``date``, one PNG per variable.

    Output files carry the ``_Normalized`` suffix.

    Args:
        data: DataFrame with a ``date`` column parseable by ``pd.to_datetime``.
            Its first column is skipped; every other column is plotted.
            The frame is left unmodified.
        title: Label used in the figure title and in the output file name.
        directory: Output folder; created when it does not exist.
    """
    os.makedirs(directory, exist_ok=True)

    # Derive the tick information from local objects instead of adding
    # 'year' / 'year_str' helper columns to the caller's DataFrame.
    years = pd.to_datetime(data['date']).dt.year
    first_of_year = (~years.duplicated()).to_numpy()
    tick_positions = data.loc[first_of_year, 'date']
    tick_labels = years[first_of_year].astype(str)

    # Skip the first column; also ignore helper columns that a previous call
    # of the older implementation may have left on the frame.
    value_columns = [c for c in data.columns[1:] if c not in ('year', 'year_str')]

    for column in value_columns:
        plt.figure(figsize=(12, 6))
        plt.plot(data['date'], data[column], label=column, marker='.', linestyle='-')
        plt.title(f'{column} over time for {title}')
        plt.xlabel('Year')
        plt.ylabel(column)
        plt.grid(True)

        # One tick per year, placed on the first row of that year.
        plt.xticks(tick_positions, tick_labels, rotation=45)
        plt.legend()
        plt.tight_layout()

        file_name = os.path.join(directory, f"{title}_{column}_Normalized.png")
        plt.savefig(file_name)

        # Close the figure so figures do not accumulate across the loop.
        plt.close()

def normalize_data_cleaned(data):
    """Drop incomplete rows and z-score every numeric column.

    Args:
        data: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame without rows containing missing values, in which each
        numeric column is replaced by ``(value - mean) / std``. Columns whose
        standard deviation is zero or undefined (fewer than two rows) are set
        to 0.
    """
    # Explicit copy so the column assignments below write to an independent
    # frame and do not trigger pandas' SettingWithCopyWarning.
    data_cleaned = data.dropna().copy()
    numerical_columns = data_cleaned.select_dtypes(include=[np.number]).columns
    for column in numerical_columns:
        mean = data_cleaned[column].mean()
        std = data_cleaned[column].std()
        if std != 0 and not np.isnan(std):
            # Standardize with the column mean and standard deviation.
            data_cleaned[column] = (data_cleaned[column] - mean) / std
        else:
            # Constant (or single-value) column: no spread to scale by.
            data_cleaned[column] = 0
    return data_cleaned

# Normalize the data of every example file (each on its own copy)
normalized_data_cleaned = {file: normalize_data_cleaned(df.copy()) for file, df in example_data.items()}

# Plot the normalized series of every example file
for file, data in normalized_data_cleaned.items():
    affichage_plot(data, file, output_directory)


## Corrélation

In [None]:
import pandas as pd
import os
from scipy.stats import pearsonr

def calculate_ccc(data, var1, var2):
    """Compute Lin's concordance correlation coefficient between two columns.

    CCC = 2 * cov(x, y) / (var(x) + var(y) + (mean(x) - mean(y))**2),
    with covariance and variances both taken as population moments (ddof=0)
    so that two identical series give exactly 1.

    Args:
        data: DataFrame holding both columns.
        var1: Name of the first column.
        var2: Name of the second column.

    Returns:
        The coefficient as a float; NaN when it is undefined (no complete
        rows, or both columns constant and equal).
    """
    # Only rows where both variables are present take part in the computation.
    clean_data = data.dropna(subset=[var1, var2])
    x = clean_data[var1]
    y = clean_data[var2]

    mean_x = x.mean()
    mean_y = y.mean()

    covariance = ((x - mean_x) * (y - mean_y)).mean()
    denominator = x.var(ddof=0) + y.var(ddof=0) + (mean_x - mean_y) ** 2

    if denominator == 0:
        return float('nan')
    return 2 * covariance / denominator

# Variables compared against the groundwater-level column 'GWL'.
variables = ['ET', 'NDVI', 'P', 'T']

# Maps each example file to {label: coefficient}.
ccc_results = {}
for file, data in example_data.items():
    ccc_results[file] = {}
    for var in variables:
        # non-normalized data
        ccc_results[file][f'GWL vs {var} (non-normalized)'] = calculate_ccc(data, 'GWL', var)
        # normalized data
        ccc_results[file][f'GWL vs {var} (normalized)'] = calculate_ccc(normalized_data_cleaned[file], 'GWL', var)

# Display the results (last expression of the notebook cell).
ccc_results


# Partie 2. Prédiction du niveau des nappes

## Prétraitement de données

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from math import sqrt

# Définir les méthodes d'imputation
def knn_imputation(df, n_neighbors=5):
    """Return a copy of ``df`` whose float64/int64 columns are filled by a KNNImputer.

    Args:
        df: Input DataFrame; it is not modified.
        n_neighbors: Neighbour count handed to ``KNNImputer``.
    """
    result = df.copy()
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    # The imputer is fitted on, and applied to, the numeric columns only.
    result[numeric_cols] = KNNImputer(n_neighbors=n_neighbors).fit_transform(df[numeric_cols])
    return result

def linear_interpolation_imputation(df):
    """Return a linearly interpolated copy of ``df``; the input is left untouched."""
    working_copy = df.copy()
    return working_copy.interpolate(method='linear')

def linear_knn_imputation(df, n_neighbors=5):
    """Interpolate the float64/int64 columns linearly, then run a KNNImputer on the result.

    Args:
        df: Input DataFrame; it is not modified.
        n_neighbors: Neighbour count handed to ``KNNImputer``.

    Returns:
        A copy of ``df`` whose numeric columns hold the imputed values.
    """
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

    # Step 1: linear interpolation of the numeric part.
    interpolated = df[numeric_cols].copy().interpolate(method='linear')

    # Step 2: KNN imputation of whatever interpolation left missing.
    filled = interpolated.copy()
    filled.iloc[:, :] = KNNImputer(n_neighbors=n_neighbors).fit_transform(interpolated)

    # Put the imputed numeric columns back next to the untouched ones.
    combined = df.copy()
    combined[numeric_cols] = filled
    return combined

# Hide a share of the non-null values so imputation can be scored against them
def mask_values(df, column, mask_fraction=0.2):
    """Return a copy of ``df`` with a random share of ``column`` set to NaN.

    Args:
        df: Input DataFrame; it is not modified.
        column: Column in which values are hidden.
        mask_fraction: Share of the non-null values to hide (rounded down).

    Returns:
        ``(masked_copy, mask_indices)`` where ``mask_indices`` are the index
        labels drawn with ``np.random.choice`` (without replacement).
    """
    masked = df.copy()
    candidates = masked[masked[column].notna()].index
    how_many = int(len(candidates) * mask_fraction)
    mask_indices = np.random.choice(candidates, how_many, replace=False)
    masked.loc[mask_indices, column] = np.nan
    return masked, mask_indices

# Score an imputation on the values that were hidden
def evaluate_imputation_on_masked(df, original_df, mask_indices, method_name):
    """Return the RMSE between original and imputed 'GWL' values at ``mask_indices``.

    ``method_name`` is accepted but not used by the computation.
    """
    truth = original_df.loc[mask_indices]['GWL']
    estimate = df.loc[mask_indices]['GWL']
    return sqrt(mean_squared_error(truth, estimate))

# Apply one imputation method, make sure no NaN is left, and score it
def apply_and_evaluate(method, method_name, masked_df, original_df, mask_indices, n_neighbors=5):
    """Impute ``masked_df`` with the method named ``method_name`` and return its RMSE.

    Args:
        method: Imputation callable; kept for interface compatibility, the
            dispatch is done on ``method_name``.
        method_name: One of ``"KNN"``, ``"Linear"`` or ``"KNN + Linear"``.
        masked_df: DataFrame in which some 'GWL' values were hidden.
        original_df: DataFrame holding the true values.
        mask_indices: Index labels of the hidden values.
        n_neighbors: Neighbour count for the KNN-based methods.

    Returns:
        RMSE on 'GWL' at ``mask_indices``.

    Raises:
        ValueError: If ``method_name`` is not one of the supported names.
    """
    if method_name == "KNN":
        imputed_df = knn_imputation(masked_df, n_neighbors)
    elif method_name == "Linear":
        imputed_df = linear_interpolation_imputation(masked_df)
    elif method_name == "KNN + Linear":
        imputed_df = linear_knn_imputation(masked_df, n_neighbors)
    else:
        raise ValueError(f"Unknown imputation method: {method_name!r}")

    # Fill any value the method left missing with the column mean.
    # numeric_only=True keeps non-numeric columns such as 'date' out of the
    # mean; without it recent pandas versions raise a TypeError.
    imputed_df = imputed_df.fillna(imputed_df.mean(numeric_only=True))

    return evaluate_imputation_on_masked(imputed_df, original_df, mask_indices, method_name)

# Score the three imputation methods on every CSV file of a folder
def process_and_evaluate_files(source_dir, n_neighbors=5):
    """Evaluate KNN, linear and linear+KNN imputation on each ``.csv`` in ``source_dir``.

    Args:
        source_dir: Folder scanned for ``.csv`` files.
        n_neighbors: Neighbour count for the KNN-based methods.

    Returns:
        ``(results, errors)``: ``results`` maps a file name to its RMSE per
        method; ``errors`` lists ``(file name, message)`` for files whose
        processing raised.
    """
    scores = {}
    failures = []
    for filename in os.listdir(source_dir):
        if not filename.endswith(".csv"):
            continue
        try:
            original = pd.read_csv(os.path.join(source_dir, filename))

            # All three methods are scored on the same masked frame.
            masked_df, mask_indices = mask_values(original, 'GWL')

            knn_score = apply_and_evaluate(
                knn_imputation, "KNN", masked_df, original, mask_indices, n_neighbors)
            linear_score = apply_and_evaluate(
                linear_interpolation_imputation, "Linear", masked_df, original, mask_indices, n_neighbors)
            combined_score = apply_and_evaluate(
                linear_knn_imputation, "KNN + Linear", masked_df, original, mask_indices, n_neighbors)

            scores[filename] = {
                "KNN": knn_score,
                "Linear": linear_score,
                "KNN + Linear": combined_score,
            }
        except Exception as e:
            # Record the failure and carry on with the next file.
            failures.append((filename, str(e)))

    return scores, failures

# Folder holding the raw per-well CSV files.
source_directory = './Data/'

# Score the imputation methods on every file, using 18 neighbours for KNN.
results, errors = process_and_evaluate_files(source_directory, n_neighbors=18)

In [None]:
import pandas as pd

# One row per file, one column per imputation method.
df = pd.DataFrame(results).T

# Remember the score columns before the label column is added, so the
# statistics below never include the string column 'Best_Approach'
# (DataFrame.mean raises a TypeError on it in recent pandas versions).
method_columns = list(df.columns)

# Identify the best approach
df['Best_Approach'] = df[method_columns].idxmin(axis=1)
mean_performance = df[method_columns].mean()
best_overall_approach = mean_performance.idxmin()

print("Meilleure approche pour chaque fichier :")
print(df['Best_Approach'].value_counts())
print("\nPerformance moyenne de chaque méthode :")
print(mean_performance)
print("\nMeilleure approche globale :")
print(best_overall_approach)


In [None]:
import os
import pandas as pd
from sklearn.impute import KNNImputer
import numpy as np

def linear_knn_imputation(df):
    """Impute the float64/int64 columns (linear, then KNN with 5 neighbours) and add tiny noise.

    Returns:
        A copy of ``df`` with imputed numeric columns, from which rows whose
        numeric values are exact duplicates of another row are removed.
    """
    # Only the numeric columns are imputed
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    interpolated = df[numeric_cols].copy().interpolate(method='linear')

    filled = interpolated.copy()
    filled.iloc[:, :] = KNNImputer(n_neighbors=5).fit_transform(interpolated)

    # Add random noise (standard deviation 1e-5) to avoid identical values
    filled += np.random.normal(0, 1e-5, filled.shape)

    combined = df.copy()
    combined[numeric_cols] = filled

    # Drop every row that shares all its numeric values with another row
    has_twin = combined.duplicated(subset=numeric_cols, keep=False)
    return combined.loc[~has_twin]

def process_files(source_dir, dest_dir):
    """Impute every ``.csv`` of ``source_dir`` and write it under the same name in ``dest_dir``.

    ``dest_dir`` is created when missing. A file whose processing raises is
    reported on stdout and skipped.
    """
    os.makedirs(dest_dir, exist_ok=True)

    csv_names = [name for name in os.listdir(source_dir) if name.endswith(".csv")]
    for filename in csv_names:
        try:
            frame = pd.read_csv(os.path.join(source_dir, filename))

            # Linear + KNN imputation
            imputed = linear_knn_imputation(frame)

            # Save without the index column
            imputed.to_csv(os.path.join(dest_dir, filename), index=False)
            print(f"Processed and saved: {filename}")
        except Exception as e:
            print(f"Error processing file {filename}: {e}")

# Raw per-well CSV files.
source_directory = './Data/'
# Imputed files. Spelled exactly like the './data1' folder that the training
# cells read from, so the pipeline also works on case-sensitive filesystems.
destination_directory = './data1/'

process_files(source_directory, destination_directory)


In [None]:
"""
#Pour supprimer les valeurs Null au lieu de les remplacer

import os
import pandas as pd

def process_files(source_dir, dest_dir):
    
    os.makedirs(dest_dir, exist_ok=True)
    
    
    for filename in os.listdir(source_dir):
        if filename.endswith(".csv"):
            try:
                filepath = os.path.join(source_dir, filename)
                df = pd.read_csv(filepath)
                
                # Supprimer les lignes où GWL est nul
                df_cleaned = df.dropna(subset=['GWL'])
                

                output_filepath = os.path.join(dest_dir, filename)
                df_cleaned.to_csv(output_filepath, index=False)
                print(f"Processed and saved: {filename}")
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

source_directory = './Data/'  
destination_directory = './Data/' 

process_files(source_directory, destination_directory)"""

## Modèle spécifique à un puits

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# Data preprocessing for a single well
def preprocess_data(file_path):
    """Load one well CSV and build scaled train/validation/test tensors.

    Features are the columns P, T, ET and NDVI; the target is GWL. Each sample
    pairs 6 consecutive feature rows with the 6 target values of the same
    window shifted one row forward.

    Args:
        file_path: Path of the CSV file; it must have a ``date`` column.

    Returns:
        ``(train_x, train_y, val_x, val_y, test_x, test_y, target_scaler)``:
        float32 tensors, the last 24 samples being split into 12 validation
        and the remaining test samples, plus the fitted target scaler.
    """
    sequence_length = 6  # rows per window (months, per the original notes)
    val_size = 12

    frame = pd.read_csv(file_path)

    # Chronological order
    frame['date'] = pd.to_datetime(frame['date'])
    frame.sort_values('date', inplace=True)

    # NOTE(review): both scalers are fitted on the whole series, i.e. including
    # the rows that later end up in the validation and test splits.
    features_scaled = MinMaxScaler().fit_transform(frame[['P', 'T', 'ET', 'NDVI']])
    target_scaler = MinMaxScaler()
    target_scaled = target_scaler.fit_transform(frame['GWL'].values.reshape(-1, 1))

    # Sliding windows: features [s, s+6), targets [s+1, s+7)
    window_starts = range(len(features_scaled) - sequence_length)
    sequences = np.array(
        [features_scaled[s:s + sequence_length] for s in window_starts]
    )
    targets = np.array(
        [target_scaled[s + 1:s + 1 + sequence_length] for s in window_starts]
    ).reshape(-1, sequence_length)

    # Everything but the last 24 samples is used for training
    train_size = len(sequences) - 24
    val_end = train_size + val_size

    def as_float_tensor(array):
        return torch.tensor(array, dtype=torch.float32)

    return (
        as_float_tensor(sequences[:train_size]),
        as_float_tensor(targets[:train_size]),
        as_float_tensor(sequences[train_size:val_end]),
        as_float_tensor(targets[train_size:val_end]),
        as_float_tensor(sequences[val_end:]),
        as_float_tensor(targets[val_end:]),
        target_scaler,
    )

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first input ``x`` to one output vector per sample."""
        # Zero initial hidden and cell states, created on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)
        sequence_output, _ = self.lstm(x, (h0, c0))
        # Only the output of the last time step feeds the linear head.
        return self.fc(sequence_output[:, -1, :])

# Train the model with early stopping on the validation loss
def train_model(train_loader, val_loader, model, criterion, optimizer, num_epochs, patience):
    """Train ``model`` and restore the weights of its best validation epoch.

    Args:
        train_loader: Iterable of ``(features, target)`` training batches.
        val_loader: Iterable of ``(features, target)`` validation batches.
        model: Module to train; modified in place.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer bound to the model parameters.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without validation improvement
            after which training stops.

    Returns:
        ``(train_losses, val_losses)``: mean per-batch loss of every epoch run.
    """
    import copy  # local import: the cell's import list does not provide it

    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    patience_counter = 0
    best_model = None

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for features, target in train_loader:
            output = model(features)
            loss = criterion(output, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for features, target in val_loader:
                output = model(features)
                loss = criterion(output, target)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        val_losses.append(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # state_dict() hands out references to the live parameter tensors;
            # a deep copy is needed to freeze the weights of this epoch.
            best_model = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch + 1}')
            break

    # Restore the best weights; nothing to restore if no epoch ever improved
    # on the initial infinite loss (e.g. zero epochs or NaN losses).
    if best_model is not None:
        model.load_state_dict(best_model)
    return train_losses, val_losses

# Evaluate the model on the test data
def evaluate_model(test_loader, model, target_scaler):
    """Return the mean absolute error of ``model`` over all test values.

    Predictions and targets are passed through
    ``target_scaler.inverse_transform`` before the error is computed. The
    error is averaged over every value of every batch, so a smaller last
    batch does not get extra weight.

    Args:
        test_loader: Iterable of ``(features, target)`` batches.
        model: Trained module.
        target_scaler: Object exposing ``inverse_transform``.

    Returns:
        The mean absolute error as a float.

    Raises:
        ValueError: If ``test_loader`` yields no values.
    """
    model.eval()
    total_abs_error = 0.0
    n_values = 0
    with torch.no_grad():
        for features, target in test_loader:
            output = model(features)
            # .cpu() keeps the conversion valid for tensors living on another device.
            output = target_scaler.inverse_transform(output.cpu().numpy())
            target = target_scaler.inverse_transform(target.cpu().numpy())

            abs_error = np.abs(output - target)
            total_abs_error += float(abs_error.sum())
            n_values += abs_error.size

    if n_values == 0:
        raise ValueError("test_loader yielded no samples to evaluate")
    return total_abs_error / n_values

data_dir = './data1'
# os.listdir returns entries in arbitrary order; sort before slicing so the
# same 20 wells are selected on every run.
files = [os.path.join(data_dir, f) for f in sorted(os.listdir(data_dir)) if f.endswith('.csv')][:20]

# Hyperparameters
hidden_size = 50
num_layers = 2
output_size = 6  # six-month prediction
num_epochs = 50
learning_rate = 0.001
batch_size = 32
patience = 10  # early-stopping patience, in epochs

# Test MAE of each well, filled by the training loop below.
mae_list = []

# Train and evaluate one independent model per well file.
for file_path in files:
    train_features_tensor, train_target_tensor, val_features_tensor, val_target_tensor, test_features_tensor, test_target_tensor, target_scaler = preprocess_data(file_path)

    # Build the DataLoaders (only the training set is shuffled)
    train_dataset = TensorDataset(train_features_tensor, train_target_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = TensorDataset(val_features_tensor, val_target_tensor)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    test_dataset = TensorDataset(test_features_tensor, test_target_tensor)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # The number of input features is read from the last tensor dimension.
    input_size = train_features_tensor.shape[2]
    model = LSTMModel(input_size, hidden_size, num_layers, output_size)
    criterion = nn.L1Loss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_losses, val_losses = train_model(train_loader, val_loader, model, criterion, optimizer, num_epochs, patience)

    test_mae = evaluate_model(test_loader, model, target_scaler)
    mae_list.append(test_mae)
    print(f'File: {file_path}, Test MAE: {test_mae:.4f}')

# Compute the minimum and the average MAE over all wells
min_mae = np.min(mae_list)
avg_mae = np.mean(mae_list)

print(f'Minimum MAE: {min_mae:.4f}')
print(f'Average MAE: {avg_mae:.4f}')


## Modèle général simple

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# Data preprocessing for the general (all wells) model
def preprocess_data(files):
    """Build pooled train/test tensors from several well CSV files.

    Every file is scaled with its own MinMaxScalers. Each sample pairs 6
    consecutive feature rows (P, T, ET, NDVI) with the 6 GWL values of the
    same window shifted one row forward. Windows whose last target row lies
    before the final 12 rows go to the training set; the test set starts at
    the first of those 12 rows.

    Args:
        files: Paths of the CSV files; each must have a ``date`` column.

    Returns:
        ``(train_x, train_y, test_x, test_y, target_scaler)`` as float32
        tensors plus the target scaler of the LAST file read.

    Raises:
        ValueError: If no file is long enough to yield train and test windows.
    """
    sequence_length = 6  # rows per window (months, per the original notes)

    train_features_list = []
    train_target_list = []
    test_features_list = []
    test_target_list = []
    target_scaler = None

    for file_path in files:
        data = pd.read_csv(file_path)

        data['date'] = pd.to_datetime(data['date'])
        data.sort_values('date', inplace=True)  # chronological order

        features = data[['P', 'T', 'ET', 'NDVI']]
        target = data['GWL']

        features_scaled = MinMaxScaler().fit_transform(features)

        # NOTE(review): a new target scaler is fitted for every file but only
        # the last one is returned, so inverse-transforming pooled predictions
        # with it is only exact for that last file — confirm this is intended.
        target_scaler = MinMaxScaler()
        target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1))

        num_data_points = len(features_scaled)
        train_end = num_data_points - 12  # first row of the held-out period

        # Skip files too short to provide training windows.
        if train_end < sequence_length:
            continue

        # Training windows. The range stops before train_end: the window
        # ending at train_end is the first test window, and including it here
        # would put the very same sample in both sets.
        for i in range(sequence_length, train_end):
            train_features_list.append(features_scaled[i-sequence_length:i])
            train_target_list.append(target_scaled[i-sequence_length+1:i+1])

        # Test windows
        for i in range(train_end, num_data_points - sequence_length + 1):
            test_features_list.append(features_scaled[i-sequence_length:i])
            test_target_list.append(target_scaled[i-sequence_length+1:i+1])

    if not train_features_list or not test_features_list:
        raise ValueError("No file provided enough rows to build train and test sequences")

    # Lists -> numpy arrays; targets are flattened to (samples, sequence_length)
    train_features_array = np.array(train_features_list)
    train_target_array = np.array(train_target_list).reshape(-1, sequence_length)

    test_features_array = np.array(test_features_list)
    test_target_array = np.array(test_target_list).reshape(-1, sequence_length)

    # numpy arrays -> PyTorch tensors
    train_features_tensor = torch.tensor(train_features_array, dtype=torch.float32)
    train_target_tensor = torch.tensor(train_target_array, dtype=torch.float32)

    test_features_tensor = torch.tensor(test_features_array, dtype=torch.float32)
    test_target_tensor = torch.tensor(test_target_array, dtype=torch.float32)

    return train_features_tensor, train_target_tensor, test_features_tensor, test_target_tensor, target_scaler

class LSTMModel(nn.Module):
    """LSTM stack whose last time-step output is projected by a linear layer."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def _zero_state(self, x):
        """Return an all-zero state tensor sized for batch ``x``, on ``x``'s device."""
        return torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)

    def forward(self, x):
        """Run the LSTM from zero initial states and project the last time step."""
        h0 = self._zero_state(x)
        c0 = self._zero_state(x)
        lstm_out, _ = self.lstm(x, (h0, c0))
        last_step = lstm_out[:, -1, :]
        return self.fc(last_step)

# Train the general model and keep the checkpoint with the lowest training loss
def train_model(train_loader, model, criterion, optimizer, num_epochs, model_save_path):
    """Train ``model`` for ``num_epochs`` epochs.

    After every epoch the mean per-batch training loss is printed; whenever it
    is the lowest seen so far, the model's state dict is written to
    ``model_save_path``.

    Returns:
        The list of per-epoch mean training losses.
    """
    history = []
    lowest_loss = float('inf')

    for epoch_index in range(num_epochs):
        model.train()
        running_loss = 0
        for batch_features, batch_target in train_loader:
            batch_loss = criterion(model(batch_features), batch_target)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        epoch_loss = running_loss / len(train_loader)
        history.append(epoch_loss)

        print(f'Epoch {epoch_index+1}, Train Loss: {epoch_loss:.4f}')

        # Save the best model so far
        if epoch_loss < lowest_loss:
            lowest_loss = epoch_loss
            torch.save(model.state_dict(), model_save_path)

    return history

data_dir = './data1'
# Sorted listing: the order of the files decides which file's target scaler
# preprocess_data returns, so it must not depend on the filesystem.
files = [os.path.join(data_dir, f) for f in sorted(os.listdir(data_dir)) if f.endswith('.csv')]

# Hyperparameters
hidden_size = 50
num_layers = 2
output_size = 6  # six-month prediction
num_epochs = 65
learning_rate = 0.001
batch_size = 32

# Pool the sequences of all wells into one training set and one test set.
train_features_tensor, train_target_tensor, test_features_tensor, test_target_tensor, target_scaler = preprocess_data(files)

# Build the DataLoaders (only the training set is shuffled)
train_dataset = TensorDataset(train_features_tensor, train_target_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_features_tensor, test_target_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# The number of input features is read from the last tensor dimension.
input_size = train_features_tensor.shape[2]
model = LSTMModel(input_size, hidden_size, num_layers, output_size)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


# Checkpoint of the general model; its folder is created when missing.
model_save_path = './model/best_model.pth'
if not os.path.exists(os.path.dirname(model_save_path)):
    os.makedirs(os.path.dirname(model_save_path))

train_losses = train_model(train_loader, model, criterion, optimizer, num_epochs, model_save_path)


In [None]:
# Load the saved model checkpoint
model.load_state_dict(torch.load(model_save_path))

# Evaluation
def evaluate_model(test_loader, model, target_scaler):
    """Compute error statistics of ``model`` over the whole test loader.

    Predictions and targets are passed through
    ``target_scaler.inverse_transform`` before the errors are computed.

    Args:
        test_loader: Iterable of ``(features, target)`` batches.
        model: Trained module.
        target_scaler: Object exposing ``inverse_transform``.

    Returns:
        ``(mae, std, min, max)`` of the signed errors ``prediction - target``;
        the MAE is averaged over all values, so a smaller last batch does not
        get extra weight.

    Raises:
        ValueError: If ``test_loader`` yields no batch.
    """
    model.eval()
    errors = []

    with torch.no_grad():
        for features, target in test_loader:
            output = model(features)
            # .cpu() keeps the conversion valid for tensors living on another device.
            output = target_scaler.inverse_transform(output.cpu().numpy())
            target = target_scaler.inverse_transform(target.cpu().numpy())

            errors.append(output - target)

    if not errors:
        raise ValueError("test_loader yielded no samples to evaluate")

    errors = np.concatenate(errors)
    return np.mean(np.abs(errors)), np.std(errors), np.min(errors), np.max(errors)


# Evaluate the reloaded general model on the pooled test set and report the error statistics.
test_mae, std_error, min_error, max_error = evaluate_model(test_loader, model, target_scaler)
print(f'Final Test MAE: {test_mae:.4f}, Std Error: {std_error:.4f}, Min Error: {min_error:.4f}, Max Error: {max_error:.4f}')


## Modèle général fine-tuned par région

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

class LSTMModel(nn.Module):
    """Recurrent regressor: multi-layer LSTM plus a linear output layer."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear projection of the LSTM output at the last time step."""
        batch = x.size(0)
        # Hidden and cell states both start at zero, on the same device as x.
        initial_state = (
            torch.zeros(self.num_layers, batch, self.hidden_size, device=x.device),
            torch.zeros(self.num_layers, batch, self.hidden_size, device=x.device),
        )
        per_step_output, _ = self.lstm(x, initial_state)
        return self.fc(per_step_output[:, -1, :])

# Data preprocessing for the wells of one region
def preprocess_region_data(files):
    """Build pooled train/test tensors from the well CSV files of one region.

    Every file is scaled with its own MinMaxScalers. Each sample pairs 6
    consecutive feature rows (P, T, ET, NDVI) with the 6 GWL values of the
    same window shifted one row forward. Windows whose last target row lies
    before the final 12 rows go to the training set; the test set starts at
    the first of those 12 rows.

    Args:
        files: Paths of the CSV files; each must have a ``date`` column.

    Returns:
        ``(train_x, train_y, test_x, test_y, target_scaler)`` as float32
        tensors plus the target scaler of the LAST file read.

    Raises:
        ValueError: If no file is long enough to yield train and test windows.
    """
    sequence_length = 6  # rows per window

    train_features_list = []
    train_target_list = []
    test_features_list = []
    test_target_list = []
    target_scaler = None

    for file_path in files:
        data = pd.read_csv(file_path)
        data['date'] = pd.to_datetime(data['date'])
        data.sort_values('date', inplace=True)  # chronological order

        features = data[['P', 'T', 'ET', 'NDVI']]
        target = data['GWL']

        features_scaled = MinMaxScaler().fit_transform(features)

        # NOTE(review): a new target scaler is fitted for every file but only
        # the last one is returned, so inverse-transforming pooled predictions
        # with it is only exact for that last file — confirm this is intended.
        target_scaler = MinMaxScaler()
        target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1))

        num_data_points = len(features_scaled)
        train_end = num_data_points - 12  # first row of the held-out period

        # Skip files too short to provide training windows.
        if train_end < sequence_length:
            continue

        # Training windows. The range stops before train_end: the window
        # ending at train_end is the first test window, and including it here
        # would put the very same sample in both sets.
        for i in range(sequence_length, train_end):
            train_features_list.append(features_scaled[i-sequence_length:i])
            train_target_list.append(target_scaled[i-sequence_length+1:i+1])

        # Test windows
        for i in range(train_end, num_data_points - sequence_length + 1):
            test_features_list.append(features_scaled[i-sequence_length:i])
            test_target_list.append(target_scaled[i-sequence_length+1:i+1])

    if not train_features_list or not test_features_list:
        raise ValueError("No file provided enough rows to build train and test sequences")

    # Lists -> numpy arrays; targets are flattened to (samples, sequence_length)
    train_features_array = np.array(train_features_list)
    train_target_array = np.array(train_target_list).reshape(-1, sequence_length)

    test_features_array = np.array(test_features_list)
    test_target_array = np.array(test_target_list).reshape(-1, sequence_length)

    # numpy arrays -> PyTorch tensors
    train_features_tensor = torch.tensor(train_features_array, dtype=torch.float32)
    train_target_tensor = torch.tensor(train_target_array, dtype=torch.float32)

    test_features_tensor = torch.tensor(test_features_array, dtype=torch.float32)
    test_target_tensor = torch.tensor(test_target_array, dtype=torch.float32)

    return train_features_tensor, train_target_tensor, test_features_tensor, test_target_tensor, target_scaler

# Fine-tune the model and keep the checkpoint with the lowest training loss
def fine_tune_model(train_loader, model, criterion, optimizer, num_epochs, model_save_path):
    """Continue training ``model`` for ``num_epochs`` epochs.

    Whenever the mean per-batch training loss of an epoch is the lowest seen
    so far, the state dict is written to ``model_save_path``. The loss of the
    epoch and the best loss are printed after every epoch. Returns nothing.
    """
    lowest_loss = float('inf')

    for epoch_index in range(num_epochs):
        model.train()
        batch_losses = []
        for batch_features, batch_target in train_loader:
            batch_loss = criterion(model(batch_features), batch_target)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            batch_losses.append(batch_loss.item())

        # Start the sum from the int 0, as a running `+=` accumulator would.
        epoch_loss = sum(batch_losses, 0) / len(train_loader)

        improved = epoch_loss < lowest_loss
        if improved:
            lowest_loss = epoch_loss
            torch.save(model.state_dict(), model_save_path)

        print(f'Epoch {epoch_index+1}, Train Loss: {epoch_loss:.4f} (Best: {lowest_loss:.4f})')

# Evaluate the model on the test data
def evaluate_model(test_loader, model, target_scaler):
    """Compute error statistics of ``model`` over the whole test loader.

    Predictions and targets are passed through
    ``target_scaler.inverse_transform`` before the errors are computed.

    Args:
        test_loader: Iterable of ``(features, target)`` batches.
        model: Trained module.
        target_scaler: Object exposing ``inverse_transform``.

    Returns:
        ``(mae, std, min, max)`` of the signed errors ``prediction - target``;
        the MAE is averaged over all values, so a smaller last batch does not
        get extra weight.

    Raises:
        ValueError: If ``test_loader`` yields no batch.
    """
    model.eval()
    errors = []

    with torch.no_grad():
        for features, target in test_loader:
            output = model(features)
            # .cpu() keeps the conversion valid for tensors living on another device.
            output = target_scaler.inverse_transform(output.cpu().numpy())
            target = target_scaler.inverse_transform(target.cpu().numpy())

            errors.append(output - target)

    if not errors:
        raise ValueError("test_loader yielded no samples to evaluate")

    errors = np.concatenate(errors)
    return np.mean(np.abs(errors)), np.std(errors), np.min(errors), np.max(errors)

# Architecture of the general model; it must match the saved checkpoint.
input_size = 4
hidden_size = 50
num_layers = 2
output_size = 6

# Rebuild the network and load the weights of the general model.
model = LSTMModel(input_size, hidden_size, num_layers, output_size)
model.load_state_dict(torch.load(model_save_path))

# Read OUVRAGES.csv to get the well identifiers and their regions
ouvrages_file_path = './OUVRAGES.csv'
ouvrages_data = pd.read_csv(ouvrages_file_path)

well_id_column = 'Ouvrage'
region_column = 'Region'

vic_wells = ouvrages_data[ouvrages_data[region_column] == 'VIC'][well_id_column].tolist()
qld_wells = ouvrages_data[ouvrages_data[region_column] == 'QLD'][well_id_column].tolist()

# Build, for each region, the list of well CSV files that actually exist in data_dir
data_dir = './data1'
vic_files = [os.path.join(data_dir, f"{well_id}.csv") for well_id in vic_wells if os.path.isfile(os.path.join(data_dir, f"{well_id}.csv"))]
qld_files = [os.path.join(data_dir, f"{well_id}.csv") for well_id in qld_wells if os.path.isfile(os.path.join(data_dir, f"{well_id}.csv"))]
print(vic_files)
print(qld_files)

# Preprocess the data for VIC and QLD
train_features_tensor_vic, train_target_tensor_vic, test_features_tensor_vic, test_target_tensor_vic, target_scaler_vic = preprocess_region_data(vic_files)
train_features_tensor_qld, train_target_tensor_qld, test_features_tensor_qld, test_target_tensor_qld, target_scaler_qld = preprocess_region_data(qld_files)

# Build the DataLoaders for VIC
batch_size = 32
train_dataset_vic = TensorDataset(train_features_tensor_vic, train_target_tensor_vic)
train_loader_vic = DataLoader(train_dataset_vic, batch_size=batch_size, shuffle=True)

test_dataset_vic = TensorDataset(test_features_tensor_vic, test_target_tensor_vic)
test_loader_vic = DataLoader(test_dataset_vic, batch_size=batch_size, shuffle=False)

# Build the DataLoaders for QLD
train_dataset_qld = TensorDataset(train_features_tensor_qld, train_target_tensor_qld)
train_loader_qld = DataLoader(train_dataset_qld, batch_size=batch_size, shuffle=True)

test_dataset_qld = TensorDataset(test_features_tensor_qld, test_target_tensor_qld)
test_loader_qld = DataLoader(test_dataset_qld, batch_size=batch_size, shuffle=False)

# Hyperparameters for the fine-tuning
learning_rate = 0.0001
num_epochs = 50
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

### VIC

In [None]:
# Fine-tuning for VIC
model.load_state_dict(torch.load(model_save_path))  # start from the general model
# Fresh optimizer: Adam keeps running moment estimates, which must not be
# carried over from an earlier fine-tuning run once the weights are reset.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model_save_path_vic = './model/vic_model.pth'
fine_tune_model(train_loader_vic, model, criterion, optimizer, num_epochs, model_save_path_vic)
# Evaluate the fine-tuned model (best checkpoint) on the VIC test set
model.load_state_dict(torch.load(model_save_path_vic))
test_mae_vic, std_error_vic, min_error_vic, max_error_vic = evaluate_model(test_loader_vic, model, target_scaler_vic)
print(f'VIC - Test MAE: {test_mae_vic:.4f}, Std Error: {std_error_vic:.4f}, Min Error: {min_error_vic:.4f}, Max Error: {max_error_vic:.4f}')

### QLD

In [None]:
# Fine-tuning for QLD
model.load_state_dict(torch.load(model_save_path))  # start from the general model
# Fresh optimizer: the shared Adam instance still holds the moment estimates
# accumulated during the VIC fine-tuning, which do not belong to the weights
# that were just reloaded.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model_save_path_qld = './model/qld_model.pth'
fine_tune_model(train_loader_qld, model, criterion, optimizer, num_epochs, model_save_path_qld)

# Evaluate the fine-tuned model (best checkpoint) on the QLD test set
model.load_state_dict(torch.load(model_save_path_qld))
test_mae_qld, std_error_qld, min_error_qld, max_error_qld = evaluate_model(test_loader_qld, model, target_scaler_qld)
print(f'QLD - Test MAE: {test_mae_qld:.4f}, Std Error: {std_error_qld:.4f}, Min Error: {min_error_qld:.4f}, Max Error: {max_error_qld:.4f}')

## Modèles fine-tuned par puits

### Modèle par région fine-tuned par puits

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

class LSTMModel(nn.Module):
    """Stacked LSTM whose last time-step output feeds a linear layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced by the linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: the batch dimension comes first in the input.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the LSTM from a zero state and project its last time step."""
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_hidden = torch.zeros(state_shape, device=x.device)
        initial_cell = torch.zeros(state_shape, device=x.device)
        sequence_output, _ = self.lstm(x, (initial_hidden, initial_cell))
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

# Preprocess the data of a single well
def preprocess_single_well_data(file_path):
    """Load one well's CSV and turn it into scaled sliding-window tensors.

    Rows are sorted by ``date``. The ``P``, ``T``, ``ET`` and ``NDVI`` columns
    are the inputs and ``GWL`` is the target; each group is min-max scaled.
    A sample ending at row ``i`` pairs the input rows ``i-6 .. i-1`` with the
    target rows ``i-5 .. i`` (the target window is shifted one step later).

    The last 12 rows form the held-out tail. Training windows end strictly
    before the first row of that tail, test windows end at or after it, so no
    window is shared between the two sets.

    Args:
        file_path: Path of the well's CSV file.

    Returns:
        ``(train_features, train_target, test_features, test_target,
        target_scaler)`` where the four tensors are float32 and
        ``target_scaler`` is the ``MinMaxScaler`` fitted on ``GWL``. A tuple of
        five ``None`` is returned when the series is too short to produce at
        least one training window.
    """
    data = pd.read_csv(file_path)
    data['date'] = pd.to_datetime(data['date'])
    data.sort_values('date', inplace=True)

    features = data[['P', 'T', 'ET', 'NDVI']]
    target = data['GWL']

    # NOTE(review): both scalers are fitted on the whole series, held-out tail
    # included; fitting on the training rows only would avoid that leakage.
    feature_scaler = MinMaxScaler()
    features_scaled = feature_scaler.fit_transform(features)

    target_scaler = MinMaxScaler()
    target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1))

    sequence_length = 6
    num_data_points = len(features_scaled)
    train_end = num_data_points - 12  # index of the first held-out row

    # Training windows end at rows sequence_length .. train_end - 1, so at
    # least one exists only when train_end > sequence_length.
    if train_end <= sequence_length:
        return None, None, None, None, None

    def build_tensors(window_ends):
        """Stack the feature/target windows ending at each index."""
        feature_windows = np.array(
            [features_scaled[i - sequence_length:i] for i in window_ends]
        )
        target_windows = np.array(
            [target_scaled[i - sequence_length + 1:i + 1] for i in window_ends]
        ).reshape(-1, sequence_length)
        return (
            torch.tensor(feature_windows, dtype=torch.float32),
            torch.tensor(target_windows, dtype=torch.float32),
        )

    # The training range stops before train_end: the window ending at
    # train_end is the first test sample and must not be trained on as well.
    train_features_tensor, train_target_tensor = build_tensors(
        range(sequence_length, train_end)
    )
    test_features_tensor, test_target_tensor = build_tensors(
        range(train_end, num_data_points - sequence_length + 1)
    )

    return train_features_tensor, train_target_tensor, test_features_tensor, test_target_tensor, target_scaler

# Fine-tune the model
def fine_tune_model(train_loader, model, criterion, optimizer, num_epochs):
    """Train ``model`` in place on ``train_loader`` for ``num_epochs`` epochs.

    After each epoch the mean of the per-batch losses is printed. Nothing is
    returned; the model and optimizer are updated in place.

    Args:
        train_loader: Iterable of ``(features, target)`` batches with a length.
        model: Module to optimize.
        criterion: Loss callable applied to ``(output, target)``.
        optimizer: Optimizer bound to the model's parameters.
        num_epochs: Number of passes over ``train_loader``.
    """
    for epoch_index in range(num_epochs):
        model.train()
        cumulative_loss = 0
        for batch_inputs, batch_targets in train_loader:
            # Clear the gradients left by the previous step before the new pass.
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_targets)
            batch_loss.backward()
            optimizer.step()

            cumulative_loss += batch_loss.item()

        mean_epoch_loss = cumulative_loss / len(train_loader)
        print(f'Epoch {epoch_index + 1}, Train Loss: {mean_epoch_loss:.4f}')

# Evaluate the model on the test data
def evaluate_model(test_loader, model, target_scaler):
    """Return the mean absolute error over ``test_loader`` in unscaled units.

    Predictions and targets are mapped back with
    ``target_scaler.inverse_transform`` before the error is measured. The
    error is accumulated element by element across all batches, so a smaller
    final batch does not get the same weight as a full one (averaging the
    per-batch means would over-weight it).

    Args:
        test_loader: Iterable of ``(features, target)`` batches.
        model: Module to evaluate; it is switched to eval mode.
        target_scaler: Object exposing ``inverse_transform`` for the targets.

    Returns:
        The mean absolute error as a Python float.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no element.
    """
    model.eval()
    total_abs_error = 0.0
    total_elements = 0
    with torch.no_grad():
        for features, target in test_loader:
            output = target_scaler.inverse_transform(model(features).numpy())
            target = target_scaler.inverse_transform(target.numpy())

            abs_error = np.abs(output - target)
            total_abs_error += float(abs_error.sum())
            total_elements += abs_error.size

    return total_abs_error / total_elements

# Main script: paths, pre-trained regional models, per-region well files, hyperparameters
data_dir = './data1'
model_save_path_vic = './model/vic_model.pth'
model_save_path_qld = './model/qld_model.pth'

# Architecture shared by the pre-trained regional models
input_size = 4  # number of input features (P, T, ET, NDVI)
hidden_size = 50
num_layers = 2
output_size = 6  # 6-month prediction

# Rebuild one network per region and restore its saved weights
model_vic = LSTMModel(input_size, hidden_size, num_layers, output_size)
model_vic.load_state_dict(torch.load(model_save_path_vic))

model_qld = LSTMModel(input_size, hidden_size, num_layers, output_size)
model_qld.load_state_dict(torch.load(model_save_path_qld))

# OUVRAGES.csv provides the well identifiers together with their region
ouvrages_file_path = './OUVRAGES.csv'
ouvrages_data = pd.read_csv(ouvrages_file_path)

well_id_column = 'Ouvrage'
region_column = 'Region'

# Select the well identifiers of each region
region_of_well = ouvrages_data[region_column]
vic_wells = ouvrages_data.loc[region_of_well == 'VIC', well_id_column].tolist()
qld_wells = ouvrages_data.loc[region_of_well == 'QLD', well_id_column].tolist()


def _existing_csv_paths(well_ids):
    """Return the ``<well_id>.csv`` path in data_dir of every well whose file exists."""
    candidate_paths = [os.path.join(data_dir, f"{well_id}.csv") for well_id in well_ids]
    return [path for path in candidate_paths if os.path.isfile(path)]


vic_files = _existing_csv_paths(vic_wells)
qld_files = _existing_csv_paths(qld_wells)

# Hyperparameters for the per-well fine-tuning
num_epochs = 50
learning_rate = 0.001
batch_size = 32

#### VIC

In [None]:
# File in which the VIC per-well results are saved
results_file_path = './results/fine_tuning_results_vic.txt'
# exist_ok=True replaces the check-then-create pair and is safe on re-runs.
os.makedirs(os.path.dirname(results_file_path), exist_ok=True)

# Fine-tune a copy of the VIC regional model on every VIC well
mae_list_vic = []
criterion = nn.L1Loss()
with open(results_file_path, 'w', encoding='utf-8') as f:
    for file_path in vic_files:
        preprocessed_data = preprocess_single_well_data(file_path)
        if preprocessed_data[0] is None:
            # The preprocessing returned no tensors for this well: skip it.
            continue

        train_features_tensor, train_target_tensor, test_features_tensor, test_target_tensor, target_scaler = preprocessed_data

        # Build the DataLoaders
        train_dataset = TensorDataset(train_features_tensor, train_target_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        test_dataset = TensorDataset(test_features_tensor, test_target_tensor)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # Each well starts from its own copy of the VIC regional weights
        model = LSTMModel(input_size, hidden_size, num_layers, output_size)
        model.load_state_dict(model_vic.state_dict())

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # The files were built as "<well_id>.csv": strip only that extension so
        # identifiers that themselves contain a dot are kept whole.
        well_id = os.path.splitext(os.path.basename(file_path))[0]

        print(f'Training model for well: {well_id} in VIC')
        fine_tune_model(train_loader, model, criterion, optimizer, num_epochs)

        # Evaluate the fine-tuned model on this well
        test_mae = evaluate_model(test_loader, model, target_scaler)
        mae_list_vic.append(test_mae)
        print(f'VIC Well {well_id} - Test MAE: {test_mae:.4f}')

        # Save the result
        f.write(f'VIC Well {well_id} - Test MAE: {test_mae:.4f}\n')

    # Average MAE over the VIC wells; NaN (without numpy's empty-mean warning)
    # when no well could be processed.
    avg_mae_vic = np.mean(mae_list_vic) if mae_list_vic else float('nan')
    print(f'Average MAE for VIC: {avg_mae_vic:.4f}')
    f.write(f'Average MAE for VIC: {avg_mae_vic:.4f}\n\n')


#### QLD

In [None]:
# File in which the QLD per-well results are saved
results_file_path = './results/fine_tuning_results_qld.txt'
# exist_ok=True replaces the check-then-create pair and is safe on re-runs.
os.makedirs(os.path.dirname(results_file_path), exist_ok=True)

# Fine-tune a copy of the QLD regional model on every QLD well
mae_list_qld = []
criterion = nn.L1Loss()
# Open in 'w' mode like the other result files: this file is QLD-only, and
# appending would stack the results of earlier runs on top of each other.
with open(results_file_path, 'w', encoding='utf-8') as f:
    for file_path in qld_files:
        preprocessed_data = preprocess_single_well_data(file_path)
        if preprocessed_data[0] is None:
            # The preprocessing returned no tensors for this well: skip it.
            continue

        train_features_tensor, train_target_tensor, test_features_tensor, test_target_tensor, target_scaler = preprocessed_data

        # Build the DataLoaders
        train_dataset = TensorDataset(train_features_tensor, train_target_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        test_dataset = TensorDataset(test_features_tensor, test_target_tensor)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # Each well starts from its own copy of the QLD regional weights
        model = LSTMModel(input_size, hidden_size, num_layers, output_size)
        model.load_state_dict(model_qld.state_dict())

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # The files were built as "<well_id>.csv": strip only that extension so
        # identifiers that themselves contain a dot are kept whole.
        well_id = os.path.splitext(os.path.basename(file_path))[0]

        print(f'Training model for well: {well_id} in QLD')
        fine_tune_model(train_loader, model, criterion, optimizer, num_epochs)

        # Evaluate the fine-tuned model on this well
        test_mae = evaluate_model(test_loader, model, target_scaler)
        mae_list_qld.append(test_mae)
        print(f'QLD Well {well_id} - Test MAE: {test_mae:.4f}')

        # Save the result
        f.write(f'QLD Well {well_id} - Test MAE: {test_mae:.4f}\n')

    # Average MAE over the QLD wells; NaN (without numpy's empty-mean warning)
    # when no well could be processed.
    avg_mae_qld = np.mean(mae_list_qld) if mae_list_qld else float('nan')
    print(f'Average MAE for QLD: {avg_mae_qld:.4f}')
    f.write(f'Average MAE for QLD: {avg_mae_qld:.4f}\n\n')

print(f'All results saved to {results_file_path}')


### Modèle général fine-tuned par puits

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

# Preprocess the data of a single well
def preprocess_single_well_data(file_path):
    """Load one well's CSV and build scaled forecasting windows.

    Rows are sorted by ``date``. The ``P``, ``T``, ``ET`` and ``NDVI`` columns
    are the inputs and ``GWL`` is the target; each group is min-max scaled.
    A sample starting its forecast at row ``i`` pairs the 6 input rows
    ``i-6 .. i-1`` with the 6 target rows ``i .. i+5``.

    The last 12 rows form the held-out tail. Training samples only use target
    rows before that tail; test samples only use target rows inside it, so
    the two sets share neither a sample nor a target value.

    Args:
        file_path: Path of the well's CSV file.

    Returns:
        ``(train_features, train_target, test_features, test_target,
        target_scaler)`` where the four tensors are float32 and
        ``target_scaler`` is the ``MinMaxScaler`` fitted on ``GWL``. A tuple of
        five ``None`` is returned when the series is too short to produce at
        least one training sample.
    """
    data = pd.read_csv(file_path)
    data['date'] = pd.to_datetime(data['date'])
    data.sort_values('date', inplace=True)

    features = data[['P', 'T', 'ET', 'NDVI']]
    target = data['GWL']

    # NOTE(review): both scalers are fitted on the whole series, held-out tail
    # included; fitting on the training rows only would avoid that leakage.
    feature_scaler = MinMaxScaler()
    features_scaled = feature_scaler.fit_transform(features)

    target_scaler = MinMaxScaler()
    target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1))

    sequence_length = 6  # input rows per sample
    horizon = 6  # target rows per sample
    num_data_points = len(features_scaled)
    train_end = num_data_points - 12  # index of the first held-out row

    # Last forecast start whose whole target window stays before train_end.
    last_train_start = train_end - horizon
    if last_train_start < sequence_length:
        return None, None, None, None, None

    def build_tensors(forecast_starts):
        """Stack the input/target windows for each forecast start index."""
        feature_windows = np.array(
            [features_scaled[i - sequence_length:i] for i in forecast_starts]
        )
        target_windows = np.array(
            [target_scaled[i:i + horizon] for i in forecast_starts]
        ).reshape(-1, horizon)
        return (
            torch.tensor(feature_windows, dtype=torch.float32),
            torch.tensor(target_windows, dtype=torch.float32),
        )

    train_features_tensor, train_target_tensor = build_tensors(
        range(sequence_length, last_train_start + 1)
    )
    # Test forecasts start inside the tail and must fit a full target window.
    test_features_tensor, test_target_tensor = build_tensors(
        range(train_end, num_data_points - horizon + 1)
    )

    return train_features_tensor, train_target_tensor, test_features_tensor, test_target_tensor, target_scaler

class LSTMModel(nn.Module):
    """LSTM stack followed by a linear layer applied to the final time step."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        """Create the LSTM stack and the linear output layer."""
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def _zero_state(self, x):
        """Return an all-zero state tensor for the batch in ``x``, on its device."""
        batch_size = x.size(0)
        return torch.zeros(self.num_layers, batch_size, self.hidden_size, device=x.device)

    def forward(self, x):
        """Return the linear projection of the LSTM output at the last time step."""
        initial_state = (self._zero_state(x), self._zero_state(x))
        lstm_out, _ = self.lstm(x, initial_state)
        final_step = lstm_out[:, -1, :]
        return self.fc(final_step)

# Train the model
def train_model(train_loader, model, criterion, optimizer, num_epochs):
    """Train ``model`` in place and return the mean training loss of each epoch.

    Args:
        train_loader: Iterable of ``(features, target)`` batches with a length.
        model: Module to optimize.
        criterion: Loss callable applied to ``(output, target)``.
        optimizer: Optimizer bound to the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one entry per epoch: the mean of that epoch's batch losses.
    """
    def optimization_step(batch_features, batch_target):
        """Run one forward/backward/update step and return the batch loss."""
        batch_loss = criterion(model(batch_features), batch_target)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    epoch_losses = []
    for epoch_number in range(1, num_epochs + 1):
        model.train()
        running_loss = 0
        for batch_features, batch_target in train_loader:
            running_loss += optimization_step(batch_features, batch_target)

        mean_loss = running_loss / len(train_loader)
        epoch_losses.append(mean_loss)
        print(f'Epoch {epoch_number}, Train Loss: {mean_loss:.4f}')

    return epoch_losses

# Evaluate the model
def evaluate_model(test_loader, model, target_scaler):
    """Compute the test mean absolute error in the targets' original scale.

    Each batch's predictions and targets go through
    ``target_scaler.inverse_transform`` before being compared. Absolute errors
    are summed over every element of every batch and divided by the element
    count; averaging per-batch means instead would give a smaller last batch
    the same weight as a full one.

    Args:
        test_loader: Iterable of ``(features, target)`` batches.
        model: Module to evaluate; it is switched to eval mode.
        target_scaler: Object exposing ``inverse_transform`` for the targets.

    Returns:
        The mean absolute error as a Python float.

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no element.
    """
    model.eval()
    abs_error_sum = 0.0
    element_count = 0
    with torch.no_grad():
        for features, target in test_loader:
            predicted = target_scaler.inverse_transform(model(features).numpy())
            expected = target_scaler.inverse_transform(target.numpy())

            batch_abs_error = np.abs(predicted - expected)
            abs_error_sum += float(batch_abs_error.sum())
            element_count += batch_abs_error.size

    return abs_error_sum / element_count


data_dir = './data1'
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the results file) reproducible between runs.
files = [os.path.join(data_dir, f) for f in sorted(os.listdir(data_dir)) if f.endswith('.csv')]

# Load the pre-trained general model
model_save_path_general = './model/best_model.pth'
input_size = 4
hidden_size = 50
num_layers = 2
output_size = 6

model_general = LSTMModel(input_size, hidden_size, num_layers, output_size)
model_general.load_state_dict(torch.load(model_save_path_general))

# Hyperparameters
num_epochs = 50
learning_rate = 0.001
batch_size = 32

# File in which the per-well results are saved
results_file_path = './results/fine_tuning_results.txt'
# exist_ok=True replaces the check-then-create pair and is safe on re-runs.
os.makedirs(os.path.dirname(results_file_path), exist_ok=True)

mae_list = []
criterion = nn.L1Loss()

with open(results_file_path, 'w', encoding='utf-8') as f:
    for file_path in files:
        preprocessed_data = preprocess_single_well_data(file_path)
        if preprocessed_data[0] is None:
            # The preprocessing returned no tensors for this well: skip it.
            continue

        train_features_tensor, train_target_tensor, test_features_tensor, test_target_tensor, target_scaler = preprocessed_data

        train_dataset = TensorDataset(train_features_tensor, train_target_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        test_dataset = TensorDataset(test_features_tensor, test_target_tensor)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # Each well starts from its own copy of the general model's weights
        model = LSTMModel(input_size, hidden_size, num_layers, output_size)
        model.load_state_dict(model_general.state_dict())

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Strip only the ".csv" extension so identifiers that themselves
        # contain a dot are kept whole.
        well_id = os.path.splitext(os.path.basename(file_path))[0]

        print(f'Training model for well: {well_id}')
        train_losses = train_model(train_loader, model, criterion, optimizer, num_epochs)

        test_mae = evaluate_model(test_loader, model, target_scaler)
        mae_list.append(test_mae)
        print(f'Well {well_id} - Test MAE: {test_mae:.4f}')

        f.write(f'Well {well_id} - Test MAE: {test_mae:.4f}\n')
        f.write(f'Train Losses: {train_losses}\n\n')

    # Average MAE over all wells; NaN (without numpy's empty-mean warning)
    # when no well could be processed.
    avg_mae = np.mean(mae_list) if mae_list else float('nan')
    print(f'Average MAE: {avg_mae:.4f}')
    f.write(f'Average MAE: {avg_mae:.4f}\n')

print(f'All results saved to {results_file_path}')
