In [21]:
## Chargement et exploration initiale de la dataset.

import pandas as pd
import numpy as np 
import os

# Get a list of all .csv files in the current directory
csv_files = [file for file in os.listdir('.') if file.endswith('.csv')]

# Create a dictionary to store DataFrames with corresponding names for each .csv file
dfs = {}

# Read each .csv file, rename the DataFrame, and store it in the dictionary
for file_name in csv_files:
    df_name = file_name.replace('.csv', '')  # Extract DataFrame name from the file name
    dfs[df_name] = pd.read_csv(file_name)  # Create DataFrame with the extracted name

# Perform the merge based on a common key (e.g., 'common_column')
# Replace 'common_column' with the actual column name that is common across all DataFrames
merged_df = dfs['01_occ']  # Initialize merged_df with one of the DataFrames
for df_name, df in dfs.items():
    if df_name != '01_occ':  # Skip the first DataFrame since it's already stored in merged_df
        merged_df = pd.merge(merged_df, df, on='timestamp [dd/mm/yyyy HH:MM]',how='outer' )
        # merged_df.fillna(method='ffill', inplace=True)
        # Forward-fill missing temperature values
        merged_df.fillna(method='ffill', inplace=True)


# Now you have a merged DataFrame named 'merged_df' containing data from all .csv files
nan_df = merged_df.isna()

# If there are any NaN values, the nan_df DataFrame will contain True in those positions.
# You can check if there are any NaN values in the entire DataFrame by using the any() method.
if nan_df.any().any():
    print("The DataFrame contains NaN values.")
else:
    print("The DataFrame does not contain NaN values.")


merged_df

The DataFrame does not contain NaN values.


Unnamed: 0,timestamp [dd/mm/yyyy HH:MM],ki [0:vacant 1:occupied],o1_1 [0:vacant 1:occupied],o1_2 [0:vacant 1:occupied],o1_3 [0:vacant 1:occupied],o1_4 [0:vacant 1:occupied],o1_5 [0:vacant 1:occupied],o2 [0:vacant 1:occupied],o3 [0:vacant 1:occupied],o4 [0:vacant 1:occupied],...,o1_2 [%],o2 [%],o3 [%],o4 [%],mr [%],gh [W/m2],tempOut [C],rh [%],wind speed [m/s],Wind direction [Degree] [North:0 East:90 South:180 West:270]
0,01/01/2013 00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.1,20.8,25.1,22.4,19.6,2.0,1.6,97.0,0.45,37.0
1,01/01/2013 00:15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.1,20.8,25.1,22.4,19.6,2.0,1.6,97.0,0.45,37.0
2,01/01/2013 00:30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.1,20.8,25.1,22.4,19.6,2.0,1.6,97.0,0.45,37.0
3,01/01/2013 00:45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.1,20.8,25.1,22.4,19.8,2.0,1.6,97.0,0.45,37.0
4,01/01/2013 01:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,23.3,20.8,25.1,22.4,20.1,2.0,1.2,99.0,0.82,41.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,31/12/2013 22:45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28.6,27.1,29.6,29.3,28.2,1.0,3.4,80.0,3.32,118.0
35036,31/12/2013 23:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28.2,27.1,29.7,29.2,28.2,1.0,3.5,79.0,3.69,124.0
35037,31/12/2013 23:15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28.2,27.1,29.4,29.0,27.9,1.0,3.5,79.0,3.69,124.0
35038,31/12/2013 23:30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28.7,27.1,29.0,29.0,27.8,1.0,3.5,79.0,3.69,124.0


In [69]:
merged_df.columns

Index(['timestamp [dd/mm/yyyy HH:MM]', 'ki [0:vacant 1:occupied]',
       'o1_1 [0:vacant 1:occupied]', 'o1_2 [0:vacant 1:occupied]',
       'o1_3 [0:vacant 1:occupied]', 'o1_4 [0:vacant 1:occupied]',
       'o1_5 [0:vacant 1:occupied]', 'o2 [0:vacant 1:occupied]',
       'o3 [0:vacant 1:occupied]', 'o4 [0:vacant 1:occupied]',
       'ki  [1:closed 0:open]', 'o1_1 [1:closed 0:open]',
       'o1_2 [1:closed 0:open]', 'o1_3 [1:closed 0:open]',
       'o1_4 [1:closed 0:open]', 'o2_1 [1:closed 0:open]',
       'o2_2 [1:closed 0:open]', 'o3_1 [1:closed 0:open]',
       'o3_2 [1:closed 0:open]', 'o3_3 [1:closed 0:open]',
       'o3_4 [1:closed 0:open]', 'o4_1 [1:closed 0:open]',
       'o4_2 [1:closed 0:open]', 'mr_1 [1:closed 0:open]',
       'mr_2 [1:closed 0:open]', 'mr_3 [1:closed 0:open]',
       'mr_4 [1:closed 0:open]', 'mr_5 [1:closed 0:open]',
       'mr_6 [1:closed 0:open]', 'ki [0:off 1:on]', 'o1_1 [0:off 1:on]',
       'o1_2 [0:off 1:on]', 'o2 [0:off 1:on]', 'o3_1 [0:off 1:on]',


In [12]:
merged_df.filter(like='o4', axis=1).columns

Index(['o4 [0:vacant 1:occupied]', 'o4_1 [1:closed 0:open]',
       'o4_2 [1:closed 0:open]', 'o4_1 [0:off 1:on]', 'o4_2 [0:off 1:on]',
       'o4 [W]', 'o4 [C]', 'o4 [%]'],
      dtype='object')

In [28]:
#Data 

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from keras.models import Sequential

# Charger les données (assurez-vous d'avoir vos données préparées sous forme de DataFrame)
# features = ['ki [%]', 'ki [C]', 'gh [W/m2]', 'ki  [1:closed 0:open]', 'tempOut [C]', 'rh [%]','ki [0:off 1:on]', 'wind speed [m/s]']
features = ['o4_1 [1:closed 0:open]','o4_1 [0:off 1:on]', 'o4 [W]', 'o4 [C]', 'o4 [%]', 'gh [W/m2]', 'tempOut [C]', 'rh [%]','ki [0:off 1:on]', 'wind speed [m/s]']
target = ["o4 [0:vacant 1:occupied]"]

merged_df = merged_df[['timestamp [dd/mm/yyyy HH:MM]','o4 [0:vacant 1:occupied]']]
# Assuming your 'timestamp [dd/mm/yyyy HH:MM]' column is in datetime format
merged_df['timestamp'] = pd.to_datetime(merged_df['timestamp [dd/mm/yyyy HH:MM]'])

print(merged_df)
# # Set the timestamp as the DataFrame index
merged_df.set_index('timestamp', inplace=True)
# # Creates time series features from datetime index
# def create_features(df, label=None):
#     # Creates time series features from datetime index
#     df['hour'] = df.index.hour
#     df['dayofweek'] = df.index.dayofweek
#     df['quarter'] = df.index.quarter
#     df['month'] = df.index.month
#     df['year'] = df.index.year
#     df['dayofyear'] = df.index.dayofyear
#     df['dayofmonth'] = df.index.day
#     df['weekofyear'] = df.index.isocalendar().week
    
#     X = df[features]
   

#     if label:
#         y = df[label]        
#         return X, y
#     return X

# Filter data based on a specific split date for training & test : 
# we will take last three month for Testing, 
# 9 month for Training, 3 for testing 

split_date = '2013-10-10'
train_data = merged_df.loc[merged_df.index <= split_date].copy()
test_data = merged_df.loc[merged_df.index > split_date].copy()



                    timestamp [dd/mm/yyyy HH:MM]  o4 [0:vacant 1:occupied]  \
timestamp                                                                    
2013-01-01 00:00:00             01/01/2013 00:00                       0.0   
2013-01-01 00:15:00             01/01/2013 00:15                       0.0   
2013-01-01 00:30:00             01/01/2013 00:30                       0.0   
2013-01-01 00:45:00             01/01/2013 00:45                       0.0   
2013-01-01 01:00:00             01/01/2013 01:00                       0.0   
...                                          ...                       ...   
2013-12-31 22:45:00             31/12/2013 22:45                       0.0   
2013-12-31 23:00:00             31/12/2013 23:00                       0.0   
2013-12-31 23:15:00             31/12/2013 23:15                       0.0   
2013-12-31 23:30:00             31/12/2013 23:30                       0.0   
2013-12-31 23:45:00             31/12/2013 23:45                

In [29]:
#Train & Test : 

# X_train, y_train = create_features(train_data, label=target)
# X_test, y_test = create_features(test_data, label=target) 

# Initialize the StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = MinMaxScaler()
train_data['o4 [0:vacant 1:occupied]'] = scaler.fit_transform(train_data['o4 [0:vacant 1:occupied]'])
test_data['o4 [0:vacant 1:occupied]'] = scaler.fit_transform(test_data['o4 [0:vacant 1:occupied]'])


# # Fonction pour créer des séquences temporelles
def create_sequences(data, sequence_length):
    sequences = []
    for i in range(len(data) - sequence_length):
        seq = data.iloc[i:i+sequence_length]
        sequences.append(seq.values)
    return np.array(sequences)



X, y = train_data['o4 [0:vacant 1:occupied]'] , train_data['o4 [0:vacant 1:occupied]'] 
sequence_length = 10 # Longueur de la séquence temporelle
X_train_seq = create_sequences(train_data, sequence_length)
X_test_seq = create_sequences(test_data, sequence_length)

# X_train_standardized_seq = create_sequences(X_train_standardized, sequence_length)
# X_test_standardized_seq = create_sequences(X_test_standardized, sequence_length)

# y_train_seq = create_sequences(y_train, sequence_length)
# y_test_seq = create_sequences(y_test, sequence_length)

ValueError: Expected 2D array, got 1D array instead:
array=[0. 0. 0. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# Normalisation

# transform of data

scaler = MinMaxScaler()
X_train_seq_Norm = scaler.fit_transform(X_train_seq)
X_test_seq_Norm = scaler.transform(X_test_seq)

In [19]:
# Model: 
# Build the LSTM autoencoder model

model = Sequential()

model.add(LSTM(64, activation='relu', input_shape=(sequence_length, X_train_seq.shape[2]), return_sequences=True))
model.add(LSTM(32, activation='relu', return_sequences=False))

model.add(RepeatVector(sequence_length))

model.add(LSTM(32, activation='relu', return_sequences=True))
model.add(LSTM(64, activation='relu', return_sequences=True))

model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 10, 64)            19200     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 repeat_vector (RepeatVecto  (None, 10, 32)            0         
 r)                                                              
                                                                 
 lstm_2 (LSTM)               (None, 10, 32)            8320      
                                                                 
 lstm_3 (LSTM)               (None, 10, 64)            24832     
                                                                 
 time_distributed (TimeDist  (None, 10, 1)             65        
 ributed)                                               

In [70]:
# Training the model
# history = model.fit(X_train_seq, y_train, epochs=10, batch_size=32, validation_data=(X_test_seq, y_test))
# X_train_seq = X_train_seq.astype(np.float32)
# y_train = y_train.astype(np.float32)
# X_test_seq = X_test_seq.astype(np.float32)
# y_test = y_test.astype(np.float32)

# Training the model
history = model.fit(X_train_seq, X_train_seq, epochs=10, batch_size=32, validation_data=(X_test_seq, X_test_seq))


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
# Prediction
y_pred_seq = model.predict(X_test_seq)

# Reshape the predictions to match the original shape
y_pred = y_pred_seq.reshape(-1, 1)

# Rescale the predictions to the original scale
# predictions = scaler.inverse_transform(predictions.reshape(-1, 1))

In [None]:
# Debug
print('X_test_seq', X_test_seq.shape)
print('y_test_seq', y_test_seq.shape)

print(sequence_length, X_train_seq.shape[2])


X_test_seq (7957, 10, 16)
y_test_seq (7957, 10)
10 16


In [None]:
# Evaluate the model
# from sklearn.metrics import mean_squared_error


mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error (MSE): {mse}')


# mse = mean_squared_error(y_test, y_pred)


In [None]:
# Performance 