In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,mean_absolute_percentage_error
import matplotlib.pyplot as plt

import warnings

# Suppress FutureWarning messages emitted after this point so they do not clutter the cell outputs.
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
# Load the data
data_dir = "/kaggle/input/co2-clean/"  # Change this to your own path

# Records table: read it and preview five random rows.
records_path = data_dir + "co2_records_models.csv"
df = pd.read_csv(records_path)
display(df.sample(n=5))

# Additional table: read it and preview five random rows.
additional_path = data_dir + "co2_additional.csv"
extra_df = pd.read_csv(additional_path)
display(extra_df.sample(n=5))

Unnamed: 0,date_time,location,sensor_id,temp,hum,co2,occupied,Fri,Mon,Sat,Sun,Thu,Tue,Wed,sin_d_record,cos_d_record
13626,2021-06-06 05:10:00+02:00,Albea,CO2_03,0.368327,0.592221,0.063389,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.988148,0.60822
73689,2021-06-09 14:45:00+02:00,Albea,CO2_02,0.387134,0.774058,0.099526,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.170327,0.12408
67779,2021-05-28 01:00:00+02:00,SantMiquel,CO2_05,0.294325,0.609463,0.075237,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.62941,0.982963
37907,2021-05-07 21:15:00+02:00,SantMiquel,CO2_05,0.286148,0.561814,0.154028,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170327,0.87592
7674,2021-06-07 08:00:00+02:00,Albea,CO2_02,0.304138,0.705092,0.091232,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.933013,0.25


Unnamed: 0,sensor_id,location,grade,students,square_metres,windows,orientation,location_id,orientation_id,grade_scaled,students_scaled,square_metres_scaled,windows_scaled
10,CO2_05,Albea,3,24,44.56,2,Playground,1,0,0.4,0.909091,0.262511,0.0
4,CO2_05,SantMiquel,5,18,43.37,4,Street,0,1,0.8,0.363636,0.22685,0.5
1,CO2_02,SantMiquel,6,18,43.37,4,Street,0,1,1.0,0.363636,0.22685,0.5
5,CO2_06,SantMiquel,3,17,43.37,4,Street,0,1,0.4,0.272727,0.22685,0.5
6,CO2_01,Albea,1,16,35.8,2,Playground,1,0,0.0,0.181818,0.0,0.0


In [3]:
# Parse the date_time strings into pandas datetime values.
df = df.assign(date_time=pd.to_datetime(df['date_time']))

In [4]:
# Variable selection for extra_df: keep the (sensor_id, location) keys and the
# encoded / scaled columns, dropping the raw ones.
extra_df = extra_df.loc[
    :,
    [
        "sensor_id",
        "location",
        "location_id",
        "orientation_id",
        "grade_scaled",
        "students_scaled",
        "square_metres_scaled",
        "windows_scaled",
    ],
]
display(extra_df.sample(n=5))

Unnamed: 0,sensor_id,location,location_id,orientation_id,grade_scaled,students_scaled,square_metres_scaled,windows_scaled
8,CO2_03,Albea,1,0,0.6,0.181818,0.934972,0.5
10,CO2_05,Albea,1,0,0.4,0.909091,0.262511,0.0
3,CO2_04,SantMiquel,0,0,0.0,0.272727,0.413845,1.0
2,CO2_03,SantMiquel,0,1,0.6,0.090909,0.22685,0.5
1,CO2_02,SantMiquel,0,1,1.0,0.363636,0.22685,0.5


# Preparació de les dades

In [5]:
def rnn_window_data(group, window_size=6):
    """Turn one time-ordered group of records into sliding-window samples.

    For every position ``i`` the rows ``i .. i + window_size - 1`` (without the
    identification columns) form one input block, and the ``co2`` value of row
    ``i + window_size`` is its target. A sample is kept only when neither the
    block nor the target contains a missing value.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single series, already sorted in the desired order. Must
        contain the columns ``date_time``, ``location``, ``sensor_id``,
        ``percent_pos`` and ``co2``.
    window_size : int, default 6
        Number of consecutive rows in each input block.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_samples, window_size, n_features)``.
    y : numpy.ndarray
        Shape ``(n_samples,)``; the ``co2`` target of each sample.
    metadata : pandas.DataFrame
        One row per sample with the ``date_time``, ``location``, ``sensor_id``
        and ``percent_pos`` of the target row (object dtype, indexed by the
        target row's label in ``group``).

    When no valid window exists (group too short, or every window has a
    missing value) empty, correctly shaped outputs are returned instead of
    raising.
    """
    meta_cols = ["date_time", "location", "sensor_id", "percent_pos"]

    # Hoisted out of the loop: the feature matrix and the per-row NaN flags
    # do not depend on the window position.
    features = group.drop(columns=meta_cols)
    values = features.to_numpy()
    row_has_nan = features.isna().any(axis=1).to_numpy()
    targets = group["co2"].to_numpy()

    X = []
    y = []
    target_positions = []

    for start in range(len(group) - window_size):
        stop = start + window_size
        # Relax this check if windows containing nulls should be let through.
        if row_has_nan[start:stop].any() or pd.isna(targets[stop]):
            continue
        X.append(values[start:stop])
        y.append(targets[stop])
        target_positions.append(stop)

    if not X:
        # Nothing to stack: return empty outputs that still concatenate
        # cleanly with the results of other groups.
        return (
            np.empty((0, window_size, values.shape[1])),
            np.empty((0,)),
            pd.DataFrame(columns=meta_cols),
        )

    # Object dtype matches what stacking the individual row Series produced.
    metadata = group.iloc[target_positions][meta_cols].astype(object)
    return np.array(X), np.array(y), metadata

In [6]:
# Generate the data in the format expected by the RNN, using the chosen window size
finestra = 6

grouped = df.groupby(['location', 'sensor_id'])

X_dynamic = []
y = []

metadata = []

for _, group in grouped:
    # Each (location, sensor_id) series is windowed in chronological order.
    group = group.sort_values(by='date_time').reset_index(drop=True)

    # Relative position of each record within its own series (0 = first,
    # 1 = last). The max() guard avoids a division by zero for a one-row series.
    group["percent_pos"] = group.index / max(group.shape[0] - 1, 1)

    X_group, y_group, metadata_group = rnn_window_data(group, finestra)

    X_dynamic.append(X_group)
    y.append(y_group)
    metadata.append(metadata_group)

# Order all samples by their relative position within their series.
metadata = pd.concat(metadata, ignore_index=True).sort_values(by='percent_pos')

# concat(ignore_index=True) produced a positional index, so after sorting it is
# exactly the permutation to apply to the arrays to keep them aligned.
order = metadata.index.to_numpy()

X_dynamic = np.concatenate(X_dynamic, axis=0)[order]

y = np.concatenate(y, axis=0)[order]

# reset_index returns a new frame; the result must be assigned to take effect.
metadata = metadata.reset_index(drop=True)
metadata["date_time"] = pd.to_datetime(metadata["date_time"])

In [7]:
# Static dataset
def get_static_data(df, static_df=None):
    """Build the static feature matrix for the samples described by ``df``.

    Each row of ``df`` is matched to its static attributes through the
    ``location`` and ``sensor_id`` keys. The output has exactly one row per
    row of ``df``, in the same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata frame with the columns ``date_time``, ``location``,
        ``sensor_id`` and ``percent_pos``.
    static_df : pandas.DataFrame, optional
        Table with ``location``, ``sensor_id`` and the static feature columns.
        Defaults to the notebook-level ``extra_df``.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(df), n_static_features)``.

    Raises
    ------
    ValueError
        If some ``(location, sensor_id)`` pair of ``df`` is missing from
        ``static_df``.
    pandas.errors.MergeError
        If ``static_df`` has more than one row for the same pair.
    """
    if static_df is None:
        static_df = extra_df

    # A left merge keeps every input row in its original order; validate and
    # indicator turn silent row duplication / loss into explicit errors, since
    # either would break the row alignment with the dynamic data.
    merged_df = pd.merge(
        df,
        static_df,
        on=['location', 'sensor_id'],
        how='left',
        validate='many_to_one',
        indicator=True,
    )

    unmatched = merged_df['_merge'] != 'both'
    if unmatched.any():
        missing_keys = (
            merged_df.loc[unmatched, ['location', 'sensor_id']]
            .drop_duplicates()
            .values
            .tolist()
        )
        raise ValueError(
            f"No static data for (location, sensor_id) pairs: {missing_keys}"
        )

    data_df = merged_df.drop(
        columns=['date_time', 'location', 'sensor_id', 'percent_pos', '_merge']
    )
    data_array = data_df.to_numpy()
    return data_array

# Static feature matrix built from the per-sample metadata frame.
X_static = get_static_data(metadata)

In [8]:
# Append the static features to every sample as one extra "time step" so that
# dynamic and static data travel in a single array.
X_static_original = X_static.copy()

n_step_features = X_dynamic.shape[2]         # width of every dynamic time step
n_static_features = X_static_original.shape[1]

# The extra row must be as wide as a dynamic time step; unused slots hold -1.
# The dtype is forced to float: np.full with an integer fill value would create
# an integer array and truncate the scaled static features on assignment.
X_static = np.full((len(X_static_original), n_step_features), -1, dtype=np.float64)
X_static[:, :n_static_features] = X_static_original

# (n_samples, n_features) -> (n_samples, 1, n_features), then stack along the time axis.
X_static = np.expand_dims(X_static, axis=1)
X = np.concatenate((X_dynamic, X_static), axis=1)

In [9]:
# Persist the model inputs, the targets and the per-sample metadata.
for filename, array in (('X.npy', X), ('y.npy', y)):
    np.save(filename, array)
metadata.to_csv('metadata.csv', index=False)