# Preprocesamiento de los datos

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
#from scipy import stats as stats
import seaborn as sns
from datetime import datetime, timedelta
import os

## Lectura y unión de los archivos *.parquet

In [None]:
# Root folder of the raw readings; each sub-folder is scanned for *.parquet files.
# NOTE(review): the folder name is spelled 'Eneero' -- confirm it matches the directory on disk.
path = '../data/raw/Lecturas_Eneero_2025'
content = os.listdir(path)
files = []      # name of every *.parquet file found
no_empty = []   # names of the *.parquet files that could be read
frames = []     # one DataFrame per readable file, concatenated once after the loop

# Column layout used for the result when no file could be read at all
expected_columns = ['ReadId', 'TimeSpan', 'SensorId', 'Value', 'LocalTimeSpan']

# Sorted so the row order of `df` does not depend on the order os.listdir happens to return
for folder in sorted(content):
    path_files = os.path.join(path, folder)
    # Only directories can be listed; a stray file at this level would make os.listdir raise
    if not os.path.isdir(path_files):
        continue
    for file in sorted(os.listdir(path_files)):
        if not file.endswith('parquet'):
            continue
        files.append(file)
        try:
            df_1 = pd.read_parquet(os.path.join(path_files, file))
        except OSError:
            # Best effort: unreadable files (presumably the empty ones, going by the
            # `no_empty` name) are skipped and only show up in the counts printed below
            continue
        frames.append(df_1)
        no_empty.append(file)

# Concatenate once instead of growing `df` inside the loop (which copies all rows each time),
# and keep an empty placeholder frame out of the concat so it cannot influence column dtypes
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=expected_columns)
print(f"Total files *.parquet found: {len(files)}, Total non-empty files: {len(no_empty)}")

## Análisis exploratorio de los datos
### Lecturas

In [None]:
# Summarize the DataFrame: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165411 entries, 0 to 165410
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   ReadId         165411 non-null  object        
 1   TimeSpan       165411 non-null  datetime64[ns]
 2   SensorId       165411 non-null  object        
 3   Value          165411 non-null  float64       
 4   LocalTimeSpan  165411 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(1), object(2)
memory usage: 6.3+ MB


In [88]:
# Count the missing values in each column
null_counts = df.isnull().sum()
print(null_counts)

ReadId           0
TimeSpan         0
SensorId         0
Value            0
LocalTimeSpan    0
dtype: int64


No hay valores ausentes y las columnas tienen el formato adecuado.

In [90]:
# Preview the first five rows of the readings
df.iloc[:5]

Unnamed: 0,ReadId,TimeSpan,SensorId,Value,LocalTimeSpan
0,05543cf3-ec51-4760-a9af-8efa01a203e6,2025-01-18 13:48:44.190,063276ed-657e-40a2-a17b-0c685af518f6,31.2,2025-01-18 07:48:44.190
1,a3c80788-c698-41cc-835f-aedd98adb72c,2025-01-18 13:48:44.195,22f8b472-4022-41a9-b7b7-37e83a532df0,35.55,2025-01-18 07:48:44.195
2,4e223d79-4791-4c2a-9d88-97c8efa170bc,2025-01-18 13:48:44.199,267e3d44-9b7f-460c-b34b-603fe0fa9972,36.9,2025-01-18 07:48:44.199
3,1cc74f1d-89bf-47d0-81a9-4de727f9f1e4,2025-01-18 13:48:44.202,8139d3c7-5a4c-40d3-b3a6-8db4da769001,41.55,2025-01-18 07:48:44.202
4,29cc45c7-eed9-4901-8fe1-1423c043d289,2025-01-18 13:48:44.206,d080f284-c51f-4a48-a4d7-b7acec0672ba,36.3,2025-01-18 07:48:44.206


Parece que las columnas 'TimeSpan' y 'LocalTimeSpan' solo varían por unas horas, es decir, contienen la misma información.

In [None]:
# Row-wise offset between the two timestamp columns; only the first five rows are displayed
delta_col = df['TimeSpan'].sub(df['LocalTimeSpan'])
delta_col.iloc[:5]

0   0 days 06:00:00
1   0 days 06:00:00
2   0 days 06:00:00
3   0 days 06:00:00
4   0 days 06:00:00
dtype: timedelta64[ns]

Veamos cuántos valores diferentes hay en las columnas 'ReadId' y 'SensorId'.

In [94]:
# Number of distinct ReadId values
read_id_count = df['ReadId'].nunique()
print(read_id_count)

165407


In [95]:
# Number of distinct SensorId values
sensor_id_count = df['SensorId'].nunique()
print(sensor_id_count)

40


### Tablas

In [None]:
# Load dimCities.csv (comma-separated, header on the first row)
df_cities = pd.read_csv(
    '../data/raw/dimCities.csv',
    sep=',',
    header=0,
)

# Preview the first two rows
df_cities.iloc[:2]

In [None]:
# Load dimDevices.csv (comma-separated, header on the first row)
df_devices = pd.read_csv(
    '../data/raw/dimDevices.csv',
    sep=',',
    header=0,
)

# Preview the first two rows
df_devices.iloc[:2]

In [None]:
# Load dimLocations.csv (comma-separated, header on the first row)
df_locations = pd.read_csv(
    '../data/raw/dimLocations.csv',
    sep=',',
    header=0,
)

# Preview the first two rows
df_locations.iloc[:2]

In [None]:
# Load dimSensors.csv (comma-separated, header on the first row)
df_sensors = pd.read_csv(
    '../data/raw/dimSensors.csv',
    sep=',',
    header=0,
)

# Preview the first two rows
df_sensors.iloc[:2]

In [None]:
# Load dimSublocations.csv (comma-separated, header on the first row)
df_sublocations = pd.read_csv(
    '../data/raw/dimSublocations.csv',
    sep=',',
    header=0,
)

# Preview the first two rows
df_sublocations.iloc[:2]

In [None]:
# Load dimUnidades.csv (comma-separated, header on the first row)
df_unidades = pd.read_csv(
    '../data/raw/dimUnidades.csv',
    sep=',',
    header=0,
)

# Preview the first two rows
df_unidades.iloc[:2]