In [2]:
import xarray as xr
import pandas as pd
import datetime

En primer lugar, leemos los datos de consumo eléctrico. Una vez leidos, hacer una agregación de los datos de manera que haya datos cada 2 horas y que concuerde con los datasets meteorológicos. Por tanto, vamos a tener datos a las 0:00, 2:00, 4:00, ...

In [None]:
# Leer el .txt y convertir a xarray.Dataset
input_path = 'household_power_consumption.txt'
header = ['Date', 'Time', 'Global_active_power', 'Global_reactive_power',
          'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
          'Sub_metering_3']
usecols = [0, 1, 2, 3, 4, 5, 6, 7, 8]
dates_list = [[0, 1]]
df = pd.read_csv(input_path, skiprows=1, header=None, sep=";", engine='python',
                index_col=None, names=header,
                usecols=usecols, parse_dates=dates_list)

In [None]:
df = df.rename(columns={'Date_Time': 'time'})

In [None]:
df.head()

In [None]:
df = df.set_index('time')

In [None]:
df.head()

In [None]:
import numpy as np
for col in df.columns:
  df.loc[df[col] == '?', col] = np.nan

In [9]:
types_dict = {'Global_active_power': float, 'Global_reactive_power': float,
              'Voltage': float, 'Global_intensity': float,
              'Sub_metering_1': float, 'Sub_metering_2': float,
              'Sub_metering_3': float}
for col, col_type in types_dict.items():
    df[col] = df[col].astype(col_type)

In [None]:
day = range(8, 6)
summer = range(6, 9)
fall = range(9, 12)

def season(x):
    if x in spring:
       return 1
    if x in summer:
       return 2
    if x in fall:
       return 3
    else :
       return 4

df['season'] = df.index.to_series().dt.month.map(lambda x : season(x))

In [10]:
df_2007 = df.loc[datetime.date(year=2007,month=1,day=1):datetime.date(year=2008,month=1,day=1)]
df_2008 = df.loc[datetime.date(year=2008,month=1,day=1):datetime.date(year=2009,month=1,day=1)]
df_2009 = df.loc[datetime.date(year=2009,month=1,day=1):datetime.date(year=2010,month=1,day=1)]
df_2010 = df.loc[datetime.date(year=2010,month=1,day=1):]

In [11]:
df_2007

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-01-01 00:00:00,2.580,0.136,241.97,10.6,0.0,0.0,0.0
2007-01-01 00:01:00,2.552,0.100,241.75,10.4,0.0,0.0,0.0
2007-01-01 00:02:00,2.550,0.100,241.64,10.4,0.0,0.0,0.0
2007-01-01 00:03:00,2.550,0.100,241.71,10.4,0.0,0.0,0.0
2007-01-01 00:04:00,2.554,0.100,241.98,10.4,0.0,0.0,0.0
...,...,...,...,...,...,...,...
2007-12-31 23:56:00,1.732,0.210,242.42,7.2,0.0,0.0,18.0
2007-12-31 23:57:00,1.732,0.210,242.50,7.2,0.0,0.0,18.0
2007-12-31 23:58:00,1.684,0.144,242.18,7.0,0.0,0.0,18.0
2007-12-31 23:59:00,1.628,0.072,241.79,6.6,0.0,0.0,18.0


In [12]:
df_2007 = df_2007.resample('2H').sum()
df_2008 = df_2008.resample('2H').sum()
df_2009 = df_2009.resample('2H').sum()
df_2010 = df_2010.resample('2H').sum()

In [13]:
df_2007

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-01-01 00:00:00,304.442,11.066,28947.47,1258.4,0.0,35.0,0.0
2007-01-01 02:00:00,307.440,11.806,29187.95,1256.4,0.0,36.0,0.0
2007-01-01 04:00:00,297.118,11.390,29088.29,1215.6,0.0,35.0,0.0
2007-01-01 06:00:00,294.180,11.580,28892.16,1213.4,0.0,33.0,0.0
2007-01-01 08:00:00,335.272,8.742,28855.32,1386.0,0.0,3.0,722.0
...,...,...,...,...,...,...,...
2007-12-31 16:00:00,312.360,9.896,28767.19,1301.6,0.0,29.0,2116.0
2007-12-31 18:00:00,387.868,7.752,28231.29,1644.2,0.0,29.0,2040.0
2007-12-31 20:00:00,210.406,8.536,28556.78,876.2,0.0,28.0,2086.0
2007-12-31 22:00:00,193.822,8.736,28903.76,797.6,0.0,28.0,2135.0


In [14]:
df_2007.to_csv('power_consumption_2007.csv')
df_2008.to_csv('power_consumption_2008.csv')
df_2009.to_csv('power_consumption_2009.csv')
df_2010.to_csv('power_consumption_2010.csv')

A continuación, leemos todos los datasets de datos meteorológicos y los concatenamos en el tiempo. (Igual es necesario renombrarlo como snowfall_1 [enero] para que esten en orden). Convertir las variables del viento en una sola: wind = sqrt(windu^2 + windv^2)

In [0]:
# Lista con todos los datasets:
list_datasets_paths = []
# Leemos uno a uno los datasets y los almacenamos en una lista:
lists_datasets
for path in list_datasets_paths:
  weather_data = 
  lists_datasets.append(weather_data)
# Hacemos un xr.concat(lista_datasets) 
# para tenerlos concatenados por la dimensión tiempo
weather_total = xr.concat(lists_datasets, dim='time')
# Convertimos las dos variables de viendo en una sola
# y eliminamos las dos que no vamos a utilizar
weather_total['wind'] = 

Por último, concatenamos tanto el dataset de consumo eléctrico como el dataset de datos temporales.