In [47]:
import pandas as pd

def generate_unique_df(paths):
    """Aggregate several hourly weather CSV files into one daily DataFrame.

    Each file is read with ``latin`` encoding, ``;`` as the separator and its
    first 8 lines skipped. Rows are grouped by the ``DATA (YYYY-MM-DD)``
    column: precipitation, station-level pressure, dry-bulb temperature and
    relative humidity are averaged per date, while the previous-hour maximum
    temperature is reduced with ``max``. The per-file results are stacked in
    the order of ``paths`` and the columns are renamed to short English names.

    Args:
        paths: Iterable of CSV file paths.

    Returns:
        pandas.DataFrame indexed by date with the columns
        ``rainfall_volume``, ``atmospheric_pressure``,
        ``dry_bulb_temperature``, ``relative_humidity`` and
        ``maximum_temperature``. Dates where any column is negative or
        missing are removed. An empty DataFrame is returned when ``paths``
        is empty.
    """
    date_col = 'DATA (YYYY-MM-DD)'
    max_temp_col = 'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)'
    # Columns averaged per date, mapped to their output names.
    mean_cols = {
        'PRECIPITAÇÃO TOTAL. HORÁRIO (mm)': 'rainfall_volume',
        'PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO. HORARIA (mB)': 'atmospheric_pressure',
        'TEMPERATURA DO AR - BULBO SECO. HORARIA (°C)': 'dry_bulb_temperature',
        'UMIDADE RELATIVA DO AR. HORARIA (%)': 'relative_humidity',
    }
    new_names = dict(mean_cols)
    new_names[max_temp_col] = 'maximum_temperature'

    daily_frames = []
    for path in paths:
        hourly = pd.read_csv(path, encoding="latin", skiprows=8, sep=';')
        by_date = hourly.groupby(date_col)
        # Select columns with a list: tuple-style selection on a GroupBy
        # (`gb['a', 'b']`) is rejected by pandas >= 2.0.
        daily_mean = by_date[list(mean_cols)].mean()
        daily_max = by_date[[max_temp_col]].max()
        daily_frames.append(pd.concat([daily_mean, daily_max], axis=1))

    if not daily_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    daily = pd.concat(daily_frames).rename(columns=new_names)

    # Negative values are treated as invalid (presumably missing-reading
    # sentinels such as -9999 -- TODO confirm against the data source), so
    # any date containing one is discarded together with missing values.
    return daily.mask(daily < 0).dropna()

In [48]:
# Generate the X and Y bases from the given series using the parameters tau (time delay), m (embedding dimension) and k (number of steps ahead to predict)
def generate_XY(serie, tau, m, k):
    """Build a time-delay embedding of a series and its k-step-ahead targets.

    For every position ``i`` from ``(m - 1) * tau`` up to ``N - k - 1``
    (``N`` being the series length), row ``X[j]`` holds
    ``serie[i], serie[i - tau], ..., serie[i - (m - 1) * tau]`` and ``Y[j]``
    holds ``serie[i + k]``.

    Args:
        serie: One-dimensional sequence of numbers (NumPy array, list or
            pandas Series). It is converted to a flat float array, so
            elements are always addressed by position, never by index label.
        tau: Time delay between consecutive embedding coordinates.
        m: Embedding dimension (number of columns of ``X``).
        k: Number of steps ahead to predict.

    Returns:
        Tuple ``(X, Y)`` of float arrays with shapes
        ``(N - k - (m - 1) * tau, m)`` and ``(N - k - (m - 1) * tau,)``.

    Raises:
        ValueError: If ``m`` is negative or the series is too short to
            produce a non-negative number of samples.
    """
    values = np.asarray(serie, dtype=float).ravel()

    first = (m - 1) * tau
    n_rows = values.size - k - first
    if m < 0 or n_rows < 0:
        raise ValueError(
            "series of length %d is too short for tau=%r, m=%r, k=%r"
            % (values.size, tau, m, k)
        )

    # Position of the most recent coordinate of every sample.
    anchors = np.arange(first, first + n_rows)
    # Column j looks j * tau steps back from the anchor.
    lags = np.arange(m) * tau

    X = values[anchors[:, None] - lags]
    Y = values[anchors + k]

    return (X, Y)

In [51]:
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import os

def generate_bases(df, tau=1, m=3, k=5, test_size=200):
    """Write the raw series and scaled train/test bases for every column.

    For each column of ``df`` a directory named after the column is created
    (if missing) in the current working directory and the following files
    are written with ``np.savetxt``:

    * ``<column>.csv``: the raw column values;
    * ``X_train.csv`` / ``Y_train.csv`` / ``X_test.csv`` / ``Y_test.csv``:
      the embedding produced by ``generate_XY`` split chronologically
      (no shuffling) and min-max scaled.

    The scaler is fitted on the training rows only (inputs and target
    together, each column scaled independently) and then applied to the
    test rows, so scaled test values may fall outside ``[0, 1]``.

    Args:
        df: DataFrame whose columns are the series to process.
        tau: Time delay forwarded to ``generate_XY``.
        m: Embedding dimension forwarded to ``generate_XY``.
        k: Prediction horizon forwarded to ``generate_XY``.
        test_size: Forwarded to ``train_test_split``; with the default of
            200 the last 200 samples form the test set.

    Returns:
        None. The results are the files written to disk.
    """
    # DataFrame.iteritems() was removed in pandas 2.0; items() is equivalent.
    for column_name, column_data in df.items():
        os.makedirs(column_name, exist_ok=True)

        series = column_data.values
        np.savetxt(os.path.join(column_name, column_name + '.csv'), series)

        X, Y = generate_XY(series, tau, m, k)
        # shuffle=False keeps temporal order: the test set is the tail.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, shuffle=False
        )

        # Fit on training data only, then reuse the same scaling for the test data.
        scaler = preprocessing.MinMaxScaler()
        XY_train = scaler.fit_transform(np.column_stack((X_train, Y_train)))
        XY_test = scaler.transform(np.column_stack((X_test, Y_test)))

        # The last column is the target; the preceding ones are the inputs.
        outputs = (
            ('X_train.csv', XY_train[:, :-1]),
            ('Y_train.csv', XY_train[:, -1]),
            ('X_test.csv', XY_test[:, :-1]),
            ('Y_test.csv', XY_test[:, -1]),
        )
        for file_name, array in outputs:
            np.savetxt(os.path.join(column_name, file_name), array)

In [52]:
# Hourly station files, one per year from 2010 to 2013, merged into one daily table.
station_files = [
    'INMET_SE_MG_A521_PAMPULHA_01-01-2010_A_31-12-2010.csv',
    'INMET_SE_MG_A521_PAMPULHA_01-01-2011_A_31-12-2011.csv',
    'INMET_SE_MG_A521_PAMPULHA_01-01-2012_A_31-12-2012.csv',
    'INMET_SE_MG_A521_PAMPULHA_01-01-2013_A_31-12-2013.csv',
]
df = generate_unique_df(station_files)

# Write the raw series and the scaled train/test bases, one folder per column.
generate_bases(df)