In [1]:
import pandas as pd

In [2]:
# Folder that holds every input CSV as well as the processed output.
STATION_DIR = "../data/ANA HIDROWEB/RIO MEIA PONTE"

# Input CSV files, one per station (upstream, downstream, after).
UPSTREAM_PATH = f"{STATION_DIR}/60640000-MONTANTE DE GOIANIA.csv"
DOWNSTREAM_PATH = f"{STATION_DIR}/60650000-JUSANTE DE GOIANIA.csv"
AFTER_PATH = f"{STATION_DIR}/60655001-UHE SAO SIMAO FAZENDA BONITA DE BAIXO.csv"

# Destination of the final table written at the end of the notebook.
PROCESSED_PATH = f"{STATION_DIR}/processed.csv"

In [3]:
# Measurement columns present in every station file.
MEASUREMENT_COLUMNS = ["Chuva (mm)", "Nível (cm)", "Vazão (m3/s)"]


def load_station(path):
    """Read one station CSV and return a cleaned DataFrame.

    Parameters
    ----------
    path : str
        Location of a ';'-separated CSV with a header row containing at
        least "Data", "Hora" and the columns in MEASUREMENT_COLUMNS.

    Returns
    -------
    pandas.DataFrame
        The file contents with the measurement columns converted to
        numeric (unparseable values become NaN) and every column whose
        name starts with "Unnamed" removed.
    """
    station = pd.read_csv(
        path,
        sep=";",
        header=0,
        # The dates are day-first (dd/mm/yyyy) and are combined with "Hora"
        # and parsed with dayfirst=True later in the notebook. Read them as
        # plain text here: asking read_csv to parse them month-first would
        # either swap day and month or fail, depending on the file contents.
        dtype={"Data": str, "Hora": str},
        low_memory=False,
    )

    # Convert measurements to float, coercing errors to NaN
    for column in MEASUREMENT_COLUMNS:
        station[column] = pd.to_numeric(station[column], errors="coerce")

    # Drop columns that start with 'Unnamed'
    return station.loc[:, ~station.columns.str.startswith("Unnamed")]


upstream_data = load_station(UPSTREAM_PATH)
downstream_data = load_station(DOWNSTREAM_PATH)
after_data = load_station(AFTER_PATH)

downstream_data.head()

Unnamed: 0,Data,Hora,Chuva (mm),Nível (cm),Vazão (m3/s)
0,08/11/2013,05:30:00,,126.0,22.3
1,08/11/2013,05:45:00,0.0,125.0,21.9
2,08/11/2013,06:00:00,0.0,125.0,21.9
3,08/11/2013,06:15:00,0.0,125.0,21.9
4,08/11/2013,06:30:00,0.0,125.0,21.9


In [4]:
# Every station table is matched on the same date/time text columns.
JOIN_KEYS = ["Data", "Hora"]

# Upstream and downstream share their measurement column names, so the merge
# suffixes tell them apart.
data = pd.merge(
    upstream_data,
    downstream_data,
    on=JOIN_KEYS,
    suffixes=("_upstream", "_downstream"),
    how="inner",
)

# Once the first merge has suffixed its columns, the raw measurement names in
# `after_data` no longer collide with anything, so merge suffixes would not be
# applied to them. Tag them with '_after' explicitly before joining.
after_renamed = after_data.rename(
    columns={
        col: f"{col}_after"
        for col in ["Chuva (mm)", "Nível (cm)", "Vazão (m3/s)"]
    }
)
data = pd.merge(
    data,
    after_renamed,
    on=JOIN_KEYS,
    suffixes=("", "_after"),
    how="inner",
)

data.head()


Unnamed: 0,Data,Hora,Chuva (mm)_upstream,Nível (cm)_upstream,Vazão (m3/s)_upstream,Chuva (mm)_downstream,Nível (cm)_downstream,Vazão (m3/s)_downstream,Chuva (mm)_after,Nível (cm)_after,Vazão (m3/s)_after
0,08/11/2013,06:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
1,08/11/2013,07:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
2,08/11/2013,08:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
3,08/11/2013,09:00:00,0.4,204.0,15.7,0.0,125.0,21.9,,,
4,08/11/2013,10:00:00,0.0,204.0,15.7,0.0,124.0,21.5,,,


In [5]:
# Shorten the column names, then replace the separate date and time text
# columns with a single parsed "datetime" index (dates are day-first).
data = (
    data.rename(
        columns={
            "Data": "data",
            "Hora": "hora",
            "Chuva (mm)_upstream": "chuva_upstream",
            "Nível (cm)_upstream": "nivel_upstream",
            "Vazão (m3/s)_upstream": "vazao_upstream",
            "Chuva (mm)_downstream": "chuva_downstream",
            "Nível (cm)_downstream": "nivel_downstream",
            "Vazão (m3/s)_downstream": "vazao_downstream",
            "Chuva (mm)_after": "chuva_after",
            "Nível (cm)_after": "nivel_after",
            "Vazão (m3/s)_after": "vazao_after",
        }
    )
    .assign(
        datetime=lambda frame: pd.to_datetime(
            frame["data"] + " " + frame["hora"], dayfirst=True
        )
    )
    .drop(columns=["data", "hora"])
    .set_index("datetime")
)

data.head()

Unnamed: 0_level_0,chuva_upstream,nivel_upstream,vazao_upstream,chuva_downstream,nivel_downstream,vazao_downstream,chuva_after,nivel_after,vazao_after
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-11-08 06:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
2013-11-08 07:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
2013-11-08 08:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
2013-11-08 09:00:00,0.4,204.0,15.7,0.0,125.0,21.9,,,
2013-11-08 10:00:00,0.0,204.0,15.7,0.0,124.0,21.5,,,


In [6]:
# Number of NaN entries per column
missing_counts = data.isna().sum()
missing_counts

chuva_upstream       3745
nivel_upstream      14113
vazao_upstream      14454
chuva_downstream     3713
nivel_downstream     4626
vazao_downstream     4623
chuva_after         12671
nivel_after          5403
vazao_after         14048
dtype: int64

In [7]:
# Fill missing values with the next valid observation.
# Back-filling depends on row order, so sort chronologically first; the stable
# sort keeps the relative order of any rows that share a timestamp.
# NOTE(review): bfill copies later readings into earlier gaps of any length,
# including the rainfall columns - confirm this is intended for the use case.
data = data.sort_index(kind="stable").bfill()

In [8]:
# Report whether the datetime index is sorted in ascending order
datetime_index = data.index
is_ordered = datetime_index.is_monotonic_increasing
print(f"Data is ordered by index: {is_ordered}")

Data is ordered by index: True


## Filter years and resample

In [9]:
# Keep only rows whose timestamp falls in 2024 or earlier
up_to_2024 = data.index.year <= 2024
data = data.loc[up_to_2024]
data.tail()

Unnamed: 0_level_0,chuva_upstream,nivel_upstream,vazao_upstream,chuva_downstream,nivel_downstream,vazao_downstream,chuva_after,nivel_after,vazao_after
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-12-31 19:30:00,0.0,256.0,21.88,0.0,182.0,46.34,0.0,170.0,68.25
2024-12-31 20:00:00,0.0,257.0,22.11,0.0,181.0,45.85,0.0,170.0,68.25
2024-12-31 21:30:00,0.0,256.0,21.88,11.8,207.0,58.82,0.0,170.0,68.25
2024-12-31 22:00:00,0.0,256.0,21.88,1.4,252.0,82.79,0.0,169.0,67.59
2024-12-31 22:30:00,0.0,257.0,22.11,3.0,251.0,82.24,0.0,170.0,68.25


In [10]:
# Keep only rows whose timestamp falls in 2014 or later
from_2014 = data.index.year >= 2014
data = data.loc[from_2014]
data.head()

Unnamed: 0_level_0,chuva_upstream,nivel_upstream,vazao_upstream,chuva_downstream,nivel_downstream,vazao_downstream,chuva_after,nivel_after,vazao_after
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-01-01 00:00:00,0.0,217.0,18.8,0.0,150.0,32.1,0.0,290.0,69.0
2014-01-01 01:00:00,0.0,216.0,18.6,0.0,149.0,31.7,0.0,292.0,70.2
2014-01-01 02:00:00,0.0,216.0,18.6,0.0,148.0,31.3,0.0,292.0,70.2
2014-01-01 03:00:00,0.0,216.0,18.6,0.0,147.0,30.8,0.0,293.0,70.8
2014-01-01 04:00:00,0.0,216.0,18.6,0.0,146.0,30.4,0.0,291.0,69.6


In [11]:
def _quantile_stat(q):
    """Return a function that computes the `q` quantile of a Series."""

    def _stat(values):
        return values.quantile(q)

    return _stat


# Statistics computed for every column within each calendar day. The
# (label, function) pairs give the quantile columns readable names.
daily_stats = [
    "mean",
    "max",
    "min",
    ("q25", _quantile_stat(0.25)),
    ("q75", _quantile_stat(0.75)),
]

# Resample the data to daily frequency and aggregate
data = data.resample("D").agg(daily_stats)

In [12]:
# Collapse the (variable, statistic) column pairs into single
# "<variable>_<statistic>" labels, e.g. 'chuva_upstream_mean'
# or 'nivel_downstream_max'.
data.columns = data.columns.map("_".join)

data.head()

Unnamed: 0_level_0,chuva_upstream_mean,chuva_upstream_max,chuva_upstream_min,chuva_upstream_q25,chuva_upstream_q75,nivel_upstream_mean,nivel_upstream_max,nivel_upstream_min,nivel_upstream_q25,nivel_upstream_q75,...,nivel_after_mean,nivel_after_max,nivel_after_min,nivel_after_q25,nivel_after_q75,vazao_after_mean,vazao_after_max,vazao_after_min,vazao_after_q25,vazao_after_q75
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,0.0,0.0,0.0,0.0,0.0,219.083333,222.0,216.0,217.0,221.0,...,288.958333,293.0,284.0,286.0,291.0,68.375,70.8,65.4,66.6,69.6
2014-01-02,0.008696,0.2,0.0,0.0,0.0,222.956522,226.0,219.0,221.5,224.0,...,304.0,326.0,288.0,298.0,309.0,75.104348,85.3,67.8,73.9,78.1
2014-01-03,0.0,0.0,0.0,0.0,0.0,225.318182,231.0,219.0,220.5,230.0,...,293.909091,306.0,280.0,288.25,302.0,71.686364,78.7,63.0,67.95,76.3
2014-01-04,0.0,0.0,0.0,0.0,0.0,213.863636,221.0,208.0,212.0,215.75,...,277.909091,280.0,271.0,276.75,279.0,61.8,63.0,58.2,61.05,62.85
2014-01-05,0.0,0.0,0.0,0.0,0.0,204.904762,208.0,201.0,204.0,206.0,...,269.142857,272.0,264.0,268.0,270.0,56.485714,58.2,53.4,55.8,57.0


## Feature engineering

#### Novas features:

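- chuva_acumulada_2_dias (`chuva_<estação>_acumulada_2_dias`): soma móvel, em janela de 2 dias, da precipitação média diária (`chuva_<estação>_mean`)
- chuva_acumulada_3_dias (`chuva_<estação>_acumulada_3_dias`): soma móvel, em janela de 3 dias, da precipitação média diária (`chuva_<estação>_mean`)
- dias_sem_chuva: número de dias sem chuva (ainda não calculada na célula abaixo)
- variacao_chuva (`chuva_<estação>_change`): diferença da precipitação média diária em relação ao dia anterior
- variacao_nivel (`nivel_<estação>_change`): diferença do nível médio diário do rio em relação ao dia anterior
- variacao_vazao (`vazao_<estação>_change`): diferença da vazão média diária em relação ao dia anterior

Em todos os nomes de coluna, `<estação>` é `upstream`, `downstream` ou `after`.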


In [13]:
STATIONS = ("upstream", "downstream", "after")

# Rolling rainfall totals over 2 and 3 days.
# Note: these sum the daily *mean* rainfall columns, not a daily total.
for window in (2, 3):
    for station in STATIONS:
        daily_rain = data[f"chuva_{station}_mean"]
        data[f"chuva_{station}_acumulada_{window}_dias"] = daily_rain.rolling(
            window=window
        ).sum()

# Difference of each daily mean from the previous day's value.
for variable in ("chuva", "nivel", "vazao"):
    for station in STATIONS:
        data[f"{variable}_{station}_change"] = data[
            f"{variable}_{station}_mean"
        ].diff()

In [14]:
# Write the final table, datetime index included, as a ';'-separated CSV
data.to_csv(PROCESSED_PATH, sep=";")
print(f"Processed data saved to {PROCESSED_PATH}")

Processed data saved to ../data/ANA HIDROWEB/RIO MEIA PONTE/processed.csv
