In [1]:
import pandas as pd

In [2]:
# All Rio Meia Ponte station exports live in the same folder; the processed
# output is written next to them.
STATION_DIR = "../data/ANA HIDROWEB/RIO MEIA PONTE"

UPSTREAM_PATH = f"{STATION_DIR}/60640000-MONTANTE DE GOIANIA.csv"
DOWNSTREAM_PATH = f"{STATION_DIR}/60650000-JUSANTE DE GOIANIA.csv"
AFTER_PATH = f"{STATION_DIR}/60655001-UHE SAO SIMAO FAZENDA BONITA DE BAIXO.csv"

PROCESSED_PATH = f"{STATION_DIR}/processed.csv"

In [3]:
# Measurement columns shared by every station export; coerced to float below.
MEASUREMENT_COLUMNS = ["Chuva (mm)", "Nível (cm)", "Vazão (m3/s)"]


def load_station(path):
    """Read one station CSV and return it with numeric measurement columns.

    Parameters
    ----------
    path : str
        Location of a semicolon-separated station export.

    Returns
    -------
    pandas.DataFrame
        The file's columns minus any whose name starts with ``Unnamed``.
        ``Data`` and ``Hora`` are kept as raw strings; the columns listed in
        ``MEASUREMENT_COLUMNS`` are numeric, with unparseable entries as NaN.
    """
    station = pd.read_csv(
        path,
        sep=";",
        header=0,
        # Keep date and time as text. The dates are day-first (dd/mm/yyyy) and
        # the timestamp is assembled later by concatenating these two string
        # columns, so a month-first date parse here would either mis-read the
        # dates or break that concatenation.
        dtype={"Data": str, "Hora": str},
        low_memory=False,
    )
    # Drop columns that start with 'Unnamed' (copy so the assignments below
    # write to an independent frame).
    station = station.loc[:, ~station.columns.str.startswith("Unnamed")].copy()
    # Convert columns to float, coercing errors to NaN
    for column in MEASUREMENT_COLUMNS:
        station[column] = pd.to_numeric(station[column], errors="coerce")
    return station


upstream_data = load_station(UPSTREAM_PATH)
downstream_data = load_station(DOWNSTREAM_PATH)
after_data = load_station(AFTER_PATH)

downstream_data.head()

Unnamed: 0,Data,Hora,Chuva (mm),Nível (cm),Vazão (m3/s)
0,08/11/2013,05:30:00,,126.0,22.3
1,08/11/2013,05:45:00,0.0,125.0,21.9
2,08/11/2013,06:00:00,0.0,125.0,21.9
3,08/11/2013,06:15:00,0.0,125.0,21.9
4,08/11/2013,06:30:00,0.0,125.0,21.9


In [4]:
# Align the three stations on their shared (Data, Hora) keys; the inner joins
# keep only timestamps present in all three files.
merge_keys = ["Data", "Hora"]
upstream_downstream = upstream_data.merge(
    downstream_data,
    how="inner",
    on=merge_keys,
    suffixes=("_upstream", "_downstream"),
)
data = upstream_downstream.merge(
    after_data,
    how="inner",
    on=merge_keys,
    suffixes=("", "_after"),
)

# The second merge has no name clashes (the first one already suffixed its
# columns), so pandas leaves the after_data columns unsuffixed. Add '_after'
# by hand wherever that is still the case.
after_renames = {
    col: f"{col}_after"
    for col in ("Chuva (mm)", "Nível (cm)", "Vazão (m3/s)")
    if col in data.columns and f"{col}_after" not in data.columns
}
data = data.rename(columns=after_renames)

data.head()

Unnamed: 0,Data,Hora,Chuva (mm)_upstream,Nível (cm)_upstream,Vazão (m3/s)_upstream,Chuva (mm)_downstream,Nível (cm)_downstream,Vazão (m3/s)_downstream,Chuva (mm)_after,Nível (cm)_after,Vazão (m3/s)_after
0,08/11/2013,06:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
1,08/11/2013,07:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
2,08/11/2013,08:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
3,08/11/2013,09:00:00,0.4,204.0,15.7,0.0,125.0,21.9,,,
4,08/11/2013,10:00:00,0.0,204.0,15.7,0.0,124.0,21.5,,,


In [5]:
# Portuguese source headers -> short English names used from here on.
english_names = {
    "Data": "date",
    "Hora": "hour",
    "Chuva (mm)_upstream": "rain_upstream",
    "Chuva (mm)_downstream": "rain_downstream",
    "Chuva (mm)_after": "rain_after",
    "Nível (cm)_upstream": "level_upstream",
    "Nível (cm)_downstream": "level_downstream",
    "Nível (cm)_after": "level_after",
    "Vazão (m3/s)_upstream": "flow_upstream",
    "Vazão (m3/s)_downstream": "flow_downstream",
    "Vazão (m3/s)_after": "flow_after",
}

# Build a single day-first timestamp from the date and hour strings, then use
# it as the index in place of the two text columns.
data = (
    data.rename(columns=english_names)
    .assign(
        datetime=lambda frame: pd.to_datetime(
            frame["date"] + " " + frame["hour"], dayfirst=True
        )
    )
    .drop(columns=["date", "hour"])
    .set_index("datetime")
)

data.head()

Unnamed: 0_level_0,rain_upstream,level_upstream,flow_upstream,rain_downstream,level_downstream,flow_downstream,rain_after,level_after,flow_after
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-11-08 06:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
2013-11-08 07:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
2013-11-08 08:00:00,0.0,203.0,15.5,0.0,125.0,21.9,,,
2013-11-08 09:00:00,0.4,204.0,15.7,0.0,125.0,21.9,,,
2013-11-08 10:00:00,0.0,204.0,15.7,0.0,124.0,21.5,,,


In [6]:
# Number of NaN entries per column, as a quick look at how sparse each series is
missing_counts = data.isna().sum(axis=0)
missing_counts

rain_upstream        3745
level_upstream      14113
flow_upstream       14454
rain_downstream      3713
level_downstream     4626
flow_downstream      4623
rain_after          12671
level_after          5403
flow_after          14048
dtype: int64

In [7]:
# Backward fill: each gap takes the value of the next valid observation in the
# same column. Gaps after a column's last valid reading stay NaN.
# NOTE(review): this copies later readings into earlier timestamps; confirm
# that look-ahead is acceptable for whatever consumes the processed file.
data = data.bfill()

In [8]:
# Report whether the datetime index never decreases from one row to the next
datetime_index = data.index
is_ordered = datetime_index.is_monotonic_increasing
print(f"Data is ordered by index: {is_ordered}")

Data is ordered by index: True


## Resample

In [9]:
# Collapse the sub-daily readings into one row per calendar day. Every input
# column yields five statistics, so the result has two-level columns of the
# form (variable, statistic).
daily_stats = [
    "mean",
    "max",
    "min",
    ("q25", lambda values: values.quantile(0.25)),
    ("q75", lambda values: values.quantile(0.75)),
]
data = (
    data.resample("D")
    .agg(daily_stats)
    # The index now holds one entry per day, so name it "date". Renaming the
    # axis directly avoids moving the index into the columns and back.
    .rename_axis(index="date")
)

In [10]:
# Collapse the (variable, statistic) column pairs into single flat names
data.columns = ["_".join(pair) for pair in data.columns]

# Resulting names look like 'rain_upstream_mean', 'level_downstream_max', etc.
data.head()

Unnamed: 0_level_0,rain_upstream_mean,rain_upstream_max,rain_upstream_min,rain_upstream_q25,rain_upstream_q75,level_upstream_mean,level_upstream_max,level_upstream_min,level_upstream_q25,level_upstream_q75,...,level_after_mean,level_after_max,level_after_min,level_after_q25,level_after_q75,flow_after_mean,flow_after_max,flow_after_min,flow_after_q25,flow_after_q75
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-11-08,0.022222,0.4,0.0,0.0,0.0,198.611111,204.0,193.0,195.25,202.75,...,248.0,248.0,248.0,248.0,248.0,42.7,42.7,42.7,42.7,42.7
2013-11-09,0.0,0.0,0.0,0.0,0.0,195.086957,198.0,192.0,194.0,197.0,...,248.0,248.0,248.0,248.0,248.0,42.7,42.7,42.7,42.7,42.7
2013-11-10,0.0,0.0,0.0,0.0,0.0,195.041667,205.0,191.0,192.0,197.0,...,248.0,248.0,248.0,248.0,248.0,42.7,42.7,42.7,42.7,42.7
2013-11-11,0.0,0.0,0.0,0.0,0.0,200.565217,207.0,189.0,195.0,206.0,...,248.0,248.0,248.0,248.0,248.0,42.7,42.7,42.7,42.7,42.7
2013-11-12,0.1,1.2,0.0,0.0,0.0,193.416667,260.0,181.0,185.0,188.0,...,248.0,248.0,248.0,248.0,248.0,42.7,42.7,42.7,42.7,42.7


## Feature engineering

#### Novas features:

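- chuva_acumulada_2_dias: soma da precipitação dos últimos 2 dias (calculada abaixo como a soma móvel das médias diárias; colunas `rain_*_acc_2_days`)
- chuva_acumulada_3_dias: soma da precipitação dos últimos 3 dias (calculada abaixo como a soma móvel das médias diárias; colunas `rain_*_acc_3_days`)
- dias_sem_chuva: número de dias sem chuva (não calculada na célula abaixo)
- variacao_chuva: taxa de variação da precipitação em relação ao dia anterior (não calculada na célula abaixo)
- variacao_nivel: taxa de variação do nível do rio em relação ao dia anterior (não calculada na célula abaixo)
- variacao_vazao: taxa de variação da vazão em relação ao dia anterior (não calculada na célula abaixo)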


In [11]:
# Rolling sums of the daily mean rain column over 2 and 3 consecutive daily
# rows, one feature per station. The first (window - 1) rows of each feature
# are NaN because the window is not yet full.
# NOTE(review): the inputs are daily means, not daily totals; confirm this is
# the intended definition of "accumulated" rain.
for window_days in (2, 3):
    for station in ("upstream", "downstream", "after"):
        daily_rain = data[f"rain_{station}_mean"]
        data[f"rain_{station}_acc_{window_days}_days"] = daily_rain.rolling(
            window=window_days
        ).sum()

## Filter

In [12]:
# Keep only rows dated 2024 or earlier
up_to_2024 = data.index.year <= 2024
data = data.loc[up_to_2024]
data.tail()

Unnamed: 0_level_0,rain_upstream_mean,rain_upstream_max,rain_upstream_min,rain_upstream_q25,rain_upstream_q75,level_upstream_mean,level_upstream_max,level_upstream_min,level_upstream_q25,level_upstream_q75,...,flow_after_max,flow_after_min,flow_after_q25,flow_after_q75,rain_upstream_acc_2_days,rain_downstream_acc_2_days,rain_after_acc_2_days,rain_upstream_acc_3_days,rain_downstream_acc_3_days,rain_after_acc_3_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-27,0.008696,0.2,0.0,0.0,0.0,308.565217,320.0,299.0,304.25,311.0,...,128.98,88.7,89.45,109.8125,0.076988,0.135631,0.231495,0.082117,0.171528,0.236623
2024-12-28,0.0,0.0,0.0,0.0,0.0,291.55,299.0,279.0,287.0,295.0,...,87.96,74.32,77.81,86.47,0.008696,0.047826,0.021739,0.076988,0.135631,0.231495
2024-12-29,0.209756,8.4,0.0,0.0,0.0,277.512195,300.0,268.0,276.0,278.0,...,74.32,66.28,66.93,72.27,0.209756,0.395122,0.068293,0.218452,0.442948,0.090032
2024-12-30,0.0,0.0,0.0,0.0,0.0,274.564103,279.0,266.0,271.0,278.0,...,138.07,72.27,95.18,129.43,0.209756,0.436148,0.068293,0.209756,0.436148,0.068293
2024-12-31,0.009091,0.4,0.0,0.0,0.0,261.159091,267.0,256.0,257.0,263.0,...,82.81,67.59,68.25,77.81,0.009091,0.413753,0.136364,0.218847,0.808875,0.204656


In [13]:
# Keep only rows dated 2014 or later
from_2014 = data.index.year >= 2014
data = data.loc[from_2014]
data.head()

Unnamed: 0_level_0,rain_upstream_mean,rain_upstream_max,rain_upstream_min,rain_upstream_q25,rain_upstream_q75,level_upstream_mean,level_upstream_max,level_upstream_min,level_upstream_q25,level_upstream_q75,...,flow_after_max,flow_after_min,flow_after_q25,flow_after_q75,rain_upstream_acc_2_days,rain_downstream_acc_2_days,rain_after_acc_2_days,rain_upstream_acc_3_days,rain_downstream_acc_3_days,rain_after_acc_3_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-01,0.0,0.0,0.0,0.0,0.0,219.083333,222.0,216.0,217.0,221.0,...,70.8,65.4,66.6,69.6,0.0,0.008696,0.0,0.0,0.026087,0.0
2014-01-02,0.008696,0.2,0.0,0.0,0.0,222.956522,226.0,219.0,221.5,224.0,...,85.3,67.8,73.9,78.1,0.008696,0.217391,0.0,0.008696,0.226087,0.0
2014-01-03,0.0,0.0,0.0,0.0,0.0,225.318182,231.0,219.0,220.5,230.0,...,78.7,63.0,67.95,76.3,0.008696,0.217391,0.0,0.008696,0.217391,0.0
2014-01-04,0.0,0.0,0.0,0.0,0.0,213.863636,221.0,208.0,212.0,215.75,...,63.0,58.2,61.05,62.85,0.0,0.0,0.0,0.008696,0.217391,0.0
2014-01-05,0.0,0.0,0.0,0.0,0.0,204.904762,208.0,201.0,204.0,206.0,...,58.2,53.4,55.8,57.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Write the daily feature table, including its date index, as a
# semicolon-separated CSV
data.to_csv(PROCESSED_PATH, index=True, sep=";")
print(f"Processed data saved to {PROCESSED_PATH}")

Processed data saved to ../data/ANA HIDROWEB/RIO MEIA PONTE/processed.csv
