In [6]:
import os
import shutil

import numpy as np
import pandas as pd

import parse as p

In [4]:
# Folder that is walked recursively to find the source PDFs.
folder = "./data/input/reserva"

# Flat folder the PDFs are copied into; the parsing step lists the same path
# under the name `folder_input`.
folder_input = "./data/input/cleaned"
folder_new = "./data/input/cleaned"

# Folder that receives one CSV per parsed PDF.
folder_output = "./data/output"

In [3]:
# Flatten the download tree: copy every PDF found anywhere under `folder`
# into `folder_new`, skipping names that are already present there.
os.makedirs(folder_new, exist_ok=True)  # shutil.copy does not create the target folder

for root, dirs, files in os.walk(folder):
    for file in files:
        if not file.endswith(".pdf"):
            continue
        # The source path keeps the sub-folder the file was found in ...
        filename_old = os.path.join(root, file)
        # ... while the destination is flat, so equal names coming from different
        # sub-folders collide and only the first one encountered is copied.
        filename_new = os.path.join(folder_new, file)
        if not os.path.exists(filename_new):
            print(filename_old)
            # Copy in-process rather than through `os.system('cp ...')`: no shell
            # quoting problems with spaces or metacharacters in the path, works
            # on any OS, and a failed copy raises instead of being ignored.
            shutil.copy(filename_old, filename_new)

./data/input/reserva/2024_02_08.pdf
./data/input/reserva/2024_01_29.pdf
./data/input/reserva/2024_01_28.pdf
./data/input/reserva/2024_01_31.pdf
./data/input/reserva/2024_01_30.pdf
./data/input/reserva/2024_01_27.pdf
./data/input/reserva/2024_02_03.pdf
./data/input/reserva/2024_02_02.pdf
./data/input/reserva/2024_02_01.pdf
./data/input/reserva/2024_02_05.pdf
./data/input/reserva/2024_02_04.pdf
./data/input/reserva/2024_02_06.pdf
./data/input/reserva/2024_02_07.pdf


In [7]:
import importlib
importlib.reload(p)  # re-import the parse module so edits to it are picked up

# Make sure the CSV destination exists; otherwise every to_csv call below
# would fail and only be reported by the except branch.
os.makedirs(folder_output, exist_ok=True)

# Take the last 400 PDF names in sorted order and process them in reverse,
# i.e. starting from the name that sorts last.
filenames_raw = sorted(os.listdir(folder_input))
# filenames = [f for f in filenames_raw if p.filter_weekly(f)]
# Only PDFs are parseable; ignore any other entry in the folder.
pdf_filenames = [f for f in filenames_raw if f.endswith('.pdf')]
filenames = pdf_filenames[-400:][::-1]

for i, filename in enumerate(filenames):
    if i % 50 == 0:
        print(f'{i} of {len(filenames)}: {filename} / share completed: {round(i/len(filenames)*100, 2)}%')
    # Read from the folder that was listed (previously joined with folder_new,
    # which only worked because both constants hold the same path).
    full_filename = os.path.join(folder_input, filename)
    # Swap just the extension; str.replace('pdf', 'csv') would also rewrite a
    # "pdf" occurring elsewhere in the name.
    filename_output = os.path.splitext(filename)[0] + '.csv'
    filename_output = os.path.join(folder_output, filename_output)
    if os.path.exists(filename_output):
        # Already parsed in an earlier run.
        continue
    try:
        tables = p.get_tables(full_filename)
        df_raw = p.get_df(tables)
        df = p.clean_df(df_raw)
        df.to_csv(filename_output, index=False)
    except Exception as e:
        # Best effort: report the file that failed and carry on with the rest.
        print(full_filename)
        print(e)

0 of 400: 2024_02_08.pdf / share completed: 0.0%
50 of 400: 2023_12_19.pdf / share completed: 12.5%
100 of 400: 2023_10_30.pdf / share completed: 25.0%
150 of 400: 2023_09_10.pdf / share completed: 37.5%
200 of 400: 2023_07_20.pdf / share completed: 50.0%
250 of 400: 2023_05_31.pdf / share completed: 62.5%
300 of 400: 2023_04_10.pdf / share completed: 75.0%
350 of 400: 2023_02_19.pdf / share completed: 87.5%


In [11]:
import os

# Hand the sorted list of names found in the output folder to
# p.get_full_df (presumably it loads and combines the per-file CSVs; see the
# parse module) and save the result as returned, before any correction.
filenames_all = os.listdir(folder_output)
filenames_all.sort()
df_all = p.get_full_df(filenames_all)
df_all.to_csv('./data/datasets/all_parsed.csv', index=False)

# From this point on `df_all` refers to the output of p.correct_issues.
df_all = p.correct_issues(df_all)

def remove_bad_rows(df):
    """Return the rows of ``df`` that pass the rainfall and storage checks.

    All differences and shifts are taken per ``(province, reservoir)`` group,
    in the row order of the frame returned by ``p.add_cols``.
    NOTE(review): the frame is only sorted by date *after* the differences
    have been computed - confirm the input is already date-ordered per group.

    Rainfall check
        Rows flagged as a drop of more than 10 in ``rainfallsince`` during
        months 1, 10, 11 or 12 are removed, and then every row whose
        ``rainfallsince`` difference is negative or NaN is removed (the first
        row of each group has a NaN difference, so it is removed as well).

    Storage check
        A row is flagged when ``stored_hm3`` changes by more than 2 in
        absolute value and by more than 5 % of ``capacity_hm3``, unless the
        row's own or the preceding ``rainfallsince`` difference exceeds 10.
        Each flagged row is removed together with the 5 rows before it and the
        4 rows after it within the same group.
        NOTE(review): the window is asymmetric (shifts -5..4) - confirm that
        is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed to ``p.add_cols``. The resulting frame must provide the columns
        ``province``, ``reservoir``, ``date`` (datetime-like), ``month``,
        ``year_climatic``, ``rainfallsince``, ``stored_hm3`` and
        ``capacity_hm3``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, sorted by province, reservoir and date, including
        every helper column created here (callers select the columns they
        need).

    Raises
    ------
    AssertionError
        If a row flagged as a rainfall drop survives the rainfall check.
    """
    group_keys = ['province', 'reservoir']
    year_keys = group_keys + ['year_climatic']

    full = p.add_cols(df)

    full['date_lag'] = full.groupby(group_keys)['date'].shift(1)

    # Row-to-row change of each series plus its nine previous values
    # (<var>_diff_0 is the change itself, <var>_diff_k the change k rows back).
    for var in ['rainfallsince', 'stored_hm3']:
        diff_col = f'{var}_diff'
        full[diff_col] = full.groupby(group_keys)[var].diff()
        full[f'{diff_col}_0'] = full[diff_col]
        for lags in range(1, 10):
            full[f'{diff_col}_{lags}'] = full.groupby(group_keys)[diff_col].shift(lags)

    full['bad_data'] = full.rainfallsince_diff_0 < -10
    full['bad_data_for_year'] = full.groupby(year_keys)['bad_data'].transform('any')
    full['problem_month'] = full.month.isin([1, 10, 11, 12])
    full['bad_data_and_month'] = full.bad_data & full.problem_month
    full['stored_hm3_diff_relative'] = full.stored_hm3_diff / full.capacity_hm3

    # --- rainfall check ------------------------------------------------------
    # The comparison in the second query is False for NaN, so rows without a
    # previous reading are dropped here too.
    df_reg = full.query('~bad_data_and_month').query('rainfallsince_diff >=0').copy()
    df_reg = df_reg.sort_values(group_keys + ['date'])
    df_reg['date_diff'] = (df_reg.date - df_reg.date_lag).dt.days

    # Recompute the per-year flag on the surviving rows; none may remain.
    df_reg['bad_data_for_year'] = df_reg.groupby(year_keys)['bad_data'].transform('any')
    assert df_reg.bad_data_for_year.sum() == 0, 'rows flagged as bad rainfall data survived the filter'

    # --- storage check -------------------------------------------------------
    df_reg['suspicious_storage'] = (np.abs(df_reg['stored_hm3_diff']) > 2) & (np.abs(df_reg['stored_hm3_diff_relative']) > 0.05)
    df_reg['high_rain'] = (df_reg.rainfallsince_diff > 10) | (df_reg.rainfallsince_diff_1 > 10)
    df_reg['bad_storage'] = df_reg.suspicious_storage & (~df_reg.high_rain)

    # Spread each flag over its neighbours: column bad_storage_lag_k holds the
    # flag of the row k positions earlier (negative k: later) in the group.
    lag_vars = []
    for lag in range(-5, 5):
        lag_col = f'bad_storage_lag_{lag}'
        df_reg[lag_col] = df_reg.groupby(group_keys)['bad_storage'].shift(lag)
        lag_vars.append(lag_col)

    df_reg['surrounding_bad_storage'] = df_reg[lag_vars].max(axis=1)
    df_reg = df_reg.query('surrounding_bad_storage==0').copy()
    return df_reg

df_removed = remove_bad_rows(df_all)
# remove_bad_rows returns its helper columns as well; restrict the result to
# the columns df_all has.
df_removed = df_removed[df_all.columns]

# Save the filtered frame. This used to write df_all, so the file named
# "cleaned" never contained the result of remove_bad_rows.
df_removed.to_csv('./data/datasets/all_parsed_cleaned.csv', index=False)

# Last expression of the cell so the notebook displays it:
# (rows kept, rows before filtering).
len(df_removed), len(df_all)

In [26]:
def pick_monthly(df):
    """Return a copy of the rows whose ``ds`` string reads "01" at characters 8-9.

    ``ds`` must be a string column; the two characters at positions 8 and 9
    are compared with "01" (the day part if the values look like
    ``YYYY-MM-DD`` - TODO confirm the format against the parse module).
    Rows where ``ds`` is missing never match.
    """
    day_part = df['ds'].str[8:10]
    is_first_day = day_part == "01"
    return df.loc[is_first_day].copy()

# Reduce both frames to their first-of-month rows (see pick_monthly):
# df_all gives the unfiltered monthly set, df_removed the filtered one.
df_monthly = pick_monthly(df_all)
df_monthly_cleaned = pick_monthly(df_removed)

# Write each monthly frame to its own CSV, without the index column.
for _frame, _csv_path in (
    (df_monthly, './data/datasets/monthly.csv'),
    (df_monthly_cleaned, './data/datasets/monthly_cleaned.csv'),
):
    _frame.to_csv(_csv_path, index=False)