In [14]:
import os
import json
import time
import numpy as np
import pandas as pd
import pickle
from utils.meteo import load_meteo
from tqdm import tqdm 
from tqdm.notebook import tqdm_notebook
from multiprocessing import Pool
from pandarallel import pandarallel
from matplotlib import pyplot as plt

# Hook tqdm into pandas.
# NOTE(review): presumably this enables `progress_apply`; no such call is visible
# in this notebook — confirm it is still needed.
tqdm.pandas()
# Start pandarallel without a progress bar; `DataFrame.parallel_apply` is used
# further down to compute the meteo features row by row.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


  from pandas import Panel


In [17]:
# Select the target hydrological gauging stations
target_station_ids = [6005, 6022, 6296, 6027, 5004, 5012, 5024, 5805]

# Load the precomputed mappings from a station to its nearest hydrological
# stations and to its nearest meteo stations. Later cells iterate the values as
# (neighbour_id, distance) pairs.
# NOTE(review): pickle.load can execute arbitrary code — only load files you trust.
with open("temp_data/nearest_stantions.pkl", "rb") as f:
    nearest_stantions = pickle.load(f)
    
with open("temp_data/nearest_meteo.pkl", "rb") as f:
    nearest_meteo = pickle.load(f)

In [3]:
# Meteo time series keyed by integer station id.
meteo = {}

# Station ids are the file stems of every CSV found in either meteo folder;
# the set removes ids that are present in both folders.
meteo_ids = list({
    file_name.split(".")[0]
    for folder in ("datasets/meteo_new/", "datasets/meteo/")
    for file_name in os.listdir(folder)
    if file_name.endswith(".csv")
})

# Load each station's table and interpolate its missing values.
for meteo_id in tqdm_notebook(meteo_ids):
    meteo[int(meteo_id)] = load_meteo(meteo_id).interpolate()

# Column layout taken from one reference station.
# NOTE(review): assumes station 4443141 is always among the loaded files — confirm.
meteo_columns = meteo[4443141].columns

HBox(children=(FloatProgress(value=0.0, max=114.0), HTML(value='')))




In [20]:
def calculate_meteo(item):
    """Distance-weighted meteo features for one hydrological observation.

    Parameters
    ----------
    item : mapping or pandas.Series
        Must provide ``item["identifier"]`` (a key of the global
        ``nearest_meteo`` mapping) and ``item["time"]`` (the label looked up
        in each meteo table of the global ``meteo`` dict).

    Returns
    -------
    pandas.Series
        For every meteo column, the average over the nearby meteo stations
        that have a record at ``item["time"]``, weighted by
        ``exp(-distance / 10)``; stations whose value for a column is NaN are
        left out of that column's average. Two extra entries are appended:
        ``"min_stantion_dist"`` (smallest distance among the stations used)
        and ``"mean_stantion_dist"`` (the same exponential weighting applied
        to the distances themselves). All values are rounded to 2 decimals.
        An empty float32 Series is returned when no nearby station has a
        record for that time.
    """
    identifier = item["identifier"]
    neighbours = nearest_meteo[identifier]
    when = item["time"]

    distances = []
    observations = []
    for meteo_id, distance in neighbours:
        try:
            observation = meteo[meteo_id].loc[when]
        except KeyError:
            # Unknown meteo station, or no record for this timestamp.
            continue
        observations.append(observation)
        distances.append(distance)

    if len(observations) == 0:
        return pd.Series(dtype=np.float32)

    # Closer stations get exponentially larger weights.
    weights = np.exp(-(np.array(distances) / 10))
    weight_row = weights[np.newaxis, :]

    # One column per station, one row per meteo variable.
    table = pd.concat(observations, axis=1)
    available = table.notna().astype(np.float32)

    weighted_total = (table.fillna(0) * weight_row).sum(axis=1)
    # Per variable, only the weights of stations that reported a value.
    weight_total = (available * weight_row).sum(axis=1)

    result = weighted_total / weight_total
    result["min_stantion_dist"] = min(distances)

    weights_sum = weights.sum()
    result["mean_stantion_dist"] = sum(
        distance * weight / weights_sum
        for distance, weight in zip(distances, weights)
    )

    return result.round(2)

In [21]:
# Neighbour stations actually used for each target station; filled in by the
# nearest-station loop below as {station_id: [neighbour_entry, ...]}.
nearest_stantions_ = {}

In [22]:
# Read the raw hydrological observations, parse the timestamps and keep only
# the stations that have an entry in the nearest-meteo mapping.
dataset = (
    pd.read_csv("datasets/hydro_2018-2020/new_data_all.csv", sep=";")
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .loc[lambda frame: frame["identifier"].isin(nearest_meteo)]
    .sort_index()
)

def add_missing_dates(data, identifier):
    """Expand one station's records onto a gap-free daily calendar.

    Parameters
    ----------
    data : pandas.DataFrame
        Records of a single station; must contain a datetime ``"time"`` column.
    identifier
        Value written into the ``"identifier"`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per calendar day between the first and last
        timestamp (days absent from ``data`` hold NaN in the data columns),
        ``"time"`` as a regular column and ``"identifier"`` set on every row.
    """
    daily = (
        data.set_index("time")
        .sort_index()
        .asfreq("D")  # inserts the missing days as all-NaN rows
        .reset_index()
    )
    return daily.assign(identifier=identifier)


# Re-index every station onto a continuous daily calendar.
# groupby always hands over a DataFrame per station, in order of first
# appearance (sort=False). The former `dataset.loc[identifier]` returned a
# Series for a station with exactly one row, which made add_missing_dates fail,
# and it re-scanned the whole index once per station.
dataset = dataset.set_index("identifier")
dataset = pd.concat([
    add_missing_dates(station_rows, identifier)
    for identifier, station_rows in dataset.groupby(level="identifier", sort=False)
])

# Keep only the stations whose records start before 1997 and end after 2012.
time_span = dataset.groupby("identifier")["time"].agg(["min", "max"])
long_history = time_span.index[
    (time_span["min"].dt.year < 1997) & (time_span["max"].dt.year > 2012)
]
dataset = dataset[dataset["identifier"].isin(long_history)]

dataset = dataset.rename(columns={"max_level": "level"})
# Ids of all stations that survived the filters.
stantion_ids = dataset["identifier"].unique()

In [23]:
# Attach the level and distance of up to three nearest hydrological stations
# to every selected station.

dataset = dataset.set_index(["identifier", "time"])

new_dataset = []

# All target stations, followed by the first 30 of the remaining stations.
stantions = target_station_ids + [i for i in stantion_ids if i not in target_station_ids][:30]

for stantion_id in stantions:
    # Work on an explicit copy: writing new columns into a slice of `dataset`
    # raises SettingWithCopyWarning and is not guaranteed to take effect.
    d = dataset.loc[stantion_id].copy()
    d["identifier"] = stantion_id

    # Nearest stations that are present in the dataset, at most three of them.
    nst = [i for i in nearest_stantions[stantion_id] if i[0] in stantion_ids][:3]

    print("Fot stantion", stantion_id, "number of near_stantions is", len(nst))

    for i in range(3):
        level_col = "near_level_{}".format(i + 1)
        dist_col = "near_dist_{}".format(i + 1)

        if len(nst) > i:
            if stantion_id in target_station_ids:
                # Remember which neighbours were used for the target stations.
                nearest_stantions_[stantion_id] = nearest_stantions_.get(stantion_id, []) + [nst[i]]
            # Aligned on the time index of `d`; days the neighbour lacks become NaN.
            d[level_col] = dataset.loc[nst[i][0]]["level"]
            d[dist_col] = nst[i][1]
            # -1 marks "no neighbour value" in both columns. A single .loc
            # assignment replaces the former chained `d[col][mask] = -1`.
            d.loc[d[level_col].isna(), dist_col] = -1
            d[level_col] = d[level_col].fillna(-1)
        else:
            if stantion_id in target_station_ids:
                # Make sure the target station has an entry even without neighbours.
                nearest_stantions_[stantion_id] = nearest_stantions_.get(stantion_id, [])
            d[level_col] = -1
            d[dist_col] = -1

    new_dataset.append(d)

dataset = pd.concat(new_dataset).reset_index()
dataset = dataset.set_index(["identifier", "time"])

Fot stantion 6005 number of near_stantions is 2
Fot stantion 6022 number of near_stantions is 3
Fot stantion 6296 number of near_stantions is 3
Fot stantion 6027 number of near_stantions is 3
Fot stantion 5004 number of near_stantions is 3
Fot stantion 5012 number of near_stantions is 3
Fot stantion 5024 number of near_stantions is 3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fot stantion 5805 number of near_stantions is 3
Fot stantion 5116 number of near_stantions is 3
Fot stantion 5292 number of near_stantions is 3
Fot stantion 5216 number of near_stantions is 3
Fot stantion 5094 number of near_stantions is 3
Fot stantion 5740 number of near_stantions is 3
Fot stantion 6437 number of near_stantions is 3
Fot stantion 5354 number of near_stantions is 3
Fot stantion 5383 number of near_stantions is 3
Fot stantion 5190 number of near_stantions is 3
Fot stantion 6496 number of near_stantions is 3
Fot stantion 6016 number of near_stantions is 3
Fot stantion 6345 number of near_stantions is 3
Fot stantion 5446 number of near_stantions is 3
Fot stantion 5135 number of near_stantions is 3
Fot stantion 6387 number of near_stantions is 3
Fot stantion 6287 number of near_stantions is 3
Fot stantion 6443 number of near_stantions is 3
Fot stantion 5026 number of near_stantions is 3
Fot stantion 5364 number of near_stantions is 3
Fot stantion 5166 number of near_stantio

In [24]:
# Inspect the neighbours recorded for the target stations
nearest_stantions_

{6005: [(6246, 93.8), (6545, 989.0)],
 6022: [(6020, 60.2), (6549, 140.1), (6016, 156.4)],
 6296: [(6022, 6.1), (6023, 17.4), (6020, 60.5)],
 6027: [(6507, 55.7), (6026, 82.0), (6443, 92.7)],
 5004: [(5675, 22.3), (5044, 39.9), (5002, 73.8)],
 5012: [(5013, 4.3), (5009, 34.0), (5008, 41.3)],
 5024: [(5415, 33.3), (5420, 35.9), (5663, 55.7)],
 5805: [(5803, 58.1), (5664, 79.2), (5033, 84.1)]}

In [25]:
# Turn the (identifier, time) index back into ordinary columns;
# calculate_meteo reads both fields from each row.
dataset = dataset.reset_index()

In [26]:
# Preview the dataset with the neighbour level/distance columns.
# NOTE(review): consider `dataset.head()` to keep the rendered output small.
dataset

Unnamed: 0,identifier,time,level,near_level_1,near_dist_1,near_level_2,near_dist_2,near_level_3,near_dist_3
0,6005,1984-01-01,80.0,30.0,93.8,-1.0,-1.0,-1.0,-1.0
1,6005,1984-01-02,76.0,30.0,93.8,-1.0,-1.0,-1.0,-1.0
2,6005,1984-01-03,70.0,27.0,93.8,-1.0,-1.0,-1.0,-1.0
3,6005,1984-01-04,69.0,25.0,93.8,-1.0,-1.0,-1.0,-1.0
4,6005,1984-01-05,69.0,23.0,93.8,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...
495662,5020,2020-09-27,598.0,-1.0,-1.0,489.0,47.6,-1.0,-1.0
495663,5020,2020-09-28,603.0,-1.0,-1.0,485.0,47.6,-1.0,-1.0
495664,5020,2020-09-29,603.0,-1.0,-1.0,481.0,47.6,-1.0,-1.0
495665,5020,2020-09-30,603.0,-1.0,-1.0,475.0,47.6,-1.0,-1.0


In [27]:
# Compute the distance-weighted meteo features for every row, in parallel (pandarallel).
meteo_data = dataset.parallel_apply(calculate_meteo, axis=1)
# Place the meteo features next to the hydrological columns.
dataset_with_meteo = pd.concat((dataset, meteo_data), axis=1)

In [30]:
# Save the final dataset as CSV, without writing the row index.
dataset_with_meteo.to_csv("datasets/final_dataset.csv", index=False)