# Data import

In [3]:
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [4]:

# Load the whole `environment_data.pivoted` table into `df`, indexed by the
# measurement date assembled from its year/month/day columns.
#
# NOTE(review): the credentials below are hard-coded local-development values;
# move them to environment variables or a secrets store before sharing.
# NOTE(review): the query has no ORDER BY, while later cells slice
# `arnhem_waterlevel` positionally -- confirm the view returns rows in
# chronological order.
conn_string = "host='localhost' dbname='weather_env' user='postgres' password='postgres'"
conn = psycopg2.connect(conn_string)
try:
    with conn.cursor() as cur:
        cur.execute("""SELECT * FROM environment_data.pivoted;""")
        records = cur.fetchall()
        # Capture the column names while the cursor is still open instead of
        # reading `description` from an already-closed cursor.
        column_names = [column[0] for column in cur.description]
finally:
    # Everything needed is in memory at this point, so release the connection
    # even when the query fails.
    conn.close()

df = pd.DataFrame(records, columns=column_names)
df.index = pd.to_datetime(
    df[['measure_year', 'measure_month', 'measure_day']].rename(
        columns={'measure_year': 'year', 'measure_month': 'month', 'measure_day': 'day'}
    )
)

# Similar Weeks

In [None]:
# Build one row per week and one column per weekday holding the mean
# `arnhem_waterlevel` for that day.
#
# The week label uses the ISO year together with the ISO week number. Pairing
# the calendar year with the ISO week mislabels the days around New Year
# (e.g. 1-3 January 2021 belong to ISO week 53 of 2020, not to "2021_53").
iso = df.index.isocalendar()
week_label = iso['year'].astype(str) + '_' + iso['week'].astype(str).str.zfill(2)

gdf = (
    df[['arnhem_waterlevel']]
    .groupby([week_label, df.index.weekday])
    .mean()
    .unstack(level=1)   # weekday becomes the column level
    .sort_index()       # zero-padded labels sort chronologically
    .iloc[:-1]          # drop the last week of the sorted table
)
gdf

In [None]:
# Correlate the weekday profile of every week with every other week, then keep
# the column for week 10 of 2022 ranked from most to least similar.
cdf = (
    gdf.T.corr()
    .loc[:, '2022_10']
    .sort_values(ascending=False)
    .dropna()
)
cdf

In [None]:
def next_week(week):
    """Return the label of the ISO week that follows ``week``.

    Parameters
    ----------
    week : str
        Week label formatted as ``'<year>_<week>'``, e.g. ``'2022_09'``.

    Returns
    -------
    str
        Label of the following week. The week number is zero-padded to two
        digits so the result has the same ``'YYYY_WW'`` shape as the input
        labels, and the year rolls over after the last ISO week of the year
        (week 52 or 53, depending on the year).

    Raises
    ------
    ValueError
        If ``week`` does not consist of exactly two integer parts separated
        by an underscore.
    """
    year, week = (int(part) for part in week.split('_'))
    # 28 December always lies in the last ISO week of its year, so its week
    # number tells whether this year has 52 or 53 weeks.
    last_week = datetime(year, 12, 28).isocalendar()[1]
    if week >= last_week:
        year, week = year + 1, 1
    else:
        week += 1
    return f'{year}_{week:02d}'

# Take the fifteen best-ranked week labels from `cdf` and pair each one with
# the label of the week that follows it.
old = cdf.head(15).index
new = [next_week(label) for label in old]
list(zip(old, new))

In [None]:
# Plot the weekday profile of each follow-up week, skipping any follow-up
# label that has no row in `gdf`.
following_weeks = [next_week(label) for label in old]
available_weeks = [label for label in following_weeks if label in gdf.index]
gdf.loc[available_weeks].T.plot()

# Similar `n` days

In [5]:
# Build every run of `n` consecutive `arnhem_waterlevel` values as one row.
# The row label is the position of the window's first value in `subset`, and
# windows that contain a missing value are dropped.
subset = df['arnhem_waterlevel'].values
n = 30
# `+ 1` so the final window ends on the last observation; stopping one start
# earlier would silently leave the most recent value out of every window.
window_starts = range(subset.shape[0] - n + 1)
gdf = pd.DataFrame([subset[start:start + n] for start in window_starts]).dropna()
gdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1209.0,1201.0,1195.0,1190.0,1175.0,1169.0,1154.0,1135.0,1112.0,1088.0,...,1010.0,1010.0,1020.0,1059.0,1106.0,1177.0,1188.0,1193.0,1193.0,1190.0
1,1201.0,1195.0,1190.0,1175.0,1169.0,1154.0,1135.0,1112.0,1088.0,1046.0,...,1010.0,1020.0,1059.0,1106.0,1177.0,1188.0,1193.0,1193.0,1190.0,1182.0
2,1195.0,1190.0,1175.0,1169.0,1154.0,1135.0,1112.0,1088.0,1046.0,1018.0,...,1020.0,1059.0,1106.0,1177.0,1188.0,1193.0,1193.0,1190.0,1182.0,1172.0
3,1190.0,1175.0,1169.0,1154.0,1135.0,1112.0,1088.0,1046.0,1018.0,1004.0,...,1059.0,1106.0,1177.0,1188.0,1193.0,1193.0,1190.0,1182.0,1172.0,1161.0
4,1175.0,1169.0,1154.0,1135.0,1112.0,1088.0,1046.0,1018.0,1004.0,991.0,...,1106.0,1177.0,1188.0,1193.0,1193.0,1190.0,1182.0,1172.0,1161.0,1151.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92168,821.8,826.0,821.5,820.7,819.8,823.2,829.0,830.9,819.4,806.7,...,776.0,772.1,767.4,762.3,757.9,751.7,742.5,736.7,730.6,729.4
92169,826.0,821.5,820.7,819.8,823.2,829.0,830.9,819.4,806.7,796.3,...,772.1,767.4,762.3,757.9,751.7,742.5,736.7,730.6,729.4,734.8
92170,821.5,820.7,819.8,823.2,829.0,830.9,819.4,806.7,796.3,783.0,...,767.4,762.3,757.9,751.7,742.5,736.7,730.6,729.4,734.8,742.1
92171,820.7,819.8,823.2,829.0,830.9,819.4,806.7,796.3,783.0,774.3,...,762.3,757.9,751.7,742.5,736.7,730.6,729.4,734.8,742.1,758.5


In [6]:
def corr(a, b):
    """Return the Pearson correlation of two equal-length 1-D sequences.

    Both inputs are centred on their mean and scaled by their population
    standard deviation; ``a`` is additionally divided by its length so that
    the single product sum returned by ``np.correlate`` is the correlation
    coefficient.
    """
    a_centred = a - np.mean(a)
    b_centred = b - np.mean(b)
    a_scaled = a_centred / (np.std(a) * len(a))
    b_scaled = b_centred / np.std(b)
    return np.correlate(a_scaled, b_scaled)[0]

In [7]:
# Rank every window by its correlation with the last window in `gdf`.
#
# `gdf` went through `dropna()`, so its labels may have gaps. Take the last
# row by label from the index itself rather than from the row count, and key
# the scores by `gdf.index` so each score stays attached to its own window.
month_num = gdf.index[-1]
val = gdf.loc[month_num]
cdf = (
    pd.Series([corr(val, window) for _, window in gdf.iterrows()], index=gdf.index)
    .sort_values(ascending=False)
    .dropna()
)
cdf

In [None]:
# For the twenty best-ranked windows, look up the window that starts `n` rows
# later and keep the ones that exist in `gdf`.
# Alternative selection by threshold instead of rank: cdf.loc[cdf>.95].index + n
follow_up_labels = cdf.head(20).index + n
present_labels = [label for label in follow_up_labels if label in gdf.index]
result = gdf.loc[present_labels].dropna()
likely = result.shape[0]
likely

In [None]:
# One line per selected follow-up window, with the day offset on the x-axis.
follow_up_by_day = result.transpose()
follow_up_by_day.plot()
plt.title(f'The results of the {n} days after the {likely} most similar {n} day periods')

In [None]:
# Summarise the selected windows per day offset: mean, median, max and min.
#
# All four statistics are computed from `result` in one pass. Appending them
# one by one to the same frame would feed the already-added "mean" row into
# the median (and every earlier row into the later statistics).
stat_names = ['mean', 'median', 'max', 'min']
sub = pd.concat([result, result.agg(stat_names)])
sub.T[stat_names].plot()
plt.title(f'The results of the {n} days after the {likely} most similar {n} day periods')

In [None]:
# 2-D histogram of water level (y) against day offset within the window (x).
#
# `x` and `y` must be flattened in the same order. Both are laid out window by
# window here: `y` walks each row of `result` left to right, and `x` repeats
# the column labels (the day offsets) once per row. Flattening the transposed
# values instead would order `y` day by day and pair levels with wrong offsets.
x = np.tile(result.columns.to_numpy(), result.shape[0])
y = result.to_numpy().flatten()
bins = (result.columns.size, 50)

# Hide every cell whose density is below 20% of the densest cell; drop `cmin`
# to show the full histogram.
peak_density = np.histogram2d(x, y, bins, density=True)[0].max()
ret = plt.hist2d(x, y, bins, density=True, cmin=peak_density*.2)
plt.colorbar()
plt.title(f'The results of the {n} days after the {likely} most similar {n} day periods')

In [None]:
# Distribution of all water levels in the selected windows, ignoring the day
# offset, drawn as a 50-bin step histogram; the cell then shows which window
# labels were used.
y = result.T.to_numpy().ravel()
plt.hist(y, bins=50, density=True, histtype='step')
plt.title(f'The results of the {n} days after the {likely} most similar {n} day periods')
result.index.to_numpy()