In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

import preprocessing as pp

In [None]:
# Load the raw CSV into a DataFrame and display it as the cell output.
CSV_PATH = 'consumption_and_temperatures.csv'
raw_data = pd.read_csv(CSV_PATH)
raw_data

### Look for missing (NA) values

In [None]:
# Count missing values per column and list the columns with the most gaps first.
na_counts = raw_data.isna().sum()
na_counts.sort_values(ascending=False)

In [None]:
# NOTE(review): `DataFrame.isnull` is an alias of `DataFrame.isna`, so this
# cell reports the same per-column counts as the `isna()` cell before it.
raw_data.isnull().sum().sort_values(ascending=False)

In [None]:
# Parse the 'timestamp' column into pandas datetimes (overwrites the column on
# raw_data) so it can serve as the time axis in the plots below.
raw_data['timestamp'] = pd.to_datetime(raw_data['timestamp'])

### Plot inputs and targets

In [None]:
# One figure per location (1..5): temperature on the top panel, consumption on
# the bottom panel, both sharing the timestamp axis.
for i in range(1, 6):
    fig, axs = plt.subplots(2, 1, figsize=(30, 20), sharex=True)

    raw_data[['timestamp', f'NO{i}_temperature']].set_index('timestamp').plot(ax=axs[0], title=f'temperature on location {i}')
    raw_data[['timestamp', f'NO{i}_consumption']].set_index('timestamp').plot(ax=axs[1], title=f'consumption on location {i}')

    # Label the value axes so each panel can be read on its own.
    axs[0].set_ylabel(f'NO{i}_temperature')
    axs[1].set_ylabel(f'NO{i}_consumption')

    # The original ended with `plt.plot()`, which draws nothing when called
    # without arguments; `plt.show()` renders the figure for this location.
    plt.show()

### Look for correlations

In [None]:
# Pearson correlation between all numeric columns of the raw data, drawn as an
# annotated heatmap on a fresh figure.
corr_matrix = raw_data.corr(method='pearson', numeric_only=True)
plt.figure()
sns.heatmap(corr_matrix, annot=True, cmap=plt.cm.Reds)


In [None]:
# Build the season-augmented frame with the project helper (its second return
# value is discarded), then draw the Pearson correlation heatmap of its numeric
# columns on a larger figure.
data_seasons, _ = pp.add_season_columns(raw_data)
season_corr = data_seasons.corr(method='pearson', numeric_only=True)
plt.figure(figsize=(20, 20))
sns.heatmap(season_corr, annot=True, cmap=plt.cm.Reds)

### Plot output by input

In [None]:
# Scatter consumption against temperature for each location (1..5), one figure
# per location. Low alpha keeps dense regions readable.
for i in range(1, 6):
    temperature_col = f'NO{i}_temperature'
    consumption_col = f'NO{i}_consumption'

    fig, ax = plt.subplots()
    ax.scatter(raw_data[temperature_col], raw_data[consumption_col], alpha=0.2)
    ax.set_title(f'NO{i}_consumption by NO{i}_temperature')
    ax.set_xlabel(temperature_col)
    ax.set_ylabel(consumption_col)

    # The original ended with `plt.plot()`, which draws nothing when called
    # without arguments; `plt.show()` renders the figure for this location.
    plt.show()

In [None]:
# Build the hour-augmented frame with the project helper; the second return
# value is discarded. Later cells read columns named `hour_0` .. `hour_23`
# from this frame.
data_hours, _ = pp.add_hour_columns(raw_data)
# Show the resulting column names as the cell output.
data_hours.columns

In [None]:
# For each location (1..5), overlay one line per hour of the day: temperature
# on the top panel, consumption on the bottom panel, sharing the time axis.
for i in range(1, 6):
    fig, axs = plt.subplots(2, 1, figsize=(30, 40), sharex=True)
    for h in range(24):
        # Rows flagged for this hour, indexed by timestamp.
        # NOTE(review): assumes `hour_{h}` is a boolean column — confirm
        # against pp.add_hour_columns.
        hour_rows = data_hours[data_hours[f'hour_{h}']].set_index('timestamp')

        # Plot Series rather than single-column DataFrames: DataFrame.plot
        # replaces `label` with the column name, which made all 24 legend
        # entries identical. Series.plot uses `label` as the legend entry.
        hour_rows[f'NO{i}_temperature'].plot(ax=axs[0], label=f'temp hour={h}')
        hour_rows[f'NO{i}_consumption'].plot(ax=axs[1], label=f'cons hour={h}')

    axs[0].set_title(f'temperature on location {i}')
    axs[1].set_title(f'consumption on location {i}')

    # `plt.legend()` only reaches the current (last) axes, so build the legend
    # on each panel explicitly.
    axs[0].legend()
    axs[1].legend()

    # `plt.plot()` without arguments draws nothing; show the finished figure.
    plt.show()

In [None]:
# Column names of the five consumption series (NO1 .. NO5).
targets = [f'NO{loc}_consumption' for loc in range(1, 6)]

In [None]:
# Column names of the five temperature series (NO1 .. NO5).
inputs = [f'NO{loc}_temperature' for loc in range(1, 6)]

In [None]:
# Columns examined by the anomaly check below: the temperature inputs only.
# The original first assigned every non-timestamp column and then overwrote it
# on the next line; that dead assignment is removed. To inspect every column
# instead, use `raw_data.drop(columns='timestamp').columns`.
columns_to_plot = inputs

In [None]:
# Manual anomaly check: flag values that lie more than ANOMALY_STD_FACTOR
# standard deviations away from their column mean, then plot each series with
# the flagged points highlighted in red.
ANOMALY_STD_FACTOR = 3.5

df = raw_data.copy()

# Per-column statistics. The original also stored the whole timestamp Series
# as an extra element of `mean` and `std`, which served no purpose and turned
# both into object-dtype Series; they now hold only the numeric statistics.
mean = df[columns_to_plot].mean()
std = df[columns_to_plot].std()

threshold = ANOMALY_STD_FACTOR * std
anomalies = (df[columns_to_plot] - mean).abs() > threshold

# Keep the flagged values and blank everything else to NaN (scatter skips
# NaN points). `where` + `assign` builds a new frame instead of mutating the
# result of a chained selection.
df_anomalies = df[columns_to_plot].where(anomalies).assign(timestamp=df['timestamp'])

for target in columns_to_plot:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df['timestamp'], df[target], zorder=1)
    ax.scatter(df_anomalies['timestamp'], df_anomalies[target], color='red', label='Anomalies', zorder=2)
    ax.set_title(f'{target} with Anomalies Detected by Standard Deviation Thresholding')
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    # Without a legend the 'Anomalies' label was never displayed.
    ax.legend()
    plt.show()

In [None]:
# Run the project's anomaly helper on the raw data with threshold=3.5 (the same
# factor as the manual check above).
# NOTE(review): this rebinds `df_anomalies`, replacing the masked frame built
# in the previous cell; what the helper returns is not visible here — confirm
# in the preprocessing module.
df_anomalies = pp.manage_anomalies(raw_data, threshold=3.5)

In [None]:
# Split the frame into two consecutive parts at a fixed row position.
SPLIT_ROW = 30000
df1, df2 = df_anomalies[:SPLIT_ROW], df_anomalies[SPLIT_ROW:]

In [None]:
# Column-wise maxima of both parts, displayed as a tuple for comparison.
df1.max(), df2.max()

In [None]:
# Cap every non-timestamp column of df2 to the [min, max] range observed in
# the same column of df1.
columns = df1.drop(columns='timestamp').columns

# Compute the bounds once instead of re-running df1.max()/df1.min() for every
# column inside the loop.
lower_bounds = df1[columns].min()
upper_bounds = df1[columns].max()

# df2 is a slice of df_anomalies. The original chained assignment
# (`df2[column][mask] = value`) raises SettingWithCopyWarning and does not
# write at all under pandas Copy-on-Write, so clip on an explicit copy.
df2 = df2.copy()
for column in columns:
    df2[column] = df2[column].clip(lower=lower_bounds[column], upper=upper_bounds[column])

In [None]:
# Summary statistics of df2, to inspect the value ranges after the capping
# step in the previous cell.
df2.describe()