Ce fichier explore une méthode par corrélation temporelle. Pour faire la prédiction à un instant, on regarde à quel moment le signal était proche du signal à ce moment.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm

# Elementary time steps used for all date arithmetic below.
day = datetime.timedelta(days=1)
hour = datetime.timedelta(hours=1)

# Bounds of the studied period.
min_date = datetime.datetime(2009, 7, 15)
max_date = datetime.datetime(2011, 1, 1)
# Number of days left in the period once the first 365 days are set aside.
n_days = (max_date - min_date).days - 365

# Challenge datasets, read from parquet files relative to the working directory.
calendrier_challenge = pd.read_parquet("Données/challenge_data/calendrier_challenge.parquet")
questionnaire = pd.read_parquet("Données/challenge_data/questionnaire.parquet")
consos_challenge = pd.read_parquet("Données/challenge_data/consos_challenge.parquet")
temperatures = pd.read_parquet("Données/challenge_data/temperatures.parquet")

# Distinct client identifiers present in the consumption data (unordered).
all_clients = list(set(consos_challenge["id_client"]))

# Client ids split by the `participe_challenge` flag of the questionnaire.
client_challenges = np.array(
    questionnaire.loc[questionnaire["participe_challenge"] == True, "id_client"]
)
client_nochallenges = np.array(
    questionnaire.loc[questionnaire["participe_challenge"] == False, "id_client"]
)

In [3]:
def correlation(big_signal,small_signal):
    """Slide ``small_signal`` along ``big_signal`` and score every offset.

    For each offset ``k`` in ``range(len(big_signal) - len(small_signal))`` the
    window ``big_signal[k:k + len(small_signal)]`` is extracted and scored as
    ``sum(small_signal * window) / sum(window)``.

    Note that the very last alignment (the window ending on the final sample
    of ``big_signal``) is not evaluated, so two signals of equal length yield
    an empty result.

    Returns:
        A NumPy array holding one score per evaluated offset.
    """
    width = len(small_signal)
    offsets = range(len(big_signal) - width)
    windows = (big_signal[k:width + k] for k in offsets)
    return np.array([np.sum(small_signal * w) / np.sum(w) for w in windows])


def load_signal(id_client,min_time,max_time):
    """Return the ``puissance_W`` values of one client over a time interval.

    Rows of the module-level ``consos_challenge`` frame are kept when their
    ``id_client`` equals ``id_client`` and their ``horodate`` falls in the
    half-open interval ``(min_time, max_time]`` (lower bound excluded, upper
    bound included).

    Returns:
        The ``puissance_W`` column of the selected rows, as a pandas Series.
    """
    timestamps = consos_challenge["horodate"]
    selected = (
        (consos_challenge["id_client"] == id_client)
        & (timestamps > min_time)
        & (timestamps <= max_time)
    )
    return consos_challenge[selected].puissance_W


Corrélation : tous les signaux sur le même jour 8h-13h ou 18h-21h

In [4]:
def predict_corr(begin_date,end_date,idclient):
    """Predict a client's signal by copying the best-correlated reference client.

    The consumption of ``idclient`` over the look-back window
    ``(begin_date - day, begin_date]`` is centred and scaled by its RMS, then
    compared with the equally normalised signal of every client listed in the
    module-level ``client_nochallenges``. The client with the highest mean
    product (correlation) is selected, and its own signal over
    ``(begin_date, end_date]`` is returned as the prediction.

    Args:
        begin_date: start of the interval to predict (end of the look-back window).
        end_date: end of the interval to predict.
        idclient: identifier of the client to predict; never used as its own reference.

    Returns:
        ``(signal, client_id)`` where ``signal`` is a NumPy array with the
        selected client's ``puissance_W`` values on the prediction interval.
        ``(None, 0)`` is returned when the target has no sample in the
        look-back window or no reference client is comparable.
    """
    # Samples of every client inside the look-back window.
    timestamps = consos_challenge["horodate"]
    window = consos_challenge[(timestamps > begin_date - day) & (timestamps <= begin_date)]

    # Split the window per client once instead of re-filtering it per candidate.
    by_client = {
        cid: np.asarray(power, dtype=float)
        for cid, power in window.groupby("id_client")["puissance_W"]
    }

    target = by_client.get(idclient)
    if target is None or len(target) == 0:
        return None, 0
    target = target - np.mean(target)
    # Same epsilon as for the candidates: a constant target must not produce NaN.
    target = target / (np.mean(target ** 2) ** 0.5 + 1e-5)

    # Start from -inf so the best candidate is kept even when every
    # correlation is negative or zero.
    max_corr = -np.inf
    max_id = 0
    for client in client_nochallenges:
        if client == idclient:
            continue
        candidate = by_client.get(client)
        # Candidates with missing samples cannot be compared point by point.
        if candidate is None or len(candidate) != len(target):
            continue
        candidate = candidate - np.mean(candidate)
        candidate = candidate / (np.mean(candidate ** 2) ** 0.5 + 1e-5)
        corr = np.mean(candidate * target)
        if corr > max_corr:
            max_corr = corr
            max_id = client

    if max_corr == -np.inf:
        return None, max_id
    # Load the prediction signal once, for the winner only.
    max_signal = np.array(load_signal(max_id, begin_date, end_date))
    return max_signal,max_id

# Evaluate the correlation-based prediction: for every challenge client and
# every day after the first 365 days, predict the (18h, 21h] slot and keep the
# measured signal for comparison.
y_predict = []
y_test = []
for client_id in client_challenges:  # `client_id` rather than `id`: do not shadow the builtin
    for n_day in tqdm(range(n_days)):
        begin_predict = min_date+(365+n_day)*day+18*hour
        end_predict = min_date+(365+n_day)*day+21*hour
        # Measured signal on the slot, loaded once and reused as ground truth.
        signal_to_predict = np.array(load_signal(client_id,begin_predict,end_predict))
        signal,id_corr = predict_corr(begin_predict,end_predict,client_id)
        # Skip days with no prediction, no measurement, or a length mismatch;
        # they would break the stacking and subtraction below.
        if signal is None or len(signal_to_predict) == 0 or len(signal) != len(signal_to_predict):
            continue
        y_predict.append(signal)
        y_test.append(signal_to_predict)

y_test = np.array(y_test)
y_predict = np.array(y_predict)
# Root-mean-square error over all retained slots.
print(np.mean((y_test-y_predict)**2)**0.5)

# Visual check on one retained slot.
fig, ax = plt.subplots()
ax.plot(y_predict[10], label="prediction")
ax.plot(y_test[10], label="measured")
ax.set_xlabel("sample index")
ax.set_ylabel("puissance_W")
ax.set_title("Predicted vs measured signal (slot 10)")
ax.legend()
plt.show()

  0%|          | 0/307 [00:00<?, ?it/s]

 40%|████      | 68/170 [00:34<00:51,  1.98it/s]
  0%|          | 0/307 [00:34<?, ?it/s]


KeyboardInterrupt: 