### Import des données

In [777]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime

In [778]:
# Raw weather observations for the Cork station
corkstation_csv = "Corkstation_15072019_01022011.csv"
df_corkstation = pd.read_csv(corkstation_csv)

In [779]:
# Consumption series that the weather data is later aligned with
vst_csv = "df_vst_group6.csv"
df_vst = pd.read_csv(vst_csv)

### Prétraitement


Interpolating incomplete data from df_corkstation

In [780]:
# Index the observations by their timestamp so they can be resampled by time
df_corkstation['date'] = pd.to_datetime(df_corkstation['date'])
df_corkstation = df_corkstation.set_index('date')

# Missing readings are stored as blank strings. Map whitespace-only cells
# straight to NaN instead of going through a numeric sentinel: -1 is a
# perfectly valid reading for temperature-like columns and must not be erased.
df_corkstation = df_corkstation.replace(r'^\s*$', np.nan, regex=True)

# These two columns were read as text because of the blanks; make them numeric.
# to_numeric still raises on genuinely malformed values, as before.
for column in ('wetb', 'vis'):
    df_corkstation[column] = pd.to_numeric(df_corkstation[column])

# Fill the gaps by linear interpolation between neighbouring observations
df_corkstation = df_corkstation.interpolate()

Resampling to 30-minute intervals and interpolating

In [781]:
# Put the data on a 30-minute grid; rows created by the resampling are filled
# by linear interpolation between the surrounding observations.
# '30min' is used instead of the 'T' alias, which recent pandas deprecates.
df_corkstation_resampled = df_corkstation.resample('30min').interpolate(method='linear')

Adding weekend, season and working-hours info

In [782]:
# Per-timestamp calendar features, one entry per row of the resampled frame
weekend_list, season_list, work_time_list = [], [], []

# Get season function
def get_season(date):
    """Return the season code for a timestamp.

    Seasons are half-open (month, day) ranges, so every moment of every day
    belongs to exactly one season regardless of its time of day:

        1 - spring: 21 March      .. 19 June
        2 - summer: 20 June       .. 20 September
        3 - fall:   21 September  .. 20 December
        4 - winter: 21 December   .. 20 March

    Parameters
    ----------
    date : datetime-like
        Any object exposing ``month`` and ``day`` attributes
        (``pd.Timestamp``, ``datetime.datetime``, ...).

    Returns
    -------
    int
        1, 2, 3 or 4 as listed above.
    """
    # Comparing (month, day) tuples ignores both the year and the time of day,
    # which avoids gaps between one season's last midnight and the next
    # season's first midnight.
    month_day = (date.month, date.day)

    if (3, 21) <= month_day < (6, 20):
        return 1
    if (6, 20) <= month_day < (9, 21):
        return 2
    if (9, 21) <= month_day < (12, 21):
        return 3
    # Everything else wraps around the new year: 21 December .. 20 March
    return 4
    
# Derive the calendar features directly from the DatetimeIndex
resampled_index = df_corkstation_resampled.index

# 1 on Saturday (5) / Sunday (6), 0 otherwise
weekend_list = (resampled_index.dayofweek >= 5).astype('int64').tolist()

# Season code (1-4) as returned by get_season
season_list = [get_season(timestamp) for timestamp in resampled_index]

# 1 when the hour is between 8 and 18 inclusive, 0 otherwise
work_time_list = (
    (resampled_index.hour >= 8) & (resampled_index.hour <= 18)
).astype('int64').tolist()

# One frame holding the three features, aligned on the weather index.
# The working-hours flag gets its own 'work_time' column; it was previously
# stored under a second 'season' key, producing two columns with the same name.
time_info = pd.DataFrame(
    {
        'weekend': weekend_list,
        'season': season_list,
        'work_time': work_time_list,
    },
    index=resampled_index,
)

# Attach the features to the weather data. Any feature column left over from
# a previous run of this cell is dropped first, so re-running the cell
# overwrites the features instead of appending duplicates.
df_corkstation_resampled = (
    df_corkstation_resampled
    .drop(columns=list(time_info.columns), errors='ignore')
    .join(time_info)
)

Cropping both dataframes at the same final timestamp so the weather data matches the consumption data

In [783]:
# Last timestamp kept in both frames (inclusive)
date_to_be_cropped = datetime.datetime(year=2010, month=12, day=14, hour=23, minute=0)

# Weather side: keep every row up to and including the cutoff
keep_weather = df_corkstation_resampled.index <= date_to_be_cropped
df_corkstation_cropped = df_corkstation_resampled.loc[keep_weather]

# Consumption side: parse the timestamps, index by them, apply the same cutoff
df_vst = df_vst.assign(time=pd.to_datetime(df_vst['time'])).set_index('time')
keep_vst = df_vst.index <= date_to_be_cropped
df_vst_cropped = df_vst.loc[keep_vst]

Merging two dataframes

In [784]:
# Remove the weather frame's 'temp' column before the two frames are joined
df_corkstation_cropped = df_corkstation_cropped.drop(['temp'], axis='columns')

# Side-by-side join on the shared datetime index
frames_to_merge = [df_corkstation_cropped, df_vst_cropped]
df_to_train_and_test = pd.concat(frames_to_merge, axis='columns')
df_to_train_and_test


Unnamed: 0,ind,rain,ind.1,ind.2,wetb,dewpt,vappr,rhum,msl,ind.3,...,w,sun,vis,clht,clamt,weekend,season,season.1,mean,temp
2009-07-15 00:00:00,3.0,0.00,0.0,0.0,11.40,11.30,13.30,98.0,1002.80,2.0,...,81.0,0.0,25000.0,999.0,1.0,0,2,0,0.341659,11.60
2009-07-15 00:30:00,3.0,0.00,0.0,0.0,11.25,11.05,13.15,97.0,1003.15,2.0,...,46.0,0.0,27500.0,999.0,1.0,0,2,0,0.274463,11.50
2009-07-15 01:00:00,3.0,0.00,0.0,0.0,11.10,10.80,13.00,96.0,1003.50,2.0,...,11.0,0.0,30000.0,999.0,1.0,0,2,0,0.231365,11.40
2009-07-15 01:30:00,3.0,0.05,0.0,0.0,11.00,10.70,12.90,96.0,1003.95,2.0,...,11.0,0.0,30000.0,999.0,1.0,0,2,0,0.202708,11.30
2009-07-15 02:00:00,3.0,0.10,0.0,0.0,10.90,10.60,12.80,96.0,1004.40,2.0,...,11.0,0.0,30000.0,999.0,1.0,0,2,0,0.182256,11.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-14 21:00:00,3.0,0.00,0.0,1.0,-0.30,-1.10,5.70,92.0,1041.60,2.0,...,0.0,0.0,8000.0,999.0,0.0,0,3,0,0.956582,0.10
2010-12-14 21:30:00,3.0,0.00,0.5,1.0,-0.35,-1.05,5.70,93.0,1041.80,2.0,...,0.0,0.0,8000.0,999.0,0.0,0,3,0,0.890195,0.00
2010-12-14 22:00:00,3.0,0.00,1.0,1.0,-0.40,-1.00,5.70,94.0,1042.00,2.0,...,0.0,0.0,8000.0,999.0,0.0,0,3,0,0.849046,-0.10
2010-12-14 22:30:00,3.0,0.00,0.5,1.0,-0.30,-0.95,5.70,93.0,1042.20,2.0,...,0.0,0.0,8000.0,999.0,0.0,0,3,0,0.779234,0.05
