### Import des données

In [955]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [932]:
# Raw weather-station export; later cells read its 'date', 'wetb', 'vis' and 'temp' columns.
df_corkstation = pd.read_csv(filepath_or_buffer="Corkstation_15072019_01022011.csv")

In [933]:
# Consumption data; later cells read its 'time' column and use 'mean' as the regression target.
df_vst = pd.read_csv(filepath_or_buffer="df_vst_group6.csv")

### Prétraitement


Interpolating incomplete data from df_corkstation

In [934]:
# Use the observation timestamp as the index so the frame can be resampled by time.
df_corkstation['date'] = pd.to_datetime(df_corkstation['date'])
df_corkstation = df_corkstation.set_index('date')

# 'wetb' and 'vis' need an explicit numeric conversion (presumably because missing
# readings are stored as blank strings — confirm against the raw CSV).
# errors='coerce' turns anything unparseable into NaN directly, instead of going
# through a -1 sentinel: the sentinel approach also wiped out genuine -1 readings
# in every column and rewrote any space inside a string value.
for numeric_column in ('wetb', 'vis'):
    df_corkstation[numeric_column] = pd.to_numeric(df_corkstation[numeric_column], errors='coerce')

# Fill the gaps (NaN) by linear interpolation between neighbouring rows.
df_corkstation = df_corkstation.interpolate()

Resampling to 30-minute intervals and interpolating

In [935]:
# Resample onto a regular 30-minute grid and fill the newly created rows by
# linear interpolation. '30min' replaces the deprecated '30T' offset alias.
df_corkstation_resampled = df_corkstation.resample('30min').interpolate(method='linear')

Adding weekend, season, working-hours and calendar features

In [936]:
# One independent, empty accumulator per calendar feature; they are filled
# while iterating over the resampled timestamps.
(
    weekend_list,
    season_list,
    work_time_list,
    year_list,
    month_list,
    day_list,
    hour_list,
    minute_list,
    weekofyear_list,
) = ([] for _ in range(9))

# Get season function
def get_season(date):
    """Return the season code for a date or timestamp.

    The season is decided from the calendar day only, so every time of day
    on a boundary date gets the same season:

        1 (spring): 21 March     - 19 June
        2 (summer): 20 June      - 20 September
        3 (fall):   21 September - 20 December
        4 (winter): 21 December  - 20 March

    Parameters
    ----------
    date : object exposing ``month`` and ``day`` attributes
        For example ``pandas.Timestamp`` or ``datetime.datetime``.

    Returns
    -------
    int
        1, 2, 3 or 4 as listed above.
    """
    # Comparing (month, day) tuples avoids midnight-timestamp bounds, which
    # misfiled e.g. 20 September 12:00 as winter.
    month_day = (date.month, date.day)

    if (3, 21) <= month_day < (6, 20):
        return 1
    if (6, 20) <= month_day < (9, 21):
        return 2
    if (9, 21) <= month_day < (12, 21):
        return 3
    return 4
    
# Derive the calendar features for every resampled timestamp.
timestamps = df_corkstation_resampled.index.tolist()

# weekday() is 5 for Saturday and 6 for Sunday.
weekend_list.extend(1 if ts.weekday() >= 5 else 0 for ts in timestamps)
season_list.extend(get_season(ts) for ts in timestamps)
# Hours 8 through 18 inclusive are flagged as working time.
work_time_list.extend(1 if 8 <= ts.hour <= 18 else 0 for ts in timestamps)
year_list.extend(ts.year for ts in timestamps)
month_list.extend(ts.month for ts in timestamps)
day_list.extend(ts.day for ts in timestamps)
hour_list.extend(ts.hour for ts in timestamps)
minute_list.extend(ts.minute for ts in timestamps)
weekofyear_list.extend(ts.weekofyear for ts in timestamps)

# One single-column frame per feature (names kept in case other cells use them).
weekend_df = pd.DataFrame({'weekend': weekend_list})
season_df = pd.DataFrame({'season': season_list})
work_time_df = pd.DataFrame({'work_time': work_time_list})
year_df = pd.DataFrame({'year': year_list})
month_df = pd.DataFrame({'month': month_list})
day_df = pd.DataFrame({'day': day_list})
hour_df = pd.DataFrame({'hour': hour_list})
minute_df = pd.DataFrame({'minute': minute_list})
weekofyear_df = pd.DataFrame({'weekofyear': weekofyear_list})

# Assemble the features side by side and align them on the weather index.
time_info = pd.concat(
    [weekend_df, season_df, work_time_df, year_df, month_df, day_df, hour_df, minute_df, weekofyear_df],
    axis=1,
)
time_info.set_index(df_corkstation_resampled.index, inplace=True)

# Drop feature columns left over from a previous run of this cell before
# concatenating; otherwise re-running the cell appends a duplicate of every
# feature column. NOTE(review): assumes the raw weather data has no columns
# with these feature names — confirm.
df_corkstation_resampled = pd.concat(
    [df_corkstation_resampled.drop(columns=time_info.columns, errors='ignore'), time_info],
    axis=1,
)

Cropping both dataframes at the same cut-off date so that the weather data matches the period covered by the consumption data

In [937]:
# Last timestamp (inclusive) kept in both the weather and the consumption data.
date_to_be_cropped = datetime.datetime(year=2010, month=12, day=14, hour=23, minute=0)
df_corkstation_cropped = df_corkstation_resampled.loc[df_corkstation_resampled.index <= date_to_be_cropped]

# Index the consumption data by its parsed 'time' column, then apply the same cut-off.
df_vst = df_vst.assign(time=pd.to_datetime(df_vst['time'])).set_index('time')
df_vst_cropped = df_vst.loc[df_vst.index <= date_to_be_cropped]

Merging two dataframes

In [938]:
# Remove the weather station's 'temp' column before merging.
# NOTE(review): the model cell later selects 'temp' from the merged frame, so it
# presumably comes from df_vst — confirm.
df_corkstation_cropped = df_corkstation_cropped.drop('temp', axis=1)
# Column-wise concatenation aligns the two frames on their datetime index.
df_to_train_and_test = pd.concat((df_corkstation_cropped, df_vst_cropped), axis='columns')

In [939]:
# Disabled sanity-check plot of the 'mean' column up to 2010-04-01.
# Caution: if re-enabled as written, it overwrites date_to_be_cropped.
#plt.figure(figsize=(13, 5))
#date_to_be_cropped = datetime.datetime(2010, 4, 1, 0, 0)
#df_plot = df_to_train_and_test[df_to_train_and_test.index <= date_to_be_cropped]
#plt.plot(df_plot.index, df_plot['mean'])

Random Forest

In [953]:
# Feature columns fed to the model. 'hour' replaces a duplicated 'year' entry:
# the hour feature was built during preprocessing but never selected, while
# 'year' was listed twice and produced two identical columns.
parameters = ['work_time', 'season', 'rhum', 'temp', 'vappr', 'year', 'month', 'day', 'hour', 'minute', 'weekofyear']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_to_train_and_test[parameters],
    df_to_train_and_test['mean'],
    test_size=0.2,
    random_state=42,
)

# Seed the forest as well, so that a Restart & Run All yields the same model.
regr = RandomForestRegressor(random_state=42)
regr.fit(x_train, y_train)

GridSearchCV

In [958]:
# Hyper-parameter grid for the random forest.
# Cost warning: 4 * 6 * 4 * 4 = 384 candidates; with GridSearchCV's default
# 5-fold cross-validation that is 1,920 fits, some with 500 trees. Expect a very
# long run — shrink the grid first if a quick result is needed.
parameters = {
    'criterion': ('squared_error', 'absolute_error', 'friedman_mse', 'poisson'),
    'max_depth': [None, 1, 2, 3, 4, 5],
    'n_estimators': [50, 100, 200, 500],
    'max_features': (1, 'sqrt', 'log2', 10),
    # Single value: not tuned, only makes every candidate forest use all cores.
    'n_jobs': [-1],
}

clf_gridSearch = GridSearchCV(regr, parameters)
clf_gridSearch.fit(x_train, y_train)

# Show the ten best candidates as a ranked table rather than dumping the raw
# cv_results_ dict (one entry per candidate and per metric).
pd.DataFrame(clf_gridSearch.cv_results_).sort_values('rank_test_score').head(10)

#y_train_predicted = regr.predict(x_train)
#score_train = regr.score(x_train, y_train)


KeyboardInterrupt: 