# THIS FILE IS FOR MERGING EXT DATA

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Look into counter train and test sets

In [3]:
data_train = pd.read_parquet(Path("data") / "train.parquet")
data_test = pd.read_parquet(Path("data") / "test.parquet")

In [4]:
data_all = pd.concat([data_train, data_test])
data_all = data_all.sort_values(by=['counter_name', 'date'])

In [5]:
data_all.date.max()

Timestamp('2021-09-09 23:00:00')

In [6]:
data_train.date.max()

Timestamp('2021-08-09 23:00:00')

In [9]:
data_test.date.max()

Timestamp('2021-09-09 23:00:00')

In [99]:
data_all.nunique(axis=0)

counter_id                     56
counter_name                   56
site_id                        30
site_name                      30
bike_count                    998
date                         8973
counter_installation_date      22
coordinates                    30
counter_technical_id           30
latitude                       30
longitude                      30
log_bike_count                998
dtype: int64

# Look into weather ww

In [100]:
weather = pd.read_csv('ext_data/weather.csv')
weather["date"] = pd.to_datetime(weather["date"], format = "%Y-%m-%d %H:%M:%S")
weather = weather.loc[:, weather.columns.intersection(['date','ww','w1','w2'])]
weather

Unnamed: 0,date,ww,w1,w2
0,2021-01-01 00:00:00,2,0.0,0.0
1,2021-01-01 03:00:00,40,0.0,0.0
2,2021-01-01 06:00:00,3,1.0,1.0
3,2021-01-01 09:00:00,10,1.0,1.0
4,2021-01-01 12:00:00,2,2.0,2.0
...,...,...,...,...
3317,2020-09-30 09:00:00,3,2.0,2.0
3318,2020-09-30 12:00:00,1,2.0,2.0
3319,2020-09-30 15:00:00,1,2.0,2.0
3320,2020-09-30 18:00:00,3,2.0,2.0


In [101]:
weather.nunique(axis=0)

date    3321
ww        42
w1         9
w2         9
dtype: int64

In [102]:
weather.date.max()

Timestamp('2021-10-21 12:00:00')

In [103]:
weather[weather.duplicated(keep=False)]

Unnamed: 0,date,ww,w1,w2
2017,2020-11-20 18:00:00,1,1.0,1.0
2018,2020-11-20 18:00:00,1,1.0,1.0


In [104]:
weather.drop(2017, inplace=True)

In [105]:
weather.isna().sum()

date     0
ww       0
w1       7
w2      10
dtype: int64

In [106]:
weather.dropna(inplace = True)

In [107]:
weather_0 = weather.loc[:, weather.columns.intersection(['date','ww'])]
weather_1 = weather.loc[:, weather.columns.intersection(['date','w1'])]
weather_2 = weather.loc[:, weather.columns.intersection(['date','w2'])]

In [108]:
weather_1["date"] = weather_1["date"] + pd.Timedelta(hours=-1)
weather_2["date"] = weather_2["date"] + pd.Timedelta(hours=-2)

In [109]:
weather_0.nunique(axis=0)

date    3311
ww        42
dtype: int64

 # Include confinement

In [110]:
confinement = pd.read_csv('ext_data\confinement.csv', sep=';')

In [111]:
confinement["date"] = pd.to_datetime(confinement["date"], format = "%Y-%m-%d %H:%M:%S")

In [112]:
confinement.drop('Unnamed: 0', axis=1,inplace=True)
confinement

Unnamed: 0,date,Confinement ce jour
0,2020-09-01 00:00:00,0
1,2020-09-01 01:00:00,0
2,2020-09-01 02:00:00,0
3,2020-09-01 03:00:00,0
4,2020-09-01 04:00:00,0
...,...,...
9884,2021-10-17 20:00:00,0
9885,2021-10-17 21:00:00,0
9886,2021-10-17 22:00:00,0
9887,2021-10-17 23:00:00,0


In [113]:
confinement.rename({'Confinement ce jour':'conf'}, axis=1, inplace = True)

# Merge ww and confinement

In [114]:
ext_full = pd.merge(confinement, weather_2, how='left', on='date')
ext_full = pd.merge(ext_full, weather_1, how='left', on='date')
ext_full = pd.merge(ext_full, weather_0, how='left', on='date')
ext_full.replace(np.nan, 0, inplace=True)
ext_full["ww"] = ext_full["w1"] + ext_full["w2"] + ext_full["ww"] 

In [115]:
ext_full

Unnamed: 0,date,conf,w2,w1,ww
0,2020-09-01 00:00:00,0,0.0,0.0,1.0
1,2020-09-01 01:00:00,0,0.0,0.0,0.0
2,2020-09-01 02:00:00,0,0.0,0.0,0.0
3,2020-09-01 03:00:00,0,0.0,0.0,2.0
4,2020-09-01 04:00:00,0,0.0,0.0,0.0
...,...,...,...,...,...
9884,2021-10-17 20:00:00,0,0.0,0.0,0.0
9885,2021-10-17 21:00:00,0,0.0,0.0,1.0
9886,2021-10-17 22:00:00,0,0.0,0.0,0.0
9887,2021-10-17 23:00:00,0,0.0,0.0,0.0


In [116]:
ext_full.drop(columns=['w2','w1'], inplace=True)

In [117]:
ext_full

Unnamed: 0,date,conf,ww
0,2020-09-01 00:00:00,0,1.0
1,2020-09-01 01:00:00,0,0.0
2,2020-09-01 02:00:00,0,0.0
3,2020-09-01 03:00:00,0,2.0
4,2020-09-01 04:00:00,0,0.0
...,...,...,...
9884,2021-10-17 20:00:00,0,0.0
9885,2021-10-17 21:00:00,0,1.0
9886,2021-10-17 22:00:00,0,0.0
9887,2021-10-17 23:00:00,0,0.0


In [118]:
ext_full.to_csv(r'C:\Users\CEDI\Dropbox\Education\X\Python for Data Science\bike_counters\ext_data\ext_2f.csv')

# Adding tempneb

In [119]:
weather = pd.read_csv('ext_data/weather.csv')
weather["date"] = pd.to_datetime(weather["date"], format = "%Y-%m-%d %H:%M:%S")
tempneb = weather.loc[:, weather.columns.intersection(['date', 't', 'nbas'])]
tempneb

Unnamed: 0,date,t,nbas
0,2021-01-01 00:00:00,272.75,1.0
1,2021-01-01 03:00:00,271.25,1.0
2,2021-01-01 06:00:00,271.95,5.0
3,2021-01-01 09:00:00,272.45,1.0
4,2021-01-01 12:00:00,276.95,7.0
...,...,...,...
3317,2020-09-30 09:00:00,289.95,7.0
3318,2020-09-30 12:00:00,292.05,7.0
3319,2020-09-30 15:00:00,291.55,7.0
3320,2020-09-30 18:00:00,290.15,8.0


In [120]:
ext_full.date.max()

Timestamp('2021-10-18 00:00:00')

In [121]:
ext_4f = pd.merge_asof(
        ext_full.sort_values("date"), tempneb.sort_values("date"), on="date"
)

In [122]:
ext_4f

Unnamed: 0,date,conf,ww,t,nbas
0,2020-09-01 00:00:00,0,1.0,285.75,0.0
1,2020-09-01 01:00:00,0,0.0,285.75,0.0
2,2020-09-01 02:00:00,0,0.0,285.75,0.0
3,2020-09-01 03:00:00,0,2.0,283.95,0.0
4,2020-09-01 04:00:00,0,0.0,283.95,0.0
...,...,...,...,...,...
9884,2021-10-17 20:00:00,0,0.0,285.65,0.0
9885,2021-10-17 21:00:00,0,1.0,282.75,0.0
9886,2021-10-17 22:00:00,0,0.0,282.75,0.0
9887,2021-10-17 23:00:00,0,0.0,282.75,0.0


In [123]:
ext_4f['t'] -= 273.15

In [124]:
ext_4f.isna().sum()

date     0
conf     0
ww       0
t        0
nbas    15
dtype: int64

In [125]:
ext_full.replace(np.nan, 0, inplace=True)

In [126]:
ext_4f

Unnamed: 0,date,conf,ww,t,nbas
0,2020-09-01 00:00:00,0,1.0,12.6,0.0
1,2020-09-01 01:00:00,0,0.0,12.6,0.0
2,2020-09-01 02:00:00,0,0.0,12.6,0.0
3,2020-09-01 03:00:00,0,2.0,10.8,0.0
4,2020-09-01 04:00:00,0,0.0,10.8,0.0
...,...,...,...,...,...
9884,2021-10-17 20:00:00,0,0.0,12.5,0.0
9885,2021-10-17 21:00:00,0,1.0,9.6,0.0
9886,2021-10-17 22:00:00,0,0.0,9.6,0.0
9887,2021-10-17 23:00:00,0,0.0,9.6,0.0


In [184]:
ext_4f.to_csv(r'C:\Users\CEDI\Dropbox\Education\X\Python for Data Science\bike_counters\ext_data\ext_4f.csv')

# Adding Holidays, Bank Holidays and Strike rate

In [130]:
bvs = pd.read_csv('ext_data/bank_vacs_strike.csv')
bvs["date"] = pd.to_datetime(bvs["date"], format = "%m/%d/%Y %H:%M")
bvs

Unnamed: 0,date,vacances_zone_c,bank_days,strike_rate
0,2020-09-01 00:00:00,0,0,0.0
1,2020-09-01 01:00:00,0,0,0.0
2,2020-09-01 02:00:00,0,0,0.0
3,2020-09-01 03:00:00,0,0,0.0
4,2020-09-01 04:00:00,0,0,0.0
...,...,...,...,...
9882,2021-10-17 20:00:00,0,0,
9883,2021-10-17 21:00:00,0,0,
9884,2021-10-17 22:00:00,0,0,
9885,2021-10-17 23:00:00,0,0,


In [131]:
bvs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9887 entries, 0 to 9886
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             9887 non-null   datetime64[ns]
 1   vacances_zone_c  9887 non-null   int64         
 2   bank_days        9887 non-null   int64         
 3   strike_rate      8974 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 309.1 KB


In [132]:
ext_7f = pd.merge(ext_4f, bvs, how='left', on='date')

In [133]:
ext_7f

Unnamed: 0,date,conf,ww,t,nbas,vacances_zone_c,bank_days,strike_rate
0,2020-09-01 00:00:00,0,1.0,12.6,0.0,0.0,0.0,0.0
1,2020-09-01 01:00:00,0,0.0,12.6,0.0,0.0,0.0,0.0
2,2020-09-01 02:00:00,0,0.0,12.6,0.0,0.0,0.0,0.0
3,2020-09-01 03:00:00,0,2.0,10.8,0.0,0.0,0.0,0.0
4,2020-09-01 04:00:00,0,0.0,10.8,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
9884,2021-10-17 20:00:00,0,0.0,12.5,0.0,0.0,0.0,
9885,2021-10-17 21:00:00,0,1.0,9.6,0.0,0.0,0.0,
9886,2021-10-17 22:00:00,0,0.0,9.6,0.0,0.0,0.0,
9887,2021-10-17 23:00:00,0,0.0,9.6,0.0,0.0,0.0,


In [134]:
ext_7f.isna().sum()

date                 0
conf                 0
ww                   0
t                    0
nbas                15
vacances_zone_c      2
bank_days            2
strike_rate        915
dtype: int64

In [135]:
ext_7f.vacances_zone_c.replace(np.nan, 0, inplace=True)

In [136]:
ext_7f.bank_days.replace(np.nan, 0, inplace=True)

In [137]:
ext_7f.isna().sum()

date                 0
conf                 0
ww                   0
t                    0
nbas                15
vacances_zone_c      0
bank_days            0
strike_rate        915
dtype: int64

In [138]:
ext_7f.to_csv(r'C:\Users\CEDI\Dropbox\Education\X\Python for Data Science\bike_counters\ext_data\ext_7f.csv')