In [13]:
import pandas as pd 
import numpy as np 
import os
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

# Read the two raw weather extracts from the shared Datasets folder.
# Both files are loaded with pandas' default CSV settings, Orly first.
meteo_orly, meteo_paris = (
    pd.read_csv(os.path.join("..", "Datasets", filename))
    for filename in ("meteo_orly.csv", "meteo_paris.csv")
)

In [14]:
# Parse the timestamp column of each dataset and print the period it covers.
# Each entry: (frame, timestamp column, label for the minimum, label for the maximum).
date_columns = (
    (meteo_orly, 'date', "Minimum Date Orly:", "Maximum Date Orly:"),
    (meteo_paris, 'datetime', "Minimum Date Paris:", "Maximum Date Paris:"),
)

for frame, column, min_label, max_label in date_columns:
    # The conversion is written back into the original frame so later cells
    # work with real datetimes instead of strings.
    frame[column] = pd.to_datetime(frame[column])
    timestamps = frame[column]
    print(min_label, timestamps.min())
    print(max_label, timestamps.max())



Minimum Date Orly: 2020-09-01 00:00:00
Maximum Date Orly: 2021-10-21 12:00:00
Minimum Date Paris: 2020-09-01 00:00:00
Maximum Date Paris: 2021-11-30 00:00:00


We will select the following features from meteo_paris:
- datetime
- precip, scaled
- visibility, scaled
- icon, which is a description of the general weather conditions of the day. (muted for now)

<strong>NOTE: AT THIS STAGE WE ARE TAKING DAILY DATA WITHOUT CONSIDERING THE STATION IT COMES FROM.</strong>

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin  
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

class ColumnSelectorParis(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps and renames the Paris weather columns."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the ``datetime``, ``precip`` and ``visibility`` columns of ``X``.

        ``datetime`` is renamed to ``date`` and ``precip`` to ``precipitation``;
        ``visibility`` keeps its name. ``X`` itself is not modified.
        """
        kept_columns = ['datetime', 'precip', 'visibility']  # 'icon' is left out for now
        new_names = {'precip': 'precipitation', 'datetime': 'date'}
        return X[kept_columns].rename(columns=new_names)

class ScaleNumericalParis(BaseEstimator, TransformerMixin):
    """Standardize every numeric column of a DataFrame with ``StandardScaler``.

    The scaling statistics are learned in ``fit`` and re-used by ``transform``,
    so data transformed after fitting is scaled with the statistics of the
    fitting data rather than with its own. Non-numeric columns are returned
    unchanged.
    """

    @staticmethod
    def _fit_scaler(X):
        """Return ``(numeric column names, fitted scaler)`` for ``X``.

        The scaler is ``None`` when ``X`` has no numeric column, because
        ``StandardScaler`` cannot be fitted on a frame without features.
        """
        columns = list(X.select_dtypes(include='number').columns)
        scaler = StandardScaler().fit(X[columns]) if columns else None
        return columns, scaler

    def fit(self, X, y=None):
        """Learn the scaling statistics of the numeric columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose numeric columns define the scaling statistics.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.numerical_columns_, self.scaler_ = self._fit_scaler(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the numeric columns standardized.

        When ``fit`` has been called, the columns and statistics learned there
        are applied. When it has not, the statistics are computed from ``X``
        itself, which is what this transformer always did before it kept state.
        """
        if hasattr(self, "scaler_"):
            columns, scaler = self.numerical_columns_, self.scaler_
        else:
            # Unfitted use: keep the historical stateless behaviour instead of raising.
            columns, scaler = self._fit_scaler(X)

        X_scaled = X.copy()
        if scaler is not None:
            X_scaled[columns] = scaler.transform(X[columns])
        return X_scaled

# Paris preprocessing: select/rename the columns first, then scale the numeric ones.
paris_preprocess = Pipeline(
    steps=[
        ("ColumnSelectorParis", ColumnSelectorParis()),
        ("ScaleNumericalParis", ScaleNumericalParis()),
    ]
)
        

In [16]:
# Run the Paris pipeline (column selection, then scaling) on the raw Paris frame.
paris_clean = paris_preprocess.fit_transform(meteo_paris)

We now tackle the Orly dataset, from which we select the following features:
- Date
- Temperature, scaled
- Precipitation (`rr1`), scaled

To merge the data, we will need this dataset to be sorted by increasing date

In [18]:
class ColumnSelectorOrly(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps and renames the Orly weather columns."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the ``date``, ``t`` and ``rr1`` columns of ``X``.

        ``t`` is renamed to ``temperature`` and ``rr1`` to ``precip``;
        ``date`` keeps its name. ``X`` itself is not modified.
        """
        kept_columns = ['date', 't', 'rr1']
        new_names = {'t': 'temperature', 'rr1': 'precip'}
        return X[kept_columns].rename(columns=new_names)

class ScaleNumericalOrly(BaseEstimator, TransformerMixin):
    """Standardize every numeric column of a DataFrame with ``StandardScaler``.

    ``fit`` learns the scaling statistics and ``transform`` applies them, so
    frames transformed after fitting are scaled consistently with the fitting
    data. Columns that are not numeric pass through unchanged.
    """

    @staticmethod
    def _fit_scaler(X):
        """Return ``(numeric column names, fitted scaler)`` for ``X``.

        The scaler is ``None`` when ``X`` has no numeric column, because
        ``StandardScaler`` cannot be fitted on a frame without features.
        """
        columns = list(X.select_dtypes(include='number').columns)
        scaler = StandardScaler().fit(X[columns]) if columns else None
        return columns, scaler

    def fit(self, X, y=None):
        """Learn the scaling statistics of the numeric columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose numeric columns define the scaling statistics.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.numerical_columns_, self.scaler_ = self._fit_scaler(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the numeric columns standardized.

        After ``fit``, the learned columns and statistics are applied. Without
        a prior ``fit``, the statistics come from ``X`` itself, matching the
        earlier stateless behaviour of this transformer.
        """
        if hasattr(self, "scaler_"):
            columns, scaler = self.numerical_columns_, self.scaler_
        else:
            # Unfitted use: keep the historical stateless behaviour instead of raising.
            columns, scaler = self._fit_scaler(X)

        X_scaled = X.copy()
        if scaler is not None:
            X_scaled[columns] = scaler.transform(X[columns])
        return X_scaled

class SortDateOrly(BaseEstimator, TransformerMixin):
    """Stateless transformer that orders the rows by their ``date`` column."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame with the rows of ``X`` sorted by ascending ``date``."""
        ordered = X.sort_values(by='date')
        return ordered

# Orly preprocessing: select/rename the columns, scale the numeric ones,
# then sort the rows by date.
orly_preprocess = Pipeline(
    steps=[
        ("ColumnSelectorOrly", ColumnSelectorOrly()),
        ("ScaleNumericalOrly", ScaleNumericalOrly()),
        ("SortDateOrly", SortDateOrly()),
    ]
)
        

In [19]:
# Run the Orly pipeline (column selection, scaling, then sort by date) on the raw Orly frame.
orly_clean = orly_preprocess.fit_transform(meteo_orly)

In [20]:
# Summarise the cleaned Orly frame: column dtypes and non-null counts.
orly_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3322 entries, 3082 to 411
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         3322 non-null   datetime64[ns]
 1   temperature  3322 non-null   float64       
 2   precip       3313 non-null   float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 103.8 KB


In [22]:
# Remove the rows that contain a missing value. The result is rebound to the
# name instead of using inplace=True, so the cell produces its output
# explicitly and can be re-run safely.
orly_clean = orly_clean.dropna()

In [21]:
# Summarise the cleaned Paris frame: column dtypes and non-null counts.
paris_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456 entries, 0 to 455
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           456 non-null    datetime64[ns]
 1   precipitation  456 non-null    float64       
 2   visibility     456 non-null    float64       
dtypes: datetime64[ns](1), float64(2)
memory usage: 10.8 KB


We can now merge the datasets, and export a csv file that we will use to augment our existing design matrix.

In [28]:
# As-of join of the Orly rows with the Paris rows on 'date', followed by the
# removal of the 'precipitation' and 'visibility' columns.
# NOTE(review): those two columns are the only ones contributed by paris_clean,
# so the exported frame holds the Orly columns only - confirm this is intended.
merged_data = (
    pd.merge_asof(orly_clean, paris_clean, on='date')
    .drop(columns=['precipitation', 'visibility'])
)

# Write the result next to the other datasets, without the index column.
merged_data.to_csv(os.path.join("..", "Datasets", "weather_data_cleaned.csv"), index=False)


## Now we look at Covid-19 cases

In [29]:
# Preview the first rows of the frame that was written to weather_data_cleaned.csv.
merged_data.head()

Unnamed: 0,date,temperature,precip
0,2020-09-01 00:00:00,-0.01867,-0.155221
1,2020-09-01 03:00:00,-0.281418,-0.155221
2,2020-09-01 06:00:00,-0.237627,-0.155221
3,2020-09-01 09:00:00,0.784171,-0.155221
4,2020-09-01 12:00:00,1.178293,-0.155221
