## PACKAGES

In [1]:
import numpy as np
import pandas as pd
import math
import statistics
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

## 1. Loading, converting and cleaning the data (note: the data are aggregated to a weekly frequency)

In [2]:
# Load the raw weather readings and use the 'Date Time' column as the index.
# The timestamps are day-first (e.g. '01.01.2009 00:10:00'), so the index is
# deliberately kept as plain strings here rather than passing `parse_dates`:
# without an explicit format pandas has to guess between day-first and
# month-first, which is ambiguous for values like '01.02.2009'. The index is
# converted with an explicit format in the next processing step.
w_df = pd.read_csv('Weather_ts.csv', sep=',', index_col='Date Time')

# Quick look at the first two raw rows.
w_df.head(2)

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
01.01.2009 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
01.01.2009 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1


In [3]:
# List the column labels of the loaded frame.
w_df.columns

Index(['p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)', 'rh (%)',
       'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)', 'sh (g/kg)',
       'H2OC (mmol/mol)', 'rho (g/m**3)', 'wv (m/s)', 'max. wv (m/s)',
       'wd (deg)'],
      dtype='object')

In [4]:
# Step 1 - turn the textual index labels into real timestamps.
# The explicit format (day.month.year hour:minute:second) avoids any
# day/month guessing.
parsed_index = pd.to_datetime(w_df.index, format='%d.%m.%Y %H:%M:%S')
w_df.index = parsed_index

# Step 2 - aggregate to weekly frequency, taking the mean of every column
# within each weekly bin.
# NOTE(review): 'wd (deg)' looks like a direction in degrees; an arithmetic
# mean of angles may not be meaningful - confirm this is acceptable.
weekly_bins = w_df.resample('W')
w_df = weekly_bins.mean()

# Preview the aggregated frame.
w_df.head()


Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009-01-04,996.446696,-4.29207,269.146087,-6.185148,87.255409,4.511496,3.918348,0.593009,2.451287,3.934887,1289.293165,1.293252,2.310504,177.589652
2009-01-11,999.146161,-11.057847,262.168393,-13.171438,84.714841,2.831081,2.369058,0.462044,1.477421,2.372718,1327.335734,1.312153,2.256984,158.967391
2009-01-18,991.30876,-1.717867,272.126657,-4.664692,81.310595,5.561796,4.483313,1.07872,2.820536,4.526012,1270.352133,1.879524,2.970417,178.882599
2009-01-25,970.486319,1.151438,276.68122,-1.678264,82.035218,6.752004,5.444335,1.307619,3.499306,5.61372,1230.040417,2.553998,3.973105,188.601022
2009-02-01,990.295337,-2.464573,271.454127,-3.848323,90.406944,5.104276,4.617619,0.486429,2.905952,4.66375,1272.179296,1.981885,3.170645,46.615129


In [5]:
# Inspect the aggregated frame (index range, dtypes, non-null counts) before any cleaning.
w_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 418 entries, 2009-01-04 to 2017-01-01
Freq: W-SUN
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   p (mbar)         418 non-null    float64
 1   T (degC)         418 non-null    float64
 2   Tpot (K)         418 non-null    float64
 3   Tdew (degC)      418 non-null    float64
 4   rh (%)           418 non-null    float64
 5   VPmax (mbar)     418 non-null    float64
 6   VPact (mbar)     418 non-null    float64
 7   VPdef (mbar)     418 non-null    float64
 8   sh (g/kg)        418 non-null    float64
 9   H2OC (mmol/mol)  418 non-null    float64
 10  rho (g/m**3)     418 non-null    float64
 11  wv (m/s)         418 non-null    float64
 12  max. wv (m/s)    418 non-null    float64
 13  wd (deg)         418 non-null    float64
dtypes: float64(14)
memory usage: 49.0 KB


In [6]:
# Count the missing values in each column of the weekly frame.
w_df.isna().sum()

p (mbar)           0
T (degC)           0
Tpot (K)           0
Tdew (degC)        0
rh (%)             0
VPmax (mbar)       0
VPact (mbar)       0
VPdef (mbar)       0
sh (g/kg)          0
H2OC (mmol/mol)    0
rho (g/m**3)       0
wv (m/s)           0
max. wv (m/s)      0
wd (deg)           0
dtype: int64

As we can see, none of the columns contains missing values after the weekly aggregation.

## 2. Divide the dataset into training and test sets

In [7]:
# Chronological hold-out split around a single cutoff date.
cutoff = '2016-01-01'

# Boolean masks over the weekly index: strictly before the cutoff vs. from the cutoff onwards.
before_cutoff = w_df.index < cutoff
from_cutoff = w_df.index >= cutoff

# TRAIN: every week dated before the cutoff.
w_train = w_df.loc[before_cutoff]
# TEST: every week dated on or after the cutoff.
w_test = w_df.loc[from_cutoff]

______

In [8]:
# Persist the full, test and train frames with IPython's %store magic so that
# another notebook/kernel can restore them with `%store -r`.
%store w_df
%store w_test
%store w_train

Stored 'w_df' (DataFrame)
Stored 'w_test' (DataFrame)
Stored 'w_train' (DataFrame)
