In [1]:
import pandas as pd
import matplotlib as plt
import datetime
import typing
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots

%load_ext autoreload
%autoreload 2

In [2]:
data = pd.read_csv('../DATA/processed/dataset.csv', parse_dates=['date'])
data = data.set_index('date')

In [3]:
wind_dir = {'Ветер, дующий с востока': 0,
            'Ветер, дующий с востоко-северо-востока': 45/2,
            'Ветер, дующий с северо-востока': 45,
            'Ветер, дующий с северо-северо-востока': 45 + 45/2,
            'Ветер, дующий с севера': 90,
            'Ветер, дующий с северо-северо-запад': 90 + 45/2,
            'Ветер, дующий с северо-запада': 135,
            'Ветер, дующий с западо-северо-запада': 135 + 45/2,
            'Ветер, дующий с запада': 180,
            'Ветер, дующий с западо-юго-запада': 180+45/2,
            'Ветер, дующий с юго-запада': 225,
            'Ветер, дующий с юго-юго-запада': 225 + 45/2,
            'Ветер, дующий с юга': 270,
            'Ветер, дующий с юго-юго-востока': 270 + 45/2,
            'Ветер, дующий с юго-востока': 315,
            'Ветер, дующий с востоко-юго-востока': 315 + 45/2,
            'Штиль, безветрие': None,
            }

In [4]:
def prepare_features(data: pd.DataFrame) -> pd.DataFrame:
    """Features preparation for anomaly detection and clustering"""
    # textual values encoding
    data['hum_meteo'] = data.hum_meteo.fillna(method='bfill')
    data['pres_meteo'] = data.hum_meteo.fillna(method='bfill')
    data['wind_speed'] = data.hum_meteo.fillna(method='bfill')
    data['temp_meteo'] = data.temp_meteo.fillna(method='bfill')
    data['prec_amount'] = data.prec_amount.fillna(method='bfill')
    data.loc[data.prec_amount == 'Осадков нет', 'prec_amount'] = 0
    data.loc[data.prec_amount == 'Следы осадков', 'prec_amount'] = 0
    data['prec_amount'] = data.prec_amount.astype(float)

    
    # fill missing value before PCA
    data['P1'] = data.P1.interpolate()
    
    for c in data.columns:
        data[c].fillna((data[c].mean()), inplace=True)
    
    return data

In [5]:
sel_columns = ['P1','hum_meteo', 'temp_meteo', 'pres_meteo', 'wind_speed', 'prec_amount']

In [6]:
data = data[sel_columns]
data = prepare_features(data)

In [7]:
data['day_of_week'] = data.index.dayofweek
data['weekend'] = data.day_of_week.apply(lambda d: d == 5 or d == 6).astype(int)
data['hour'] = data.index.hour
data['night'] = data.hour.apply(lambda h: 0<= h < 7).astype(int)
data['morning'] = data.hour.apply(lambda h: 7<= h < 12).astype(int)
data['day'] = data.hour.apply(lambda h: 12<= h < 17).astype(int)
data['evening'] = data.hour.apply(lambda h: 17<= h < 24).astype(int)
data['sin_day'] = np.sin(2*np.pi*data.day_of_week/7)
data['cos_day'] = np.cos(2*np.pi*data.day_of_week/7)
data['sin_hour'] = np.sin(2*np.pi*data.hour/24)
data['cos_hour'] = np.cos(2*np.pi*data.hour/24)
data.head()

Unnamed: 0_level_0,P1,hum_meteo,temp_meteo,pres_meteo,wind_speed,prec_amount,day_of_week,weekend,hour,night,morning,day,evening,sin_day,cos_day,sin_hour,cos_hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2019-04-01 00:00:00,5.645,59.0,6.7,59.0,59.0,0.0,0,0,0,1,0,0,0,0.0,1.0,0.0,1.0
2019-04-01 00:05:00,5.513333,59.0,6.7,59.0,59.0,0.0,0,0,0,1,0,0,0,0.0,1.0,0.0,1.0
2019-04-01 00:10:00,5.504643,59.0,6.7,59.0,59.0,0.0,0,0,0,1,0,0,0,0.0,1.0,0.0,1.0
2019-04-01 00:15:00,6.790714,59.0,6.7,59.0,59.0,0.0,0,0,0,1,0,0,0,0.0,1.0,0.0,1.0
2019-04-01 00:20:00,7.344643,59.0,6.7,59.0,59.0,0.0,0,0,0,1,0,0,0,0.0,1.0,0.0,1.0


In [8]:
for c in data.columns:
    if data[c].isnull().values.any():
        print(c)

# Add lag variable

In [9]:
data.columns

Index(['P1', 'hum_meteo', 'temp_meteo', 'pres_meteo', 'wind_speed',
       'prec_amount', 'day_of_week', 'weekend', 'hour', 'night', 'morning',
       'day', 'evening', 'sin_day', 'cos_day', 'sin_hour', 'cos_hour'],
      dtype='object')

In [10]:
columns = ['P1',
          # 'temp_meteo', 'pres_meteo', 'hum_meteo', 'wind_direction',
          # 'wind_speed', 'prec_amount',
          ]

In [11]:
shift_values = [1, 12, 12*2, 12*4, 12*8, 12*24]

In [12]:
for c in columns:
    for s in shift_values:
        data[f'{c}_shift_{s}'] = data[c].shift(s)

In [13]:
data = data[str(data.index[0]+datetime.timedelta(minutes=(5*int(np.max(shift_values)+1)))):]

In [14]:
data.head()

Unnamed: 0_level_0,P1,hum_meteo,temp_meteo,pres_meteo,wind_speed,prec_amount,day_of_week,weekend,hour,night,...,sin_day,cos_day,sin_hour,cos_hour,P1_shift_1,P1_shift_12,P1_shift_24,P1_shift_48,P1_shift_96,P1_shift_288
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-04-02 00:05:00,6.453333,81.0,1.5,81.0,81.0,0.0,1,0,0,1,...,0.781831,0.62349,0.0,1.0,5.804,6.596429,7.202949,4.354,2.132857,5.513333
2019-04-02 00:10:00,6.015,81.0,1.5,81.0,81.0,0.0,1,0,0,1,...,0.781831,0.62349,0.0,1.0,6.453333,6.038929,6.808571,5.090667,1.885714,5.504643
2019-04-02 00:15:00,5.991071,81.0,1.5,81.0,81.0,0.0,1,0,0,1,...,0.781831,0.62349,0.0,1.0,6.015,6.181429,6.94,4.934286,9.555385,6.790714
2019-04-02 00:20:00,5.8675,81.0,1.5,81.0,81.0,0.0,1,0,0,1,...,0.781831,0.62349,0.0,1.0,5.991071,5.964667,6.672143,4.685769,1.819615,7.344643
2019-04-02 00:25:00,6.085595,81.0,1.5,81.0,81.0,0.0,1,0,0,1,...,0.781831,0.62349,0.0,1.0,5.8675,6.457889,6.88881,4.941154,1.824615,7.048214


# Train test split

In [15]:
sel_columns = ['hum_meteo', 'temp_meteo', 'pres_meteo', 'wind_speed', 'prec_amount']

In [16]:
y = data.P1
data = data.drop(columns='P1')

In [17]:
idx_split = str(data.index[-1]-datetime.timedelta(days=4))
idx_split


'2020-01-24 00:00:00'

In [18]:
X_test = data[idx_split:]
X_train = data[:idx_split]
y_test = y[idx_split:]
y_train = y[:idx_split]

In [20]:
scaler = StandardScaler()
col = X_train.columns
X_train[col] = scaler.fit_transform(X_train[col])
X_test[col] = scaler.transform(X_test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

In [None]:
#poly = PolynomialFeatures(2)
#X_train = poly.fit_transform(X_train)
#X_test = poly.transform(X_test)
#names = poly.get_feature_names(columns)

In [21]:
def predict(X_test, model, y: typing.List):
    start_idx = X_test.index[0] + datetime.timedelta(minutes=5*int(np.max(shift_values)))
    X = X_test[str(start_idx):]
    i = len(y)
    split = i
    for index, row in X.iterrows():
        columns = ['P1']
        for c in columns:
            for s in shift_values:
                row.loc[f'{c}_shift_{s}'] = y[i-s]
        #print(row)
        p = model.predict(row.values.reshape(1, -1))
        #print(p)
        y = np.append(y, p)
        i += 1
    prediction =y[split:]
    return prediction

In [22]:
[f'{c}_shift_{s}' for c in columns for s in shift_values]

['P1_shift_1',
 'P1_shift_12',
 'P1_shift_24',
 'P1_shift_48',
 'P1_shift_96',
 'P1_shift_288']

In [23]:
X_test = X_test.drop(columns=[f'{c}_shift_{s}' for c in columns for s in shift_values])
X_test.head()

Unnamed: 0_level_0,hum_meteo,temp_meteo,pres_meteo,wind_speed,prec_amount,day_of_week,weekend,hour,night,morning,day,evening,sin_day,cos_day,sin_hour,cos_hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-01-24 00:00:00,-3.345794,-1.57786,-3.345794,-3.345794,-0.493936,0.506536,-0.627986,-1.661325,1.558387,-0.512989,-0.512989,-0.641689,-0.891515,-1.799153,3.970635e-18,2.0
2020-01-24 00:05:00,-0.042431,-1.833724,-0.042431,-0.042431,-0.329753,0.506536,-0.627986,-1.661325,1.558387,-0.512989,-0.512989,-0.641689,-0.623527,-1.272826,1.644691e-18,1.414214
2020-01-24 00:10:00,-0.042431,-1.833724,-0.042431,-0.042431,-0.329753,0.506536,-0.627986,-1.661325,1.558387,-0.512989,-0.512989,-0.641689,-0.623527,-1.272826,1.644691e-18,1.414214
2020-01-24 00:15:00,-0.042431,-1.833724,-0.042431,-0.042431,-0.329753,0.506536,-0.627986,-1.661325,1.558387,-0.512989,-0.512989,-0.641689,-0.623527,-1.272826,1.644691e-18,1.414214
2020-01-24 00:20:00,-0.042431,-1.833724,-0.042431,-0.042431,-0.329753,0.506536,-0.627986,-1.661325,1.558387,-0.512989,-0.512989,-0.641689,-0.623527,-1.272826,1.644691e-18,1.414214


In [24]:
model = Ridge(alpha=3)
model.fit(X_train, y_train)
start_idx = X_test.index[0] + datetime.timedelta(minutes=5*int(np.max(shift_values)))
prediction = predict(X_test, model, y_test[:str(start_idx)].values)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
mean_absolute_error(y_test[str(y_test.index[0] + datetime.timedelta(minutes=5*int(np.max(shift_values)))):], prediction)

In [None]:
result = pd.DataFrame(index=y_test[str(start_idx):].index)
result['y_test'] = y_test[str(start_idx):].values
result['pred'] = prediction

In [None]:
result.plot()

In [None]:
for i, r in X_test.iterrows():
    r
type(r)