In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import typing
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.base import clone

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots

%load_ext autoreload
%autoreload 2

np.random.seed(42)

In [2]:
wind_dir = {'Ветер, дующий с востока': 0,
            'Ветер, дующий с востоко-северо-востока': 45/2,
            'Ветер, дующий с северо-востока': 45,
            'Ветер, дующий с северо-северо-востока': 45 + 45/2,
            'Ветер, дующий с севера': 90,
            'Ветер, дующий с северо-северо-запад': 90 + 45/2,
            'Ветер, дующий с северо-запада': 135,
            'Ветер, дующий с западо-северо-запада': 135 + 45/2,
            'Ветер, дующий с запада': 180,
            'Ветер, дующий с западо-юго-запада': 180+45/2,
            'Ветер, дующий с юго-запада': 225,
            'Ветер, дующий с юго-юго-запада': 225 + 45/2,
            'Ветер, дующий с юга': 270,
            'Ветер, дующий с юго-юго-востока': 270 + 45/2,
            'Ветер, дующий с юго-востока': 315,
            'Ветер, дующий с востоко-юго-востока': 315 + 45/2,
            'Штиль, безветрие': None,
            }

In [3]:
def prepare_features(data: pd.DataFrame) -> pd.DataFrame:
    """Features preparation for anomaly detection and clustering"""
    # textual values encoding
    data['hum_meteo'] = data.hum_meteo.fillna(method='bfill')
    data['pres_meteo'] = data.hum_meteo.fillna(method='bfill')
    data['wind_speed'] = data.hum_meteo.fillna(method='bfill')
    data['temp_meteo'] = data.temp_meteo.fillna(method='bfill')
    data['prec_amount'] = data.prec_amount.fillna(method='bfill')
    data.loc[data.prec_amount == 'Осадков нет', 'prec_amount'] = 0
    data.loc[data.prec_amount == 'Следы осадков', 'prec_amount'] = 0
    data['prec_amount'] = data.prec_amount.astype(float)

    
    # fill missing value before PCA
    data['P1'] = data.P1.interpolate()
    
    for c in data.columns:
        data[c].fillna((data[c].mean()), inplace=True)
    
    return data

In [4]:
data = pd.read_csv('../DATA/processed/dataset.csv', parse_dates=['date'])
data = data.set_index('date')

In [5]:
sel_columns = ['P1','hum_meteo', 'temp_meteo', 'pres_meteo', 'wind_speed', 'prec_amount']

In [6]:
data = data[sel_columns]
data = prepare_features(data)

In [7]:
data['day_of_week'] = data.index.dayofweek
data['weekend'] = data.day_of_week.apply(lambda d: d == 5 or d == 6).astype(int)
data['hour'] = data.index.hour
data['night'] = data.hour.apply(lambda h: 0<= h < 7).astype(int)
data['morning'] = data.hour.apply(lambda h: 7<= h < 12).astype(int)
data['day'] = data.hour.apply(lambda h: 12<= h < 17).astype(int)
data['evening'] = data.hour.apply(lambda h: 17<= h < 24).astype(int)
data['sin_day'] = np.sin(2*np.pi*data.day_of_week/7)
data['cos_day'] = np.cos(2*np.pi*data.day_of_week/7)
data['sin_hour'] = np.sin(2*np.pi*data.hour/24)
data['cos_hour'] = np.cos(2*np.pi*data.hour/24)

data.drop(columns=['day_of_week', 'hour', 'night', 'day', 'morning', 'evening'], inplace=True)

In [8]:
data = data.resample('1H').mean()

In [9]:
for c in data.columns:
    if data[c].isnull().values.any():
        print(c)

In [10]:
data.head()

Unnamed: 0_level_0,P1,hum_meteo,temp_meteo,pres_meteo,wind_speed,prec_amount,weekend,sin_day,cos_day,sin_hour,cos_hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-04-01 00:00:00,5.897103,59.0,6.7,59.0,59.0,0.0,0,0.0,1.0,0.0,1.0
2019-04-01 01:00:00,4.495774,59.0,6.7,59.0,59.0,0.0,0,0.0,1.0,0.258819,0.965926
2019-04-01 02:00:00,4.090302,59.0,6.7,59.0,59.0,0.0,0,0.0,1.0,0.5,0.866025
2019-04-01 03:00:00,6.187858,60.833333,5.05,60.833333,60.833333,0.0,0,0.0,1.0,0.707107,0.707107
2019-04-01 04:00:00,3.100704,61.0,4.9,61.0,61.0,0.0,0,0.0,1.0,0.866025,0.5


In [11]:
def pp(start, end, n):
    start_u = start.value//10**9
    end_u = end.value//10**9

    return pd.DatetimeIndex((10**9*np.random.randint(start_u, end_u, n)).view('M8[ns]'))

In [12]:
start_idx = data.index[0]
end_idx = data.index[-1] - datetime.timedelta(days=2)
num_samples = 200

In [13]:
def generate_chanks(series, n, start, end):
    chanks = []
    for idx in pp(start, end, n):
        c = series[str(idx):str(idx+datetime.timedelta(days=2))]
        chanks.append(c)
    return chanks

In [14]:
chanks = generate_chanks(data, num_samples, start_idx, end_idx)

In [15]:
data.columns

Index(['P1', 'hum_meteo', 'temp_meteo', 'pres_meteo', 'wind_speed',
       'prec_amount', 'weekend', 'sin_day', 'cos_day', 'sin_hour', 'cos_hour'],
      dtype='object')

In [16]:
target_column = 'P1'
columns = ['hum_meteo', 'temp_meteo', 'pres_meteo', 'wind_speed',
           'prec_amount', 'weekend', 'sin_day', 'cos_day', 'sin_hour', 'cos_hour']

In [17]:
def create_sample(chank, target_col, columns):
    X = dict()
    y = dict()
    d1 = chank.iloc[:24]
    d2 = chank.iloc[24:]
    for i in range(24):
        X[f'lag_{i}'] = d1[target_col][-(i+1)]
        for c in columns:
            X[f'{c}_forec_{i}'] = d2[c][i]
        y[f'{target_col}_forec_{i}'] = d2[target_col][i]
    return X, y
    

In [18]:
def prepare_data_from_chanks(chanks, target, col):
    df = pd.DataFrame(index=range(len(chanks)))
    for i in range(len(chanks)):
        x, y = create_sample(chanks[i], target, col)
        for key, value in x.items():
            df.loc[i, key] = value
        for key, value in y.items():
            df.loc[i, key] = value
    return df

In [19]:
df = prepare_data_from_chanks(chanks, target_column, columns)

In [20]:
df.head()

Unnamed: 0,lag_0,hum_meteo_forec_0,temp_meteo_forec_0,pres_meteo_forec_0,wind_speed_forec_0,prec_amount_forec_0,weekend_forec_0,sin_day_forec_0,cos_day_forec_0,sin_hour_forec_0,...,P1_forec_14,P1_forec_15,P1_forec_16,P1_forec_17,P1_forec_18,P1_forec_19,P1_forec_20,P1_forec_21,P1_forec_22,P1_forec_23
0,72.789728,92.0,-0.1,92.0,92.0,0.0,0.0,0.433884,-0.900969,-0.258819,...,24.093406,20.050236,22.225619,22.905058,23.455595,22.686901,22.783294,24.10639,27.334777,35.165661
1,8.003139,69.0,8.2,69.0,69.0,0.0,1.0,-0.974928,-0.222521,0.965926,...,7.686551,6.431607,5.493934,4.93464,6.008252,6.717623,5.636171,4.370779,4.83794,5.305594
2,39.440934,85.0,-2.425,85.0,85.0,0.0,1.0,-0.974928,-0.222521,0.0,...,11.320501,11.337687,10.38919,10.099167,9.516086,8.986262,8.601895,8.115241,8.444033,7.462515
3,5.420429,82.0,-3.5,82.0,82.0,0.0,0.0,0.0,1.0,0.258819,...,15.673033,17.41385,20.235811,22.269894,22.489576,23.540574,23.897363,24.372181,23.597249,24.192394
4,41.618765,84.833333,15.783333,84.833333,84.833333,0.0,0.0,0.781831,0.62349,0.707107,...,18.872964,19.363339,18.465655,16.063664,13.261524,11.903197,12.740069,13.159749,12.690423,14.163106


In [21]:
x_columns = [i for i in df.columns if 'P1_forec_' not in i]
y_columns = [i for i in df.columns if 'P1_forec_' in i]

In [22]:
X, y = df[x_columns], df[y_columns]

In [23]:
for c in X.columns:
    if X[c].isnull().values.any():
        print(c)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
X_train.head()

Unnamed: 0,lag_0,hum_meteo_forec_0,temp_meteo_forec_0,pres_meteo_forec_0,wind_speed_forec_0,prec_amount_forec_0,weekend_forec_0,sin_day_forec_0,cos_day_forec_0,sin_hour_forec_0,...,hum_meteo_forec_23,temp_meteo_forec_23,pres_meteo_forec_23,wind_speed_forec_23,prec_amount_forec_23,weekend_forec_23,sin_day_forec_23,cos_day_forec_23,sin_hour_forec_23,cos_hour_forec_23
169,19.227366,53.916667,18.933333,53.916667,53.916667,0.083333,0.0,0.0,1.0,1.0,...,62.0,15.1,62.0,62.0,0.0,0.0,0.781831,0.62349,0.965926,0.258819
97,16.770247,84.0,-2.9,84.0,84.0,0.0,0.0,0.0,1.0,0.258819,...,89.5,0.2,89.5,89.5,0.8,0.0,0.781831,0.62349,0.0,1.0
31,22.883739,62.0,2.3,62.0,62.0,0.0,0.0,-0.433884,-0.900969,-0.866025,...,66.0,0.9,66.0,66.0,0.0,1.0,-0.974928,-0.222521,-0.965926,0.258819
12,8.065404,81.0,15.9,81.0,81.0,2.0,0.0,0.974928,-0.222521,-0.965926,...,91.0,14.7,91.0,91.0,13.0,0.0,0.433884,-0.900969,-0.866025,-0.5
35,32.474349,59.0,23.8,59.0,59.0,0.5,0.0,-0.433884,-0.900969,-0.5,...,44.0,21.6,44.0,44.0,0.0,1.0,-0.974928,-0.222521,-0.258819,-0.965926


In [26]:
model = Lasso(alpha=0.5)
model.fit(X_train, y_train[y_columns[2]])
pred = model.predict(X_test)
mean_absolute_error(y_test[y_columns[2]], pred)

4.706737279194167

In [27]:
def train_models(model, x_train, y_train, y_columns):
    models = []
    for i in y_columns:
        local_model = clone(model)
        local_model.fit(x_train, y_train[i])
        models.append(local_model)
    return models

In [28]:
def get_mae(models, X_test, y_test, y_columns):
    scores = []
    for i in range(len(y_columns)):
        local_model = models[i]
        prediction = local_model.predict(X_test)
        mae = mean_absolute_error(y_test[y_columns[i]], prediction)
        scores.append(mae)
    return scores

In [29]:
mod = Lasso(alpha=0.5)
models = train_models(mod, X_train, y_train, y_columns)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


In [30]:
get_mae(models, X_test, y_test, y_columns)

[1.6855277180131614,
 3.627030197253213,
 4.706737279194167,
 5.846983108690145,
 6.717582016609823,
 7.531584435859543,
 8.083377956106466,
 9.483691467605768,
 9.756815465930778,
 9.456937737476649,
 9.848005691060397,
 10.104360556364533,
 10.523370186502266,
 10.676059708374936,
 10.962171686728919,
 11.58697745802654,
 12.486832199590129,
 12.552495477735253,
 12.646728461704914,
 12.236098546083031,
 9.68981129209236,
 9.017342330647669,
 9.138275304749204,
 8.836077566437316]

In [31]:
get_mae(models, X_test, y_test, y_columns)

[1.6855277180131614,
 3.627030197253213,
 4.706737279194167,
 5.846983108690145,
 6.717582016609823,
 7.531584435859543,
 8.083377956106466,
 9.483691467605768,
 9.756815465930778,
 9.456937737476649,
 9.848005691060397,
 10.104360556364533,
 10.523370186502266,
 10.676059708374936,
 10.962171686728919,
 11.58697745802654,
 12.486832199590129,
 12.552495477735253,
 12.646728461704914,
 12.236098546083031,
 9.68981129209236,
 9.017342330647669,
 9.138275304749204,
 8.836077566437316]

In [32]:
def predict_on_chank(chank, models):
    d1 = chank.iloc[:24]
    d2 = chank.iloc[24:]
    X = pd.DataFrame(index=range(1))
    for i in range(24):
        X[f'lag_{i}'] = d1.iloc[-(i+1)]
    y = d2
    pred = []
    for mod in models:
        pred.append(mod.predict(X)[0])
    return y,pred    

In [33]:
y_chank, pred = predict_on_chank(chanks[56], models)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
result = pd.DataFrame({'actul': y_chank})
result['prediction'] = pred
result.plot()

In [None]:
def print_coef(model):
    coef = pd.DataFrame(index=range(len(x_columns)))
    coef['featurs'] = x_columns
    coef['importance'] = model.coef_
    print(coef)