In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error

# Data preprocessing

In [2]:
# Load the preprocessed dataset. The 'date' column is parsed as datetimes and
# then promoted to the index, so later cells can use data.index.hour etc.
data = pd.read_csv('../DATA/processed/dataset.csv', parse_dates=['date'])
data = data.set_index('date')

In [3]:
# prec_amount mixes numbers with two textual markers:
# 'Осадков нет' ("no precipitation") and 'Следы осадков' ("traces of
# precipitation"). Both are mapped to 0.
data.loc[data['prec_amount'].isin(['Осадков нет', 'Следы осадков']), 'prec_amount'] = 0
# Missing values are also treated as 0. The result is assigned back to the
# frame rather than using fillna(inplace=True) on data.prec_amount: the
# in-place call on an attribute-accessed column is chained assignment, which
# warns on recent pandas and leaves `data` unchanged under Copy-on-Write.
data['prec_amount'] = data['prec_amount'].fillna(0).astype(float)

In [4]:
# Fill gaps in the sensor readings with Series.interpolate() (default
# settings), one column at a time.
for sensor_column in ('P1', 'P2', 'pressure', 'temperature', 'humidity'):
    data[sensor_column] = data[sensor_column].interpolate()

In [5]:
# Replace missing meteo values with the mean of their own column.
# Each filled column is assigned back explicitly: calling
# fillna(inplace=True) on data.<column> is chained assignment, which warns on
# recent pandas and does not modify `data` under Copy-on-Write.
for meteo_column in ('temp_meteo', 'pres_meteo', 'hum_meteo', 'wind_speed'):
    data[meteo_column] = data[meteo_column].fillna(data[meteo_column].mean())

Проверяем, в каких столбцах остались пропуски (NaN)

In [6]:
# Print the name of every column that still holds at least one null value.
has_missing = data.isnull().any()
for column_name in has_missing[has_missing].index:
    print(column_name)

wind_direction
precipitation
visibility
dew_point_temp


In [7]:
# Show the first rows of `data` as a quick visual check of the frame.
data.head()

Unnamed: 0_level_0,P1,P2,pressure,temperature,humidity,temp_meteo,pres_meteo,hum_meteo,wind_direction,wind_speed,precipitation,prec_amount,visibility,dew_point_temp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-04-01 00:00:00,5.645,3.137143,98513.24,7.0925,55.9125,11.689907,100032.739742,64.597719,,1.105419,,0.0,,
2019-04-01 00:05:00,5.513333,3.040595,98521.94,7.02,56.035,11.689907,100032.739742,64.597719,,1.105419,,0.0,,
2019-04-01 00:10:00,5.504643,3.011786,98522.76,7.04125,55.885,11.689907,100032.739742,64.597719,,1.105419,,0.0,,
2019-04-01 00:15:00,6.790714,3.461071,98529.485,6.98125,55.9075,11.689907,100032.739742,64.597719,,1.105419,,0.0,,
2019-04-01 00:20:00,7.344643,4.065357,98529.3075,6.97875,55.855,11.689907,100032.739742,64.597719,,1.105419,,0.0,,


# Train test split

In [8]:
# Baseline feature set: meteo-station readings only. The target is the P1
# column of the same frame.
columns = [
    'hum_meteo',
    'temp_meteo',
    'pres_meteo',
    'wind_speed',
    'prec_amount',
]
X = data.loc[:, columns]
y = data['P1']

In [9]:
# 70/30 split with a fixed seed so the partition is reproducible.
# NOTE(review): rows are shuffled although `data` is indexed by timestamp, so
# neighbouring timestamps can end up on both sides of the split. Confirm that
# a random split (rather than a chronological one) is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [10]:
# Show the first rows of the training features produced by the split.
X_train.head()

Unnamed: 0_level_0,hum_meteo,temp_meteo,pres_meteo,wind_speed,prec_amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-11-07 01:40:00,91.0,11.2,98018.3344,1.0,0.0
2019-12-17 06:45:00,85.0,2.8,99818.1814,2.0,0.5
2019-06-04 09:50:00,31.0,19.8,101098.0726,1.0,0.0
2020-01-12 00:05:00,86.0,-2.3,100364.8016,2.0,0.0
2019-05-29 07:55:00,56.0,22.2,100204.8152,0.0,0.0


# Simple model

In [11]:
# Standardise the features. The scaler is fitted on the training part only
# and then applied to both parts, so the test rows do not influence the
# fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Ordinary least squares on the scaled features, then predictions for the
# held-out rows.
model = LinearRegression().fit(X_train, y_train)
prediction = model.predict(X_test)

In [13]:
# Mean absolute error of the predictions on the held-out rows.
mean_absolute_error(y_true=y_test, y_pred=prediction)

9.426661833867616

# Feature importance

In [14]:
# Pair each feature name with its fitted linear coefficient. The features
# were standardised before fitting, so the coefficients share one scale.
# The column label is spelled 'features' (was 'featurs') to match the
# coefficient table built for the polynomial model later in the notebook.
coef = pd.DataFrame({
    'features': columns,
    'importance': model.coef_,
})
coef

Unnamed: 0,featurs,importance
0,hum_meteo,3.782624
1,temp_meteo,-0.904981
2,pres_meteo,2.802272
3,wind_speed,-3.83232
4,prec_amount,-0.340636


# Add time features

In [30]:
# Calendar features derived from the DatetimeIndex. All flags are computed
# with vectorised comparisons instead of a per-row .apply(lambda ...); the
# resulting values are the same.
data['day_of_week'] = data.index.dayofweek
# 5 and 6 are Saturday and Sunday in pandas' dayofweek numbering (Monday=0).
data['weekend'] = data['day_of_week'].isin([5, 6]).astype(int)
data['hour'] = data.index.hour

# Indicators for the part of the day, as half-open hour ranges [start, stop).
day_parts = {
    'night': (0, 7),
    'morning': (7, 12),
    'day': (12, 17),
    'evening': (17, 24),
}
for part_name, (start_hour, stop_hour) in day_parts.items():
    in_part = (data['hour'] >= start_hour) & (data['hour'] < stop_hour)
    data[part_name] = in_part.astype(int)

# Cyclic encodings: map day-of-week (period 7) and hour (period 24) onto the
# unit circle so that the last value of a cycle sits next to the first one.
data['sin_day'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
data['cos_day'] = np.cos(2 * np.pi * data['day_of_week'] / 7)
data['sin_hour'] = np.sin(2 * np.pi * data['hour'] / 24)
data['cos_hour'] = np.cos(2 * np.pi * data['hour'] / 24)
data.head()

Unnamed: 0_level_0,P1,P2,pressure,temperature,humidity,temp_meteo,pres_meteo,hum_meteo,wind_direction,wind_speed,...,hour,night,morning,day,evening,sin_day,cos_day,sin_hour,cos_sin,cos_hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-04-01 00:00:00,5.645,3.137143,98513.24,7.0925,55.9125,11.689907,100032.739742,64.597719,,1.105419,...,0,1,0,0,0,0.0,1.0,0.0,1.0,1.0
2019-04-01 00:05:00,5.513333,3.040595,98521.94,7.02,56.035,11.689907,100032.739742,64.597719,,1.105419,...,0,1,0,0,0,0.0,1.0,0.0,1.0,1.0
2019-04-01 00:10:00,5.504643,3.011786,98522.76,7.04125,55.885,11.689907,100032.739742,64.597719,,1.105419,...,0,1,0,0,0,0.0,1.0,0.0,1.0,1.0
2019-04-01 00:15:00,6.790714,3.461071,98529.485,6.98125,55.9075,11.689907,100032.739742,64.597719,,1.105419,...,0,1,0,0,0,0.0,1.0,0.0,1.0,1.0
2019-04-01 00:20:00,7.344643,4.065357,98529.3075,6.97875,55.855,11.689907,100032.739742,64.597719,,1.105419,...,0,1,0,0,0,0.0,1.0,0.0,1.0,1.0


In [31]:
# Feature set with time information: the meteo readings plus the weekend flag
# and the cyclic day/hour encodings. An earlier variant of this cell used the
# raw 'day_of_week' and 'hour' values together with the night/morning/day/
# evening flags instead of the sin/cos columns.
columns = [
    'hum_meteo',
    'temp_meteo',
    'pres_meteo',
    'wind_speed',
    'prec_amount',
    'weekend',
    'sin_day',
    'cos_day',
    'sin_hour',
    'cos_hour',
]
X = data.loc[:, columns]
y = data['P1']

In [32]:
# 70/30 split with a fixed seed so the partition is reproducible.
# NOTE(review): rows are shuffled although `data` is indexed by timestamp.
# Confirm that a random split (rather than a chronological one) is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [33]:
# Standardise the features: fit the scaler on the training part only, then
# apply the same transformation to both parts.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
# Refit ordinary least squares on the current (scaled) training features and
# predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
prediction = model.predict(X_test)

In [35]:
# Mean absolute error of the predictions on the held-out rows.
mean_absolute_error(y_true=y_test, y_pred=prediction)

9.109203483068152

In [36]:
# Pair each feature name with its fitted linear coefficient. The features
# were standardised before fitting, so the coefficients share one scale.
# The column label is spelled 'features' (was 'featurs') to match the
# coefficient table built for the polynomial model later in the notebook.
coef = pd.DataFrame({
    'features': columns,
    'importance': model.coef_,
})
coef

Unnamed: 0,featurs,importance
0,hum_meteo,4.773506
1,temp_meteo,-0.836561
2,pres_meteo,3.01311
3,wind_speed,-4.267691
4,prec_amount,-0.67594
5,weekend,0.304601
6,sin_day,-0.24254
7,cos_day,0.087451
8,sin_hour,-3.188235
9,cos_hour,0.508644


# Add polynomial features and interactions

In [37]:
# Expand the scaled features with all degree-2 terms (bias, originals,
# squares and pairwise products). The expansion is fitted on the training
# part and then applied unchanged to the test part.
poly = PolynomialFeatures(2)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)
# Human-readable names of the generated terms, built from `columns`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back otherwise so the
# cell runs on both old and new versions.
if hasattr(poly, 'get_feature_names_out'):
    names = list(poly.get_feature_names_out(columns))
else:
    names = poly.get_feature_names(columns)

In [68]:
# Ridge regression (L2 penalty, alpha=1.2) on the polynomial features. A
# plain LinearRegression() was the alternative previously tried in this cell.
model = Ridge(alpha=1.2).fit(X_train, y_train)
prediction = model.predict(X_test)

In [69]:
# Mean absolute error of the predictions on the held-out rows.
mean_absolute_error(y_true=y_test, y_pred=prediction)

8.194999498736765

In [70]:
# One row per generated polynomial term: its name and the fitted coefficient.
coef = pd.DataFrame({
    'features': names,
    'importance': model.coef_,
})
coef

Unnamed: 0,features,importance
0,1,0.000000
1,hum_meteo,6.693438
2,temp_meteo,-0.637944
3,pres_meteo,2.884333
4,wind_speed,-3.969827
...,...,...
61,cos_day sin_hour,-0.229726
62,cos_day cos_hour,-0.398459
63,sin_hour^2,0.319306
64,sin_hour cos_hour,0.297073
