In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Data preprocessing

In [16]:
# Load the pre-processed dataset; `date` is parsed as datetimes and becomes the index.
data = (
    pd.read_csv('../DATA/processed/dataset.csv', parse_dates=['date'])
    .set_index('date')
)

In [17]:
# "Осадков нет" (no precipitation) and "Следы осадков" (traces of precipitation)
# are text labels in the raw column; both are treated as 0.
no_precipitation = data['prec_amount'].isin(['Осадков нет', 'Следы осадков'])
data.loc[no_precipitation, 'prec_amount'] = 0

# Cast to float and fill the remaining gaps with 0, assigning the result back.
# The earlier `data.prec_amount.fillna(..., inplace=True)` operated on a column
# selection; pandas does not guarantee that such a call updates `data`
# (it is a no-op under Copy-on-Write), so the assignment is made explicit.
data['prec_amount'] = data['prec_amount'].astype(float).fillna(0)

In [18]:
# Fill gaps in the sensor readings by interpolating each column independently
# (default method of Series.interpolate).
for sensor_column in ('P1', 'P2', 'pressure', 'temperature', 'humidity'):
    data[sensor_column] = data[sensor_column].interpolate()

In [19]:
# Replace missing meteo values with the mean of their own column.
# The result is assigned back explicitly: calling `fillna(..., inplace=True)` on
# `data.<column>` acts on a column selection and is not guaranteed to update
# `data` (it is a no-op under pandas Copy-on-Write).
# NOTE(review): the means are computed over the whole dataset, i.e. before the
# train/test split performed later in the notebook.
meteo_columns = ['temp_meteo', 'pres_meteo', 'hum_meteo', 'wind_speed']
data[meteo_columns] = data[meteo_columns].fillna(data[meteo_columns].mean())

Проверяем, в каких столбцах остались пропущенные значения (NaN)

In [20]:
# Print the name of every column that still contains at least one missing value.
has_missing = data.isnull().any()
for column_name in data.columns[has_missing.values]:
    print(column_name)

wind_direction
precipitation
visibility
dew_point_temp


In [21]:
# Preview the first rows of the frame after the cleaning steps above.
data.head()

Unnamed: 0_level_0,P1,P2,pressure,temperature,humidity,temp_meteo,pres_meteo,hum_meteo,wind_direction,wind_speed,precipitation,prec_amount,visibility,dew_point_temp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-04-01 00:00:00,5.645,3.137143,98513.24,7.0925,55.9125,12.003271,100026.352198,64.131817,,1.103918,,0.0,,
2019-04-01 00:05:00,5.513333,3.040595,98521.94,7.02,56.035,12.003271,100026.352198,64.131817,,1.103918,,0.0,,
2019-04-01 00:10:00,5.504643,3.011786,98522.76,7.04125,55.885,12.003271,100026.352198,64.131817,,1.103918,,0.0,,
2019-04-01 00:15:00,6.790714,3.461071,98529.485,6.98125,55.9075,12.003271,100026.352198,64.131817,,1.103918,,0.0,,
2019-04-01 00:20:00,7.344643,4.065357,98529.3075,6.97875,55.855,12.003271,100026.352198,64.131817,,1.103918,,0.0,,


# Train test split

In [22]:
# Meteo-station features used as model inputs.
columns = [
    'hum_meteo',
    'temp_meteo',
    'pres_meteo',
    'wind_speed',
    'prec_amount',
]
# Feature matrix and regression target (the P1 sensor column).
X = data.loc[:, columns]
y = data['P1']

In [23]:
# Hold out 30% of the rows for testing; rows are shuffled with a fixed seed so
# the split is reproducible.
# NOTE(review): the index is a timestamp, so a shuffled split mixes past and
# future rows between train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [24]:
# Preview the training features produced by the split (still unscaled here).
X_train.head()

Unnamed: 0_level_0,hum_meteo,temp_meteo,pres_meteo,wind_speed,prec_amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-29 03:15:00,61.0,20.3,100111.4898,0.0,0.0
2019-04-19 18:25:00,23.0,12.1,101324.72,1.0,0.0
2019-11-12 22:55:00,71.0,4.2,101204.7302,1.0,0.0
2019-11-29 23:40:00,87.0,2.2,98938.2562,1.0,0.0
2019-10-18 23:45:00,82.0,13.1,100338.1372,2.0,0.0


# Model

In [25]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits. Note that X_train / X_test are replaced
# by the scaled arrays returned by the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
# Fit an ordinary least-squares model on the scaled training data
# (`fit` returns the estimator itself) and predict P1 for the test split.
model = LinearRegression().fit(X_train, y_train)
prediction = model.predict(X_test)

In [27]:
# Mean absolute error of the predictions on the held-out split,
# expressed in the same units as the P1 target.
mean_absolute_error(y_test, prediction)

9.436314424415318

# Feature importance

In [28]:
# Pair each feature name with its fitted linear coefficient. The features were
# standardised before fitting, so the coefficients are on a comparable scale.
# NOTE(review): the column label 'featurs' is misspelled; it is kept unchanged
# because other code may reference it.
coef = pd.DataFrame({
    'featurs': columns,
    'importance': model.coef_,
})
coef

Unnamed: 0,featurs,importance
0,hum_meteo,3.70323
1,temp_meteo,-1.187858
2,pres_meteo,2.770879
3,wind_speed,-3.872401
4,prec_amount,-0.348653
