In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer, StandardScaler
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
%matplotlib inline
# Default figure size (width, height) applied to every plot created after this cell.
plt.rcParams['figure.figsize'] = [13, 8]

In [3]:
# Location of the training CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
TRAIN_CSV_PATH = '/home/alena/Tasks/Course/train.csv'

data = pd.read_csv(TRAIN_CSV_PATH)
data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


In [6]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured columns of ``X``.

    Parameters
    ----------
    columns
        Key (or list of keys) used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.columns``."""
        selected = X[self.columns]
        return selected

In [7]:
# Parse the timestamp column once, then derive calendar features from it.
data['datetime'] = pd.to_datetime(data['datetime'])
timestamps = data['datetime'].dt

# Each feature name is also the name of the matching `.dt` accessor attribute.
for feature in ('day', 'month', 'year', 'hour', 'dayofweek'):
    data.loc[:, feature] = getattr(timestamps, feature)

# Flag rows whose `dayofweek` is 5 or 6 with 1, all other rows with 0.
data.loc[:, 'weekend'] = np.where(data['dayofweek'].isin([5, 6]), 1, 0)

In [8]:
# Binary flag: 1 when the `weather` code equals 1, otherwise 0.
is_good_weather = data['weather'].eq(1)
data.loc[:, 'good_weather'] = is_good_weather.astype(int)

In [9]:
# Replace `windspeed` with its power-transformed values.
# NOTE(review): this overwrites the column it reads from, so re-running the
# cell transforms already-transformed values. The transformer is also fitted on
# every row of `data`, i.e. before the train/test split performed later.
transformer = PowerTransformer()
windspeed_2d = data['windspeed'].to_numpy().reshape(-1, 1)  # one column, n rows
data['windspeed'] = transformer.fit_transform(windspeed_2d).ravel()

In [10]:
# Inspect the column set after the feature-engineering cells above.
data.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'day', 'month', 'year', 'hour', 'dayofweek', 'weekend', 'good_weather'],
      dtype='object')

In [None]:
# Columns that get a target-encoded copy.
cat_columns = [
    'season', 'holiday', 'workingday',
    'month', 'dayofweek', 'weekend', 'good_weather',
]

# NOTE(review): the encoder is fitted on every row of `data` with `count` as
# the target, i.e. before the train/test split performed later in the notebook.
encoder = TargetEncoder(cols=cat_columns)
encoded_columns = [column + '_encoded' for column in cat_columns]

# Store the encoded values next to the raw columns under the `*_encoded` names.
encoded_values = encoder.fit_transform(data[cat_columns], data['count'])
data[encoded_columns] = encoded_values

In [None]:
# TODO: add TargetEncoder for the categorical features
# TODO: add StandardScaler for the numerical features
# 
# TODO: tune the hyperparameters
# TODO: add this pipeline to the project, configured with Hydra

In [11]:
# Feature names handed to the categorical (target-encoding) branch.
categorical_columns = [
    'season',
    'holiday',
    'workingday',
    'month',
    'dayofweek',
    'weekend',
    'good_weather',
]

# Feature names handed to the numerical branch.
# NOTE(review): 'month' is listed in both groups, so it reaches the model
# twice (once per branch) — confirm this is intended.
numerical_columns = [
    'temp',
    'atemp',
    'humidity',
    'windspeed',
    'month',
    'year',
    'hour',
]

In [43]:
# Categorical branch: select the categorical columns, then target-encode them.
cat_pipe = Pipeline(steps=[
    ('selector', ColumnSelector(categorical_columns)),
    ('encoder', TargetEncoder(cols=categorical_columns)),
])

# Numerical branch: select the numerical columns, then standardise them.
num_pipe = Pipeline(steps=[
    ('selector', ColumnSelector(numerical_columns)),
    ('scaler', StandardScaler()),
])

# Concatenate the outputs of both branches into one feature matrix.
preprocessor = FeatureUnion(transformer_list=[
    ('cat', cat_pipe),
    ('num', num_pipe),
])

model = LinearRegression()

# Full estimator: preprocessing followed by the linear model.
pipeline_lin_reg = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', model),
])

In [55]:
# Hold out 30% of the rows for evaluation. The fixed `random_state` makes the
# shuffle, and therefore the reported MAE, reproducible across runs.
# The whole frame is passed as X; the pipeline's selector steps pick the
# feature columns, so the target column is not used as a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['count'],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

pipeline_lin_reg.fit(X_train, y_train)
y_test_pred = pipeline_lin_reg.predict(X_test)

# Mean absolute error of the linear model on the held-out rows.
mean_absolute_error(y_test, y_test_pred)

104.88009750076546

In [52]:
# NOTE: this cell rebinds `cat_pipe`, `num_pipe`, `preprocessor` and `model`,
# which were previously bound to the linear-regression variants.

# Categorical branch: select the categorical columns, then target-encode them.
cat_pipe = Pipeline([
    ('selector', ColumnSelector(categorical_columns)),
    ('encoder', TargetEncoder(cols=categorical_columns)),
])

# Numerical branch: only selects the numerical columns (no scaling step here).
num_pipe = Pipeline([
    ('selector', ColumnSelector(numerical_columns)),
])

# Concatenate the outputs of both branches into one feature matrix.
preprocessor = FeatureUnion([
    ('cat', cat_pipe),
    ('num', num_pipe),
])

# Fixed seed so the fitted forest, and the metric computed from it, is
# reproducible across runs.
model = RandomForestRegressor(random_state=42)

# Full estimator: preprocessing followed by the random forest.
pipeline_random_forest = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model),
])

In [56]:
# Hold out 30% of the rows for evaluation. The fixed `random_state` makes the
# shuffle, and therefore the reported MAE, reproducible across runs.
# The whole frame is passed as X; the pipeline's selector steps pick the
# feature columns, so the target column is not used as a feature.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['count'],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

pipeline_random_forest.fit(X_train, y_train)
y_test_pred = pipeline_random_forest.predict(X_test)

# Mean absolute error of the random forest on the held-out rows.
mean_absolute_error(y_test, y_test_pred)

26.546633241477853