In [3]:
import pandas as pd
import numpy as np
# Daily bike-share CSV hosted in the MicrosoftDocs/ml-basics repository.
DATA_URL = 'https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv'
data = pd.read_csv(DATA_URL)
# Last expression of the cell: show the dtype pandas inferred for each column.
data.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
rentals         int64
dtype: object

In [4]:
# Per-column count of missing values (isna is the pandas alias of isnull).
data.isna().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
rentals       0
dtype: int64

In [5]:
# Keep the model inputs plus the target column. Columns not listed here
# (instant, dteday, yr) are dropped from `data` by omission.
selected_columns = [
    'season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
    'temp', 'atemp', 'hum', 'windspeed',
    'rentals',
]
data = data[selected_columns]

In [7]:
from sklearn.model_selection import train_test_split

# 'rentals' is the prediction target; every other remaining column is a feature.
target_column = 'rentals'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [9]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric columns: mean-impute, then standardize.
# (The null check earlier in the notebook reports no missing values, so the
# imputers only matter for data seen later at prediction time.)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
# Categorical columns: constant-impute, then ordinal-encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OrdinalEncoder()),
])

# Alternative configuration, kept for reference only. To use it, import
# MinMaxScaler and OneHotEncoder from sklearn.preprocessing (this cell does not
# import them) and build the final pipeline after `preprocessor` is defined below.
#
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', MinMaxScaler()),
# ])
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant')),
#     ('encoder', OneHotEncoder()),
# ])
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', RandomForestRegressor(n_estimators=300, max_depth=10)),
# ])

# Route each column to the transformer that matches its role.
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features),
    ]
)

from sklearn.ensemble import RandomForestRegressor

# Preprocessing and the regressor live in one Pipeline, so the same transforms
# are applied at fit and predict time.
# random_state pins the forest's randomness so that Restart & Run All
# reproduces the same fitted model and score; 123 is the same seed value the
# train/test split uses.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=123)),
])

# Pipeline.fit returns the fitted pipeline itself, so rf_model and pipeline
# refer to the same object.
rf_model = pipeline.fit(X_train, y_train)
print(rf_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'hum',
                                                   'windspeed']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                        

In [10]:
from sklearn.metrics import r2_score

# Score the fitted pipeline on the held-out rows (R^2; y_true first, y_pred second).
predictions = rf_model.predict(X_test)
test_r2 = r2_score(y_test, predictions)
print(test_r2)

0.7695976015836663


In [12]:
import joblib

# Persist the fitted pipeline object to disk. The dump call stays the last
# expression so the cell still displays the list of written file names.
model_path = './rf_model.pkl'
joblib.dump(rf_model, model_path)

# To reuse the model from another notebook:
#   rf_model = joblib.load('PATH/TO/rf_model.pkl')
#   new_prediction = rf_model.predict(new_data)

['./rf_model.pkl']

In [14]:
# Template: compare several regressors using the same preprocessing step

# regressors = [
#     regressor_1()
#    ,regressor_2()
#    ,regressor_3()
#    ....]
# for regressor in regressors:
#     pipeline = Pipeline(steps = [
#                ('preprocessor', preprocessor)
#               ,('regressor',regressor)
#            ])
#     model = pipeline.fit(X_train, y_train)
#     predictions = model.predict(X_test)
#     print (regressor)
#     print (f'Model r2 score:{r2_score(y_test, predictions)}')