In [3]:
import pandas as pd
import numpy as np
data = pd.read_csv('https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv')
data.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
rentals         int64
dtype: object

In [4]:
data.isnull().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
rentals       0
dtype: int64

In [5]:
data = data[['season'
             , 'mnth'
             , 'holiday'
             , 'weekday'
             , 'workingday'
             , 'weathersit'
             , 'temp'
             , 'atemp'
             , 'hum'
             , 'windspeed'
             , 'rentals']]

In [7]:
from sklearn.model_selection import train_test_split
X = data.drop('rentals',axis=1)
y = data['rentals']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [9]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean'))
      ,('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant'))
      ,('encoder', OrdinalEncoder())
])

# numeric_transformer = Pipeline(steps=[
#        ('imputer', SimpleImputer(strategy='median'))
#       ,('scaler', MinMaxScaler())
# ])
# categorical_transformer = Pipeline(steps=[
#        ('imputer', SimpleImputer(strategy='constant'))
#       ,('encoder', OneHotEncoder())
# ])
# pipeline = Pipeline(steps = [
#                ('preprocessor', preprocessor)
#               ,('regressor',RandomForestRegressor(n_estimators=300
#                                                  ,max_depth=10))
#            ])



numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features)
   ,('categorical', categorical_transformer, categorical_features)
]) 

from sklearn.ensemble import RandomForestRegressor
pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('regressor',RandomForestRegressor())
           ])

rf_model = pipeline.fit(X_train, y_train)
print (rf_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'hum',
                                                   'windspeed']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                        

In [10]:
from sklearn.metrics import r2_score
predictions = rf_model.predict(X_test)
print (r2_score(y_test, predictions))

0.7695976015836663


In [12]:
import joblib
joblib.dump(rf_model, './rf_model.pkl')

# In other notebooks 
# rf_model = joblib.load('PATH/TO/rf_model.pkl')
# new_prediction = rf_model.predict(new_data)

['./rf_model.pkl']

In [14]:
# easily comparing

# regressors = [
#     regressor_1()
#    ,regressor_2()
#    ,regressor_3()
#    ....]
# for regressor in regressors:
#     pipeline = Pipeline(steps = [
#                ('preprocessor', preprocessor)
#               ,('regressor',regressor)
#            ])
#     model = pipeline.fit(X_train, y_train)
#     predictions = model.predict(X_test)
#     print (regressor)
#     print (f('Model r2 score:{r2_score(predictions, y_test)}')

In [18]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import optuna

# Assuming X_train and y_train are your training data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OrdinalEncoder())
])

numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('categorical', categorical_transformer, categorical_features)
    ])

# Remove RandomForestRegressor and use Optuna for hyperparameter tuning
def objective(trial):
    # Define hyperparameters to be tuned
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)

    # Create a pipeline with hyperparameters suggested by Optuna
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth))
    ])

    # Evaluate the score by cross-validation
    score = cross_val_score(pipeline, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    mean_score = score.mean()

    return mean_score

# Create Optuna study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']

# Create the final pipeline with the best hyperparameters
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=best_n_estimators, max_depth=best_max_depth))
])

# Fit the final model with the best hyperparameters to the entire training data
final_model = final_pipeline.fit(X_train, y_train)

# Save the best model using joblib
joblib.dump(final_model, './rf_model.pkl')

print("Best Hyperparameters:", best_params)
print("Final Model saved as rf_model.pkl")

[I 2024-01-04 11:52:07,243] A new study created in memory with name: no-name-879a42f9-40a2-4a4b-817a-2c00c28405b1
[I 2024-01-04 11:52:07,855] Trial 0 finished with value: -119848.29106513267 and parameters: {'n_estimators': 160, 'max_depth': 14}. Best is trial 0 with value: -119848.29106513267.
[I 2024-01-04 11:52:09,208] Trial 1 finished with value: -119974.1987709306 and parameters: {'n_estimators': 383, 'max_depth': 14}. Best is trial 1 with value: -119974.1987709306.
[I 2024-01-04 11:52:10,685] Trial 2 finished with value: -118445.79410340157 and parameters: {'n_estimators': 413, 'max_depth': 24}. Best is trial 1 with value: -119974.1987709306.
[I 2024-01-04 11:52:11,777] Trial 3 finished with value: -119572.60459386476 and parameters: {'n_estimators': 297, 'max_depth': 16}. Best is trial 1 with value: -119974.1987709306.
[I 2024-01-04 11:52:12,998] Trial 4 finished with value: -120620.33672389579 and parameters: {'n_estimators': 345, 'max_depth': 21}. Best is trial 4 with value: -

Best Hyperparameters: {'n_estimators': 53, 'max_depth': 27}
Final Model saved as rf_model.pkl
