# Predictions

### Running the Necessary Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import datetime

In [2]:
# Load the data from prediction_data.csv into the `bike` DataFrame
bike = pd.read_csv("prediction_data.csv")

In [3]:
# Preview the first five rows to give a glimpse into the structure of the data
bike.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature (C),Humidity (%),Seasons,Holiday,Functioning Day,Precipitation,Day,Month,Year
0,254,0,-5.2,37,1,0,1,0,1,12,2017
1,204,1,-5.5,38,1,0,1,0,1,12,2017
2,173,2,-6.0,39,1,0,1,0,1,12,2017
3,107,3,-6.2,40,1,0,1,0,1,12,2017
4,78,4,-6.0,36,1,0,1,0,1,12,2017


### Checking For Missing Data
No missing values were found: every column reports a missing-value count of 0.

In [4]:
# Count missing values per column; a 0 in every row means no missing data
bike.isna().sum()

Rented Bike Count    0
Hour                 0
Temperature (C)      0
Humidity (%)         0
Seasons              0
Holiday              0
Functioning Day      0
Precipitation        0
Day                  0
Month                0
Year                 0
dtype: int64

In [5]:
# List the dtype of each column to confirm that every feature is numeric
bike.dtypes

Rented Bike Count      int64
Hour                   int64
Temperature (C)      float64
Humidity (%)           int64
Seasons                int64
Holiday                int64
Functioning Day        int64
Precipitation          int64
Day                    int64
Month                  int64
Year                   int64
dtype: object

In [6]:
# Re-display the first five rows; the checks above only read `bike`,
# so this matches the earlier preview
bike.head()

Unnamed: 0,Rented Bike Count,Hour,Temperature (C),Humidity (%),Seasons,Holiday,Functioning Day,Precipitation,Day,Month,Year
0,254,0,-5.2,37,1,0,1,0,1,12,2017
1,204,1,-5.5,38,1,0,1,0,1,12,2017
2,173,2,-6.0,39,1,0,1,0,1,12,2017
3,107,3,-6.2,40,1,0,1,0,1,12,2017
4,78,4,-6.0,36,1,0,1,0,1,12,2017


In [7]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Creating function to evaluate predictions later on
def eval_classification(y_true, y_preds):
    """
    Print regression metrics comparing true targets with predicted targets.

    Despite its name, this function reports regression metrics
    (R^2, MAE and MSE), not classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_preds : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    None
        The metrics are printed rather than returned.
    """
    print("Regression metrics on the test set")
    # Score against the y_true argument, not a notebook-level variable, so the
    # function works for any targets/predictions pair and does not depend on
    # which other cells have already been run.
    print(f"R^2: {r2_score(y_true, y_preds)}")
    print(f"MAE: {mean_absolute_error(y_true, y_preds)}")
    print(f"MSE: {mean_squared_error(y_true, y_preds)}")

### Splitting the data into training and testing sets, then fitting and scoring the model

In [10]:
# Split into features (X) and target (y)
X = bike.drop(["Rented Bike Count"], axis=1)
y = bike["Rented Bike Count"]

# Hold out 20% of the rows for testing. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition and metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=40
)

# Initialize the RandomForestRegressor model (seeded for reproducibility)
clf = RandomForestRegressor(random_state=40)

# Fit on the training data, then score (R^2) on the held-out test data
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8994250760637106

In [12]:
# Predict the rented bike count for every row of the held-out test features
y_preds = clf.predict(X_test)
# Display the predictions
y_preds

array([ 105.72, 1537.57,  228.99, ..., 1738.03,   46.64,  662.89])

In [13]:
# Display the true test-set targets for comparison with the predictions above
y_test

2140      72
7531    1723
1885     193
1310     125
7084      96
        ... 
8402     376
7920     640
5542    1870
6501     122
6350     760
Name: Rented Bike Count, Length: 1752, dtype: int64

### Evaluating the RandomForestRegressor model on the test set

In [18]:
# Report each regression metric for the test-set predictions, one per line
print("Regression metrics on the test set")
for metric_label, metric_fn in (("R^2", r2_score), ("MAE", mean_absolute_error)):
    print(f"{metric_label}: {metric_fn(y_test, y_preds)}")

Regression metrics on the test set
R^2: 0.8994250760637106
MAE: 125.73565639269407
