### PHASE 1 : GET DATA 

##### Pandas is a Python library used for working with data sets

In [None]:

import pandas as pd 

##### Matplotlib is a comprehensive library for creating static, animated, and interactive visualizations in Python. Matplotlib makes easy things easy and hard things possible.

In [None]:
import matplotlib.pyplot as plt

##### Numpy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.

In [None]:
import numpy as np 

##### The line below changes Matplotlib's default figure size: it sets the width to 10 inches and the height to 5 inches. This affects all subsequent figures created using plt.figure() or functions that create subplots, unless explicitly overridden.

In [None]:
# Every figure created from here on is 10 inches wide and 5 inches tall
# unless a plot overrides the size itself.
plt.rcParams.update({'figure.figsize': [10, 5]})

### Importing Data

#####  The dataset is stored in a CSV file, but I want to work with it as a data frame. pandas (imported as `pd`) has a function called `read_csv`, which reads a CSV file and turns it into a data frame. The line below reads the data and stores the resulting data frame in a variable called `bike_df`.

In [None]:
# Read the daily bike-sharing CSV into a DataFrame, then display it.
bike_df = pd.read_csv(filepath_or_buffer='bike_sharing_daily.csv')
bike_df

#####  Here we add another feature called `date`. It uses another pandas function, `to_datetime`, which takes a string and turns it into a datetime. The `dteday` column is a string, and this line turns it into a datetime object, which behaves like a kind of number (so dates can be compared and ordered).

In [None]:
# Convert the 'dteday' column to datetimes and store the result as a new
# 'date' column, then display the frame.
bike_df['date'] = bike_df['dteday'].pipe(pd.to_datetime)
bike_df

### PHASE 2: PLAY WITH DATA

#### The plot below has three different things in it: the date, the count, and the weather situation. The weather situation is 1, 2 or 3. The count is a number in the thousands. The date is a date.

##### Here, yellow is bad weather, purple is the best weather, and blue is in between.

In [None]:
# Daily count over time, with each point coloured by its 'weathersit' value.
# The colorbar maps the colours back to the weather-situation codes so the
# figure can be read without the surrounding text.
weather_scatter = plt.scatter(bike_df['date'], bike_df['cnt'], c=bike_df['weathersit'])
plt.colorbar(
    weather_scatter,
    ticks=sorted(bike_df['weathersit'].unique()),  # one tick per code present in the data
    label='weathersit',
)
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Daily count over time, coloured by weather situation')
plt.show()

In [None]:
# Average count of bikes rented on days where the weather situation is 1.
bike_df.loc[bike_df['weathersit'] == 1, 'cnt'].mean()

In [None]:
# Average count of bikes rented on days where the weather situation is 2.
bike_df.loc[bike_df['weathersit'] == 2, 'cnt'].mean()

In [None]:
# Average count of bikes rented on days where the weather situation is 3.
bike_df.loc[bike_df['weathersit'] == 3, 'cnt'].mean()

#### The plot below shows the temperature against the count.

In [None]:
# One point per row: 'atemp' on the x-axis, 'cnt' on the y-axis.
plt.scatter(x=bike_df['atemp'], y=bike_df['cnt'])

### Temperature and Correlation

In [None]:
# Pearson correlation matrix for the 'cnt' and 'atemp' columns.
bike_df.loc[:, ['cnt', 'atemp']].corr(method='pearson')

In [None]:
# Pearson correlation matrix for the 'cnt' and 'temp' columns.
bike_df.loc[:, ['cnt', 'temp']].corr(method='pearson')

In [None]:
# Pearson correlation matrix across the count and the weather-related columns.
correlation_columns = ['cnt', 'atemp', 'temp', 'hum', 'windspeed', 'weathersit']
bike_df.loc[:, correlation_columns].corr(method='pearson')

### PHASE 3 : IMPLEMENT LEARNING
LEARNING MODEL - LINEAR REGRESSION 

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# One-feature linear regression: 'atemp' is the independent variable and
# 'cnt' the dependent variable. Double-bracket selection yields the same
# (n, 1) column arrays that reshape(-1, 1) would.
lr = LinearRegression().fit(
    bike_df[['atemp']].to_numpy(),
    bike_df[['cnt']].to_numpy(),
)
# fit() returns the estimator itself, so this displays the fitted model.
lr

In [None]:
# Observed points first ...
plt.scatter(x=bike_df['atemp'], y=bike_df['cnt'])
# ... then the model's prediction for every observed 'atemp', drawn in red.
fitted_line = lr.predict(bike_df[['atemp']].to_numpy())
plt.plot(bike_df['atemp'], fitted_line, color='red')

In [None]:
# Residual plot: observed points, the fitted line, and a dashed vertical
# segment from each observation to the model's prediction at the same 'atemp'.
atemp_values = bike_df['atemp'].to_numpy()
cnt_values = bike_df['cnt'].to_numpy()

# One vectorised predict() call for all rows instead of one call per row.
# ravel() flattens the predictions to 1-D whether the model returns shape
# (n,) or (n, 1), so no hard-coded [0][0] indexing is needed.
fitted_values = lr.predict(atemp_values.reshape(-1, 1)).ravel()

# plot fitted line
plt.scatter(atemp_values, cnt_values)
plt.plot(atemp_values, fitted_values, c='red')

# plot the residuals: a single vlines() call draws every segment at once
plt.vlines(atemp_values, cnt_values, fitted_values, colors='black', linestyles='dashed')

plt.xlabel('atemp')
plt.ylabel('cnt')
plt.title('Residuals of the Linear Regression Model')
plt.show()

In [None]:
# Chronological hold-out split: rows dated before the cut-off train the model,
# rows on or after it are kept back for validation.
split_date = '2012-06-01'
# Single list of model inputs so the training and validation matrices always
# use the same columns in the same order.
feature_columns = ['atemp', 'workingday', 'hum', 'weathersit']

training_set = bike_df[bike_df['date'] < split_date]
validation_set = bike_df[bike_df['date'] >= split_date]

training_inputs = training_set[feature_columns].values
# [['cnt']] keeps the target as a 2-D (n, 1) column, the layout the
# predictions are subtracted from in the RMSE below.
training_outputs = training_set[['cnt']].values

validation_inputs = validation_set[feature_columns].values
validation_outputs = validation_set[['cnt']].values

lr = LinearRegression()
lr.fit(training_inputs, training_outputs)

# Predict once per split and reuse the result for both the plot and the metric.
training_predictions = lr.predict(training_inputs)
validation_predictions = lr.predict(validation_inputs)

# Actual vs. predicted counts over the training period.
plt.scatter(training_set['date'], training_set['cnt'], label='actual')
plt.scatter(training_set['date'], training_predictions, label='predicted')
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Training set: actual vs predicted')
plt.legend()
plt.show()

# Actual vs. predicted counts over the held-out validation period.
plt.scatter(validation_set['date'], validation_set['cnt'], label='actual')
plt.scatter(validation_set['date'], validation_predictions, label='predicted')
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Validation set: actual vs predicted')
plt.legend()
plt.show()

# Root-mean-squared error on the validation period, in the units of 'cnt'.
rmse = np.sqrt(((validation_predictions - validation_outputs) ** 2).mean())
rmse

In [None]:
# Chronological hold-out split: rows dated before the cut-off train the model,
# rows on or after it are kept back for validation.
split_date = '2012-06-01'
# Single list of model inputs so the training and validation matrices always
# use the same columns in the same order. This run adds 'windspeed'.
feature_columns = ['atemp', 'workingday', 'hum', 'weathersit', 'windspeed']

training_set = bike_df[bike_df['date'] < split_date]
validation_set = bike_df[bike_df['date'] >= split_date]

training_inputs = training_set[feature_columns].values
# [['cnt']] keeps the target as a 2-D (n, 1) column, the layout the
# predictions are subtracted from in the RMSE below.
training_outputs = training_set[['cnt']].values

validation_inputs = validation_set[feature_columns].values
validation_outputs = validation_set[['cnt']].values

lr = LinearRegression()
lr.fit(training_inputs, training_outputs)

# Predict once per split and reuse the result for both the plot and the metric.
training_predictions = lr.predict(training_inputs)
validation_predictions = lr.predict(validation_inputs)

# Actual vs. predicted counts over the training period.
plt.scatter(training_set['date'], training_set['cnt'], label='actual')
plt.scatter(training_set['date'], training_predictions, label='predicted')
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Training set: actual vs predicted')
plt.legend()
plt.show()

# Actual vs. predicted counts over the held-out validation period.
plt.scatter(validation_set['date'], validation_set['cnt'], label='actual')
plt.scatter(validation_set['date'], validation_predictions, label='predicted')
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Validation set: actual vs predicted')
plt.legend()
plt.show()

# Root-mean-squared error on the validation period, in the units of 'cnt'.
rmse = np.sqrt(((validation_predictions - validation_outputs) ** 2).mean())
rmse

### BACK TO PHASE 2 : FEATURE ENGINEERING

This calculates the average of the past 7 days by taking the difference between today’s cumulative sum and the cumulative sum 7 days ago, and dividing by 7

In [None]:
# Average count over the 7 rows *before* each row. shift(1) moves the window
# back by one row so the feature never contains the row's own 'cnt', which is
# the value the models in this notebook are fitted to predict. A window ending
# on the current row (e.g. cumsum() - cumsum().shift(7)) would leak the target.
# NOTE(review): assumes one row per consecutive day, in date order - confirm.
bike_df['last_week'] = bike_df['cnt'].shift(1).rolling(window=7).mean()

# Drop only the rows where the new feature is undefined (the first 7, which
# lack a full week of history). .copy() returns an independent frame so later
# column assignments are not made on a slice of the old one.
# NOTE: this cell rebinds bike_df, so re-running it on its own removes another
# 7 rows; re-run from the data-loading cell instead.
bike_df = bike_df.dropna(subset=['last_week']).copy()
bike_df

In [None]:
# Chronological hold-out split: rows dated before the cut-off train the model,
# rows on or after it are kept back for validation.
split_date = '2012-06-01'
# Single list of model inputs so the training and validation matrices always
# use the same columns in the same order. This run adds 'last_week'.
feature_columns = ['atemp', 'workingday', 'hum', 'weathersit', 'last_week', 'windspeed']

training_set = bike_df[bike_df['date'] < split_date]
validation_set = bike_df[bike_df['date'] >= split_date]

training_inputs = training_set[feature_columns].values
# [['cnt']] keeps the target as a 2-D (n, 1) column, the layout the
# predictions are subtracted from in the RMSE below.
training_outputs = training_set[['cnt']].values

validation_inputs = validation_set[feature_columns].values
validation_outputs = validation_set[['cnt']].values

lr = LinearRegression()
lr.fit(training_inputs, training_outputs)

# Predict once per split and reuse the result for both the plot and the metric.
training_predictions = lr.predict(training_inputs)
validation_predictions = lr.predict(validation_inputs)

# Actual vs. predicted counts over the training period.
plt.scatter(training_set['date'], training_set['cnt'], label='actual')
plt.scatter(training_set['date'], training_predictions, label='predicted')
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Training set: actual vs predicted')
plt.legend()
plt.show()

# Actual vs. predicted counts over the held-out validation period.
plt.scatter(validation_set['date'], validation_set['cnt'], label='actual')
plt.scatter(validation_set['date'], validation_predictions, label='predicted')
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Validation set: actual vs predicted')
plt.legend()
plt.show()

# Root-mean-squared error on the validation period, in the units of 'cnt'.
rmse = np.sqrt(((validation_predictions - validation_outputs) ** 2).mean())
rmse

In [None]:
# Average count over the 30 rows *before* each row. shift(1) moves the window
# back by one row so the feature never contains the row's own 'cnt', which is
# the value the models in this notebook are fitted to predict.
# assign() builds a new frame, so the column is never written into a frame
# that was itself produced by an earlier dropna().
# NOTE(review): assumes one row per consecutive day, in date order - confirm.
bike_df = bike_df.assign(
    last_month=bike_df['cnt'].shift(1).rolling(window=30).mean()
)

# Drop only the rows where the new feature is undefined (the first 30 rows of
# the current frame, which lack a full 30-row history).
# NOTE: this cell rebinds bike_df, so re-running it on its own removes another
# 30 rows; re-run from the data-loading cell instead.
bike_df = bike_df.dropna(subset=['last_month'])
bike_df


In [None]:
# Chronological hold-out split: rows dated before the cut-off train the model,
# rows on or after it are kept back for validation.
split_date = '2012-06-01'
# Single list of model inputs so the training and validation matrices always
# use the same columns in the same order. This run adds 'last_month'.
feature_columns = ['atemp', 'workingday', 'hum', 'weathersit', 'last_week', 'last_month', 'windspeed']

training_set = bike_df[bike_df['date'] < split_date]
validation_set = bike_df[bike_df['date'] >= split_date]

training_inputs = training_set[feature_columns].values
# [['cnt']] keeps the target as a 2-D (n, 1) column, the layout the
# predictions are subtracted from in the RMSE below.
training_outputs = training_set[['cnt']].values

validation_inputs = validation_set[feature_columns].values
validation_outputs = validation_set[['cnt']].values

lr = LinearRegression()
lr.fit(training_inputs, training_outputs)

# Predict once per split and reuse the result for both the plot and the metric.
training_predictions = lr.predict(training_inputs)
validation_predictions = lr.predict(validation_inputs)

# Actual vs. predicted counts over the training period.
plt.scatter(training_set['date'], training_set['cnt'], label='actual')
plt.scatter(training_set['date'], training_predictions, label='predicted')
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Training set: actual vs predicted')
plt.legend()
plt.show()

# Actual vs. predicted counts over the held-out validation period.
plt.scatter(validation_set['date'], validation_set['cnt'], label='actual')
plt.scatter(validation_set['date'], validation_predictions, label='predicted')
plt.xlabel('date')
plt.ylabel('cnt')
plt.title('Validation set: actual vs predicted')
plt.legend()
plt.show()

# Root-mean-squared error on the validation period, in the units of 'cnt'.
rmse = np.sqrt(((validation_predictions - validation_outputs) ** 2).mean())
rmse