In [1]:
#Importing all libraries
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np 
%matplotlib inline

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process.kernels import RationalQuadratic, ConstantKernel as C
from sklearn.preprocessing import StandardScaler


Preparing training sets

In [2]:
#Load the already-transformed scraper export into the dataframe used by the cells below
clean_data = pd.read_csv(filepath_or_buffer = 'clean_data_final.csv')

In [3]:
#Assumptions already applied to the data in this csv file:
#   1) Scraper collection is assumed to have begun on Monday, Nov 6th, and ended on the
#      Tuesday of the following week, Nov 14th.
#   2) Monday and Tuesday were therefore recorded twice (Nov 6th/7th and again Nov 13th/14th).
#      The trends across each pair are assumed to be similar enough to count as the same day,
#      so the csv has no rows dated Nov 13th or 14th: that scraped data was simply added
#      to Nov 6th and 7th.

#Preview the raw timestamp column
clean_data.loc[:, 'Time']

0      2023-11-06 02:10:21
1      2023-11-06 02:30:14
2      2023-11-06 02:50:08
3      2023-11-06 03:11:00
4      2023-11-06 03:30:55
              ...         
453    2023-11-07 10:40:01
454    2023-11-07 11:42:25
455    2023-11-07 12:05:30
456    2023-11-07 12:20:37
457    2023-11-07 12:36:21
Name: Time, Length: 458, dtype: object

In [4]:
#Parse the 'Time' strings into pandas datetimes; errors = 'raise' stops on any unparseable timestamp

parsed_times = pd.to_datetime(clean_data['Time'], errors = 'raise')
clean_data['Time_obj'] = parsed_times
clean_data['Time_obj']

0     2023-11-06 02:10:21
1     2023-11-06 02:30:14
2     2023-11-06 02:50:08
3     2023-11-06 03:11:00
4     2023-11-06 03:30:55
              ...        
453   2023-11-07 10:40:01
454   2023-11-07 11:42:25
455   2023-11-07 12:05:30
456   2023-11-07 12:20:37
457   2023-11-07 12:36:21
Name: Time_obj, Length: 458, dtype: datetime64[ns]

In [5]:
#Extract the day of the month from each timestamp through the .dt accessor
day_of_month = clean_data['Time_obj'].dt.day
clean_data['day'] = day_of_month
pd.unique(clean_data['day'])

array([ 6,  7,  8,  9, 10, 11, 12], dtype=int32)

In [6]:
#Changing days to a transformable form:
#preparation for conversion from days -> hours

#Multiplying the day of the month (the 6th, the 9th, the 12th, ...) by 24 would not give hours
#into the week, so each row is numbered by its day of the week instead: Monday = 0 ... Sunday = 6.
#Multiplying that number by 24 then gives the hour at which the day starts within a 168-hour
#week (Monday 12:00 AM = hour 0, Monday 12:00 PM = hour 12, Tuesday 12:00 AM = hour 24, ...).

#The weekday is read straight from the timestamp. An earlier replace() loop rewrote the 'day'
#column in place, so running the cell a second time remapped the already converted 6 (Sunday)
#back to 0. Deriving the value from 'Time_obj' gives the same 0-6 numbering for the
#Nov 6th (Monday) - Nov 12th (Sunday) data, does not depend on the start date, and is safe to re-run.
clean_data['day'] = clean_data['Time_obj'].dt.dayofweek

clean_data['day'].unique() #Voila

array([0, 1, 2, 3, 4, 5, 6], dtype=int32)

In [7]:
#Express the start of each day as hours into the week (24 hours per day)
clean_data['days_as_hours'] = clean_data['day'].mul(24)
pd.unique(clean_data['days_as_hours'])

array([  0,  24,  48,  72,  96, 120, 144], dtype=int32)

In [8]:
#Build the final time: days, hours and minutes all expressed in hours
#(the seconds part of each timestamp is not used)
timestamps = clean_data['Time_obj']

#Retrieving minutes
clean_data['mins'] = timestamps.dt.minute

#Retrieving hours
clean_data['hrs'] = timestamps.dt.hour

#Minutes as a fraction of an hour
clean_data['mins_as_hrs'] = clean_data['mins'] / 60.0

#Hour of day plus fractional minutes, then offset by the start hour of the day
clean_data['hrs_and_mins'] = clean_data['hrs'] + clean_data['mins_as_hrs']
clean_data['Final_time'] = clean_data['hrs_and_mins'] + clean_data['days_as_hours']

#Double checking shape and formatting
clean_data['Final_time']

0       2.166667
1       2.500000
2       2.833333
3       3.183333
4       3.500000
         ...    
453    34.666667
454    35.700000
455    36.083333
456    36.333333
457    36.600000
Name: Final_time, Length: 458, dtype: float64

In [9]:
#Training set, X axis: 'Final_time' copied into a single-column 2-D array
Time_scraped = clean_data['Final_time'].to_numpy(copy = True).reshape(-1, 1)

Time_scraped.shape

(458, 1)

In [10]:
#Training sets, Y axis: one single-column 2-D array per column, PG1 through PG6
y_pg1, y_pg2, y_pg3, y_pg4, y_pg5, y_pg6 = (
    np.array(clean_data[column]).reshape(-1, 1)
    for column in ('PG1', 'PG2', 'PG3', 'PG4', 'PG5', 'PG6')
)

Prompting User Input

In [11]:
#Requesting user input
time_request = input("Enter a time in 00:00 24-hour time formatting: ")
day_request = input("Enter the day of the week (Monday, Tuesday, etc.): ")
print("Calculating empty spaces for each garage at " + time_request + " on " + day_request + ".")

#Raises if the time cannot be parsed
time_request = pd.to_datetime(time_request, errors = 'raise')

#Changes String days to the numbered version of the days of the week (Monday = 0 ... Sunday = 6).
#The lookup ignores letter case and surrounding spaces, and an unknown day is rejected here;
#otherwise day_request would stay a string and fail with an unrelated TypeError in the sum below.
day_numbers = {
    "monday": 0,
    "tuesday": 1,
    "wednesday": 2,
    "thursday": 3,
    "friday": 4,
    "saturday": 5,
    "sunday": 6,
}
day_key = day_request.strip().lower()
if day_key not in day_numbers:
    raise ValueError(f"Unrecognized day of the week: {day_request!r}. Expected one of Monday, Tuesday, ..., Sunday.")
day_request = day_numbers[day_key]

#Hours into the week: start hour of the day + hour of day + minutes as a fraction of an hour
request = (day_request * 24) + time_request.hour + (time_request.minute / 60)

#Predictions need to be passed through as a 2-D array, so place the value into one and reshape it
request_df = pd.DataFrame(request, index = [0], columns = ['Request'])
final_request = np.array(request_df).reshape(-1, 1)

final_request

Calculating empty spaces for each garage at 13:00 on Monday.


array([[13.]])

In [12]:
#PG1 Model

#Initializing kernels
#NOTE: despite its name, noise_kernel_pg1 is a scaled RationalQuadratic kernel, not a white-noise term
noise_kernel_pg1 = C(1.0, (1e-3, 1e3)) * RationalQuadratic(length_scale = 1.0, alpha = 1.0, length_scale_bounds = (1e-5, 1e5), alpha_bounds = (1e-10, 1e10))
rbf_pg1 = 1.0 * RBF(length_scale = 1.0, length_scale_bounds = (1e-2, 1e2))

#Combined kernel_pg1
#NOTE(review): PG1 multiplies the two kernels while the PG2-PG6 models add them - confirm this is intended
kernel_pg1 = (noise_kernel_pg1 * rbf_pg1)

#Initialize GaussianProcessRegressor and fit
#alpha is the value added to the diagonal of the kernel matrix during fitting; it acts as the
#variance (not the standard deviation) of noise on the training observations.
#random_state fixes the random restarts requested by n_restarts_optimizer so a re-run reproduces the same fit.
gaussian_process_pg1 = GaussianProcessRegressor(kernel = kernel_pg1, alpha = 0.1, n_restarts_optimizer = 10, normalize_y=True, random_state = 0)

gaussian_process_pg1.fit(Time_scraped, y_pg1)
gaussian_process_pg1.kernel_

1.1**2 * RationalQuadratic(alpha=0.159, length_scale=6.06) * 0.898**2 * RBF(length_scale=6.76)

In [13]:
#PG1: predicted mean and its standard deviation at the requested time
mean_prediction_pg1, std_prediction_pg1 = gaussian_process_pg1.predict(
    X = final_request,
    return_std = True,
)

In [14]:
#Pg2 Model

#Kernels
#NOTE: despite its name, noise_kernel_pg2 is a scaled RationalQuadratic kernel, not a white-noise term
noise_kernel_pg2 = C(1.0, (1e-3, 1e3)) * RationalQuadratic(length_scale = 1.0, alpha = 1.0)
rbf_pg2 = 1.0 * RBF(length_scale = 1.0, length_scale_bounds = (1e-2, 1e2))

#Sum of the two kernels
kernel_pg2 = (rbf_pg2 + noise_kernel_pg2)

#Initialize GaussianProcessRegressor and fit
#alpha is the value added to the diagonal of the kernel matrix during fitting (noise variance).
#random_state fixes the random restarts requested by n_restarts_optimizer so a re-run reproduces the same fit.
gaussian_process_pg2 = GaussianProcessRegressor(kernel = kernel_pg2, alpha = 0.1, n_restarts_optimizer = 10, normalize_y=True, random_state = 0)

gaussian_process_pg2.fit(Time_scraped, y_pg2)
gaussian_process_pg2.kernel_




0.677**2 * RBF(length_scale=3.57) + 0.699**2 * RationalQuadratic(alpha=1e+05, length_scale=43.2)

In [15]:
#PG2: predicted mean and its standard deviation at the requested time
mean_prediction_pg2, std_prediction_pg2 = gaussian_process_pg2.predict(
    X = final_request,
    return_std = True,
)

In [16]:
#Pg3 Model

#Kernels
#NOTE: despite its name, noise_kernel_pg3 is a scaled RationalQuadratic kernel, not a white-noise term
noise_kernel_pg3 = C(1.0, (1e-3, 1e3)) * RationalQuadratic(length_scale = 1.0, alpha = 1.0)
rbf_pg3 = 1.0 * RBF(length_scale = 1.0, length_scale_bounds = (1e-2, 1e2))

#Sum of the two kernels
kernel_pg3 = (rbf_pg3 + noise_kernel_pg3)

#Initialize GaussianProcessRegressor and fit
#alpha is the value added to the diagonal of the kernel matrix during fitting (noise variance).
#random_state fixes the random restarts requested by n_restarts_optimizer so a re-run reproduces the same fit.
gaussian_process_pg3 = GaussianProcessRegressor(kernel = kernel_pg3, alpha = 0.1, n_restarts_optimizer = 10, normalize_y=True, random_state = 0)

gaussian_process_pg3.fit(Time_scraped, y_pg3)
gaussian_process_pg3.kernel_




0.464**2 * RBF(length_scale=2.54) + 0.877**2 * RationalQuadratic(alpha=1e+05, length_scale=5.95)

In [17]:
#PG3: predicted mean and its standard deviation at the requested time
mean_prediction_pg3, std_prediction_pg3 = gaussian_process_pg3.predict(
    X = final_request,
    return_std = True,
)

In [18]:
#Pg4 Model

#Kernels
#NOTE: despite its name, noise_kernel_pg4 is a scaled RationalQuadratic kernel, not a white-noise term
noise_kernel_pg4 = C(1.0, (1e-3, 1e3)) * RationalQuadratic(length_scale = 1.0, alpha = 1.0, alpha_bounds = (1e-10, 1e10))
rbf_pg4 = 1.0 * RBF(length_scale = 1.0, length_scale_bounds = (1e-2, 1e2))

#Sum of the two kernels
kernel_pg4 = (rbf_pg4 + noise_kernel_pg4)

#Initialize GaussianProcessRegressor and fit
#alpha is the value added to the diagonal of the kernel matrix during fitting (noise variance).
#random_state fixes the random restarts requested by n_restarts_optimizer so a re-run reproduces the same fit.
gaussian_process_pg4 = GaussianProcessRegressor(kernel = kernel_pg4, alpha = 0.1, n_restarts_optimizer = 10, normalize_y=True, random_state = 0)

gaussian_process_pg4.fit(Time_scraped, y_pg4)
gaussian_process_pg4.kernel_

0.89**2 * RBF(length_scale=4.7) + 0.356**2 * RationalQuadratic(alpha=1.35e+07, length_scale=2.61)

In [19]:
#PG4: predicted mean and its standard deviation at the requested time
mean_prediction_pg4, std_prediction_pg4 = gaussian_process_pg4.predict(
    X = final_request,
    return_std = True,
)

In [20]:
#Pg5 Model

#Kernels
#NOTE: despite its name, noise_kernel_pg5 is a scaled RationalQuadratic kernel, not a white-noise term
noise_kernel_pg5 = C(1.0, (1e-3, 1e3)) * RationalQuadratic(length_scale = 1.0, alpha = 1.0)
rbf_pg5 = 1.0 * RBF(length_scale = 1.0, length_scale_bounds = (1e-2, 1e2))

#Sum of the two kernels
kernel_pg5 = (rbf_pg5 + noise_kernel_pg5)

#Initialize GaussianProcessRegressor and fit
#alpha is the value added to the diagonal of the kernel matrix during fitting (noise variance).
#random_state fixes the random restarts requested by n_restarts_optimizer so a re-run reproduces the same fit.
gaussian_process_pg5 = GaussianProcessRegressor(kernel = kernel_pg5, alpha = 0.1, n_restarts_optimizer = 10, normalize_y=True, random_state = 0)

gaussian_process_pg5.fit(Time_scraped, y_pg5)
gaussian_process_pg5.kernel_

0.872**2 * RBF(length_scale=3.59) + 0.362**2 * RationalQuadratic(alpha=1.17e+03, length_scale=43.2)

In [21]:
#PG5: predicted mean and its standard deviation at the requested time
mean_prediction_pg5, std_prediction_pg5 = gaussian_process_pg5.predict(
    X = final_request,
    return_std = True,
)

In [22]:
#Pg6 Model

#Kernels
#NOTE: despite its name, noise_kernel_pg6 is a scaled RationalQuadratic kernel, not a white-noise term
#NOTE(review): the recorded run of this cell ended with an lbfgs ABNORMAL_TERMINATION_IN_LNSRCH
#warning and a fitted alpha of 1.96e+07; the very wide alpha_bounds are a possible cause - consider narrowing them
noise_kernel_pg6 = C(1.0, (1e-3, 1e3)) * RationalQuadratic(length_scale = 1.0, alpha = 1.0, alpha_bounds = (1e-10, 1e10))
rbf_pg6 = 1.0 * RBF(length_scale = 1.0, length_scale_bounds = (1e-2, 1e2))

#Sum of the two kernels
kernel_pg6 = (rbf_pg6 + noise_kernel_pg6)

#Initialize GaussianProcessRegressor and fit
#alpha is the value added to the diagonal of the kernel matrix during fitting (noise variance).
#random_state fixes the random restarts requested by n_restarts_optimizer so a re-run reproduces the same fit.
#NOTE(review): PG6 uses 5 optimizer restarts where the PG1-PG5 models use 10 - confirm this is intended
gaussian_process_pg6 = GaussianProcessRegressor(kernel = kernel_pg6, alpha = 0.1, n_restarts_optimizer = 5, normalize_y=True, random_state = 0)

gaussian_process_pg6.fit(Time_scraped, y_pg6)
gaussian_process_pg6.kernel_

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


0.876**2 * RBF(length_scale=4.75) + 0.353**2 * RationalQuadratic(alpha=1.96e+07, length_scale=2.39)

In [23]:
#PG6: predicted mean and its standard deviation at the requested time
mean_prediction_pg6, std_prediction_pg6 = gaussian_process_pg6.predict(
    X = final_request,
    return_std = True,
)

Collecting and Exporting Predictions as CSV

In [24]:
#Gather the six mean predictions; each one starts out as a row, so transpose to get one column per model
mean_predictions = [
    mean_prediction_pg1,
    mean_prediction_pg2,
    mean_prediction_pg3,
    mean_prediction_pg4,
    mean_prediction_pg5,
    mean_prediction_pg6,
]
predictions_collection = pd.DataFrame(mean_predictions).transpose()

#Name the columns
column_names = ['PG1_Values', 'PG2_Values', 'PG3_Values', 'PG4_Values', 'PG5_Values', 'PG6_Values']
predictions_collection.columns = column_names

#Write the result to csv (the dataframe index is written as well)
predictions_collection.to_csv(path_or_buf = 'predictions.csv')