In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
df_a = pd.read_csv("availability.csv") #availability
df_w = pd.read_csv("historical_weather.csv") #weather

In [3]:
df_a.dtypes

NUMBER                    int64
last_update              object
open                      int64
bike_stands               int64
available_bikes           int64
available_bike_stands     int64
dtype: object

In [4]:
df_a.head()

Unnamed: 0,NUMBER,last_update,open,bike_stands,available_bikes,available_bike_stands
0,1,2023-03-21 22:53:18,1,31,3,28
1,1,2023-03-21 23:13:31,1,31,3,28
2,1,2023-03-22 16:40:19,1,31,16,15
3,1,2023-03-22 16:50:24,1,31,16,15
4,1,2023-03-22 17:21:38,1,31,12,19


In [5]:
df_a[['bike_stands', 'available_bikes', 'available_bike_stands']].corr()

Unnamed: 0,bike_stands,available_bikes,available_bike_stands
bike_stands,1.0,0.205633,0.517013
available_bikes,0.205633,1.0,-0.711744
available_bike_stands,0.517013,-0.711744,1.0


In [6]:
df_w.dtypes

date         object
time         object
weather      object
temp        float64
humidity    float64
speed       float64
degrees       int64
dtype: object

In [7]:
df_w.head()

Unnamed: 0,date,time,weather,temp,humidity,speed,degrees
0,2023-04-09,18:38:45,moderate rain,12.61,8.75,200.0,80
1,2023-04-09,18:55:18,broken clouds,12.4,81.0,8.75,200
2,2023-04-10,16:20:02,light rain,9.23,72.0,12.86,290
3,2023-04-10,16:25:01,light rain,9.16,72.0,12.86,290
4,2023-04-10,16:30:02,light rain,9.23,72.0,12.86,290


# Changing time to Hour

In [8]:
df_a['last_update'] = pd.to_datetime(df_a['last_update'])
df_a['timehours_availability'] = df_a['last_update'].dt.strftime('%H')
df_a['date'] = df_a['last_update'].dt.date
df_a['bID'] = df_a.reset_index().index + 1 #ID number for model
df_a

Unnamed: 0,NUMBER,last_update,open,bike_stands,available_bikes,available_bike_stands,timehours_availability,date,bID
0,1,2023-03-21 22:53:18,1,31,3,28,22,2023-03-21,1
1,1,2023-03-21 23:13:31,1,31,3,28,23,2023-03-21,2
2,1,2023-03-22 16:40:19,1,31,16,15,16,2023-03-22,3
3,1,2023-03-22 16:50:24,1,31,16,15,16,2023-03-22,4
4,1,2023-03-22 17:21:38,1,31,12,19,17,2023-03-22,5
...,...,...,...,...,...,...,...,...,...
364071,117,2023-04-11 08:35:41,1,40,2,38,08,2023-04-11,364072
364072,117,2023-04-11 08:41:48,1,40,3,37,08,2023-04-11,364073
364073,117,2023-04-11 08:45:46,1,40,3,37,08,2023-04-11,364074
364074,117,2023-04-11 08:51:12,1,40,5,35,08,2023-04-11,364075


In [9]:
df_w['time'] = pd.to_datetime(df_w['time'])
df_w['timehours_weather'] = df_w['time'].dt.strftime('%H')
df_w['wID'] = df_w.reset_index().index + 1
df_w

Unnamed: 0,date,time,weather,temp,humidity,speed,degrees,timehours_weather,wID
0,2023-04-09,2023-04-14 18:38:45,moderate rain,12.61,8.75,200.00,80,18,1
1,2023-04-09,2023-04-14 18:55:18,broken clouds,12.40,81.00,8.75,200,18,2
2,2023-04-10,2023-04-14 16:20:02,light rain,9.23,72.00,12.86,290,16,3
3,2023-04-10,2023-04-14 16:25:01,light rain,9.16,72.00,12.86,290,16,4
4,2023-04-10,2023-04-14 16:30:02,light rain,9.23,72.00,12.86,290,16,5
...,...,...,...,...,...,...,...,...,...
197,2023-04-11,2023-04-14 08:35:01,broken clouds,8.98,76.00,5.14,210,08,198
198,2023-04-11,2023-04-14 08:40:01,broken clouds,8.98,76.00,5.14,210,08,199
199,2023-04-11,2023-04-14 08:45:02,broken clouds,8.98,76.00,5.14,210,08,200
200,2023-04-11,2023-04-14 08:50:02,broken clouds,9.08,75.00,5.66,200,08,201


In [10]:
df_a.columns

Index(['NUMBER', 'last_update', 'open', 'bike_stands', 'available_bikes',
       'available_bike_stands', 'timehours_availability', 'date', 'bID'],
      dtype='object')

In [11]:
df_w['date'] = pd.to_datetime(df_w['date'])
df_a['date'] = pd.to_datetime(df_a['date'])
print(df_w.dtypes)
print(df_a.dtypes)

date                 datetime64[ns]
time                 datetime64[ns]
weather                      object
temp                        float64
humidity                    float64
speed                       float64
degrees                       int64
timehours_weather            object
wID                           int64
dtype: object
NUMBER                             int64
last_update               datetime64[ns]
open                               int64
bike_stands                        int64
available_bikes                    int64
available_bike_stands              int64
timehours_availability            object
date                      datetime64[ns]
bID                                int64
dtype: object


In [12]:
df = pd.merge(df_a, df_w, on='date')

In [13]:
# One-hot encode the 'description' column
df = pd.get_dummies(df, columns= ['weather'])
df.head()

Unnamed: 0,NUMBER,last_update,open,bike_stands,available_bikes,available_bike_stands,timehours_availability,date,bID,time,...,speed,degrees,timehours_weather,wID,weather_broken clouds,weather_few clouds,weather_heavy intensity rain,weather_light rain,weather_moderate rain,weather_scattered clouds
0,1,2023-04-09 00:07:17,1,31,6,25,0,2023-04-09,2919,2023-04-14 18:38:45,...,200.0,80,18,1,0,0,0,0,1,0
1,1,2023-04-09 00:07:17,1,31,6,25,0,2023-04-09,2919,2023-04-14 18:55:18,...,8.75,200,18,2,1,0,0,0,0,0
2,1,2023-04-09 00:17:21,1,31,6,25,0,2023-04-09,2920,2023-04-14 18:38:45,...,200.0,80,18,1,0,0,0,0,1,0
3,1,2023-04-09 00:17:21,1,31,6,25,0,2023-04-09,2920,2023-04-14 18:55:18,...,8.75,200,18,2,1,0,0,0,0,0
4,1,2023-04-09 00:27:25,1,31,6,25,0,2023-04-09,2921,2023-04-14 18:38:45,...,200.0,80,18,1,0,0,0,0,1,0


In [14]:
df = df.drop(columns=['last_update'])
df.columns

Index(['NUMBER', 'open', 'bike_stands', 'available_bikes',
       'available_bike_stands', 'timehours_availability', 'date', 'bID',
       'time', 'temp', 'humidity', 'speed', 'degrees', 'timehours_weather',
       'wID', 'weather_broken clouds', 'weather_few clouds',
       'weather_heavy intensity rain', 'weather_light rain',
       'weather_moderate rain', 'weather_scattered clouds'],
      dtype='object')

In [15]:
df.columns
features = df[['NUMBER', 'timehours_weather', 'available_bikes', 'available_bike_stands', 'temp', 'humidity', 'speed', 'degrees', 'wID']]
features.head(1)

Unnamed: 0,NUMBER,timehours_weather,available_bikes,available_bike_stands,temp,humidity,speed,degrees,wID
0,1,18,6,25,12.61,8.75,200.0,80,1


In [16]:
# example of making a single class prediction
# X['timehours_weather'] = X['timehours_weather'].astype(int)
# X['date'] = X['date'].astype(int)
# X['time'] = X['time'].astype(int)
# X['time'] = X['time'].astype(int)

# X.dtypes

In [28]:
stationnumbers = list(features['NUMBER'].unique())
stationnumbers.sort()

for station_id in stationnumbers:
    station_data = features[features['NUMBER'] == station_id]
    y = station_data[['available_bikes', 'available_bike_stands']]
    X = station_data.drop(columns=['available_bikes', 'available_bike_stands'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # fit final model, train a machine learning model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Serialize the trained model into a file called model.pkl
    with open(f'model_{station_id}.pkl', 'wb') as handle:
        pickle.dump(model, handle, pickle.HIGHEST_PROTOCOL)

    # Deserialize the model.pkl file into an object called model
    with open(f'model_{station_id}.pkl', 'rb') as handle:
        model = pickle.load(handle)

    # Evaluate the model
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # print(f"Mean Squared Error for station {station_id}:", mse)

Unnamed: 0,NUMBER,timehours_weather,temp,humidity,speed,degrees,wID
1735284,117,17,7.82,81.0,10.29,270,17


In [31]:
# one row of data-frame returns one row of prediction
X_test_one_row = X_test.head(1)
X_test_one_row # 1-row dataframe
y_pred_from_one_row = model.predict(X_test_one_row)
y_pred_from_one_row

array([[ 0.04804794, 39.95195206]])