# Introducing Lag Variables
In this notebook, time-lag variables are being added to the model to see if it can contribute towards performance improvement in the model.
<br><br>
We have already seen that **day_name** is a very important feature in explaining rentals prediction.

In [1]:
# Importing
import pandas as pd
selected_rentals = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 5 (TU Dresden)\Research Task - Spatial Modelling\Code\rentals_near200_st.csv")
selected_rentals.head()

Unnamed: 0,name,lat,lng,datetime,#_rentals,year,month,day,hour,ID,coordinates
0,1 Ave & E 110 St,40.792327,-73.9383,2023-12-31 08:00:00.000,0,2023,12,31,8,0,POINT (-73.9383 40.7923272)
1,1 Ave & E 110 St,40.792327,-73.9383,2024-03-28 10:00:00.000,1,2024,3,28,10,0,POINT (-73.9383 40.7923272)
2,1 Ave & E 110 St,40.792327,-73.9383,2024-03-28 12:00:00.000,1,2024,3,28,12,0,POINT (-73.9383 40.7923272)
3,1 Ave & E 110 St,40.792327,-73.9383,2024-03-28 14:00:00.000,1,2024,3,28,14,0,POINT (-73.9383 40.7923272)
4,1 Ave & E 110 St,40.792327,-73.9383,2024-03-28 16:00:00.000,3,2024,3,28,16,0,POINT (-73.9383 40.7923272)


In [2]:
# It is not saved as a GeoDataFrame
type(selected_rentals)

pandas.core.frame.DataFrame

In [3]:
# remove data for 2023, sort by station, remove [name, lat, lng]
selected_rentals = selected_rentals[["#_rentals", "datetime", "year", "month", "day", "hour", "ID", 'coordinates']]
selected_rentals = selected_rentals[~(selected_rentals["year"] == 2023)]
selected_rentals.sort_values(by=["ID","year","month","day","hour"], ignore_index=True, inplace=True)
selected_rentals[:15]

Unnamed: 0,#_rentals,datetime,year,month,day,hour,ID,coordinates
0,0,2024-01-01 08:00:00.000,2024,1,1,8,0,POINT (-73.9383 40.7923272)
1,0,2024-01-01 10:00:00.000,2024,1,1,10,0,POINT (-73.9383 40.7923272)
2,0,2024-01-01 12:00:00.000,2024,1,1,12,0,POINT (-73.9383 40.7923272)
3,0,2024-01-01 14:00:00.000,2024,1,1,14,0,POINT (-73.9383 40.7923272)
4,0,2024-01-01 16:00:00.000,2024,1,1,16,0,POINT (-73.9383 40.7923272)
5,0,2024-01-01 18:00:00.000,2024,1,1,18,0,POINT (-73.9383 40.7923272)
6,0,2024-01-01 20:00:00.000,2024,1,1,20,0,POINT (-73.9383 40.7923272)
7,3,2024-01-02 08:00:00.000,2024,1,2,8,0,POINT (-73.9383 40.7923272)
8,0,2024-01-02 10:00:00.000,2024,1,2,10,0,POINT (-73.9383 40.7923272)
9,6,2024-01-02 12:00:00.000,2024,1,2,12,0,POINT (-73.9383 40.7923272)


In [4]:
# Months included in the data
selected_rentals.month.unique()

array([1, 2, 3, 4], dtype=int64)

In [5]:
# Introducing lagged rentals
selected_rentals["#_rentals_lag_1"] = selected_rentals["#_rentals"].shift(1)
selected_rentals[:15]

Unnamed: 0,#_rentals,datetime,year,month,day,hour,ID,coordinates,#_rentals_lag_1
0,0,2024-01-01 08:00:00.000,2024,1,1,8,0,POINT (-73.9383 40.7923272),
1,0,2024-01-01 10:00:00.000,2024,1,1,10,0,POINT (-73.9383 40.7923272),0.0
2,0,2024-01-01 12:00:00.000,2024,1,1,12,0,POINT (-73.9383 40.7923272),0.0
3,0,2024-01-01 14:00:00.000,2024,1,1,14,0,POINT (-73.9383 40.7923272),0.0
4,0,2024-01-01 16:00:00.000,2024,1,1,16,0,POINT (-73.9383 40.7923272),0.0
5,0,2024-01-01 18:00:00.000,2024,1,1,18,0,POINT (-73.9383 40.7923272),0.0
6,0,2024-01-01 20:00:00.000,2024,1,1,20,0,POINT (-73.9383 40.7923272),0.0
7,3,2024-01-02 08:00:00.000,2024,1,2,8,0,POINT (-73.9383 40.7923272),0.0
8,0,2024-01-02 10:00:00.000,2024,1,2,10,0,POINT (-73.9383 40.7923272),3.0
9,6,2024-01-02 12:00:00.000,2024,1,2,12,0,POINT (-73.9383 40.7923272),0.0


## Raw Performance
Here a random forest model is run without any sort of feature engineering to accomodate spatial or temporal information. Only *dummy variables* are used for the **ID** column.

In [6]:
# condensing columns
selected_rentals_raw = selected_rentals[["#_rentals", "year", "month", "day", "hour", "ID"]]
selected_rentals_raw = pd.get_dummies(selected_rentals_raw, columns = ["ID"], drop_first=False)
selected_rentals_raw.head()

Unnamed: 0,#_rentals,year,month,day,hour,ID_0,ID_9,ID_54,ID_55,ID_62,...,ID_1879,ID_1881,ID_1884,ID_2010,ID_2017,ID_2062,ID_2063,ID_2064,ID_2065,ID_2074
0,0,2024,1,1,8,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,2024,1,1,10,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,2024,1,1,12,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,2024,1,1,14,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,2024,1,1,16,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# FEBRUARY - raw data

# training data
X_train = selected_rentals_raw.loc[(selected_rentals_raw["day"] <=25) & (selected_rentals_raw["month"] == 2),list(selected_rentals_raw.columns)[1:]]
y_train = selected_rentals_raw.loc[(selected_rentals_raw["day"] <=25) & (selected_rentals_raw["month"] == 2), "#_rentals"]

X_test = selected_rentals_raw.loc[(selected_rentals_raw["day"] >25) & (selected_rentals_raw["month"] == 2),list(selected_rentals_raw.columns)[1:]]
y_test = selected_rentals_raw.loc[(selected_rentals_raw["day"] >25) & (selected_rentals_raw["month"] == 2), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

3.0334722857142853 0.17192535918278629


# Random Forest
Performance is evauated for the **month of March only** [TEMPORAL INFORMATION ONLY]. The performance increase is compared with **day_name** to study the contribution of *lagged rentals* in r2 score, for the same data.

In [8]:
# condensing columns
selected_rentals = selected_rentals[["#_rentals", "year", "month", "day", "hour", "ID", "#_rentals_lag_1"]]
selected_rentals.columns

Index(['#_rentals', 'year', 'month', 'day', 'hour', 'ID', '#_rentals_lag_1'], dtype='object')

In [9]:
# creating dummies for ID
selected_rentals = pd.get_dummies(selected_rentals, columns = ["ID"], drop_first=False)
selected_rentals.head()

Unnamed: 0,#_rentals,year,month,day,hour,#_rentals_lag_1,ID_0,ID_9,ID_54,ID_55,...,ID_1879,ID_1881,ID_1884,ID_2010,ID_2017,ID_2062,ID_2063,ID_2064,ID_2065,ID_2074
0,0,2024,1,1,8,,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,2024,1,1,10,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,2024,1,1,12,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,2024,1,1,14,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,2024,1,1,16,0.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# MARCH

# training data
X_train = selected_rentals.loc[(selected_rentals["day"] <=25) & (selected_rentals["month"] == 3),list(selected_rentals.columns)[1:]]
y_train = selected_rentals.loc[(selected_rentals["day"] <=25) & (selected_rentals["month"] == 3), "#_rentals"]

X_test = selected_rentals.loc[(selected_rentals["day"] >25) & (selected_rentals["month"] == 3),list(selected_rentals.columns)[1:]]
y_test = selected_rentals.loc[(selected_rentals["day"] >25) & (selected_rentals["month"] == 3), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

3.996952714285715 0.4332944951673836


In [11]:
# FEBRUARY

# training data
X_train = selected_rentals.loc[(selected_rentals["day"] <=25) & (selected_rentals["month"] == 2),list(selected_rentals.columns)[1:]]
y_train = selected_rentals.loc[(selected_rentals["day"] <=25) & (selected_rentals["month"] == 2), "#_rentals"]

X_test = selected_rentals.loc[(selected_rentals["day"] >25) & (selected_rentals["month"] == 2),list(selected_rentals.columns)[1:]]
y_test = selected_rentals.loc[(selected_rentals["day"] >25) & (selected_rentals["month"] == 2), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

2.5491888214285714 0.30412464039286025


We can see the tremendous performance increase compared to the **raw** case.

# Introducing Spatial Dependence
We finally introduce *rentals* information from nearby stations to better inform rentals prediction for a given station. The *lag_rentals* are being dropped for clearer performance comparison!
<br><br>
Some sort of **kernel** might be required to process spatial information. The kernel is meant to provide higher weight (i.e. priority) to the closest station and lesser weight to far-away stations.
<br><br>
Another (simpler) approach is to take an average of rentals, k-nearby stations. In this scenario all *k* stations will likely have the same weights. This is less desirable since weighting should be dependent on distance, especially for larger values of 'k'.

### Constructing POINT geometries

In [12]:
# df isn't a GeoDataFrame yet!
selected_rentals = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 5 (TU Dresden)\Research Task - Spatial Modelling\Code\rentals_near200_st.csv")
selected_rentals = selected_rentals[["#_rentals", "datetime", "year", "month", "day", "hour", "ID", 'lat','lng']]
selected_rentals = selected_rentals[~(selected_rentals["year"] == 2023)]
selected_rentals.sort_values(by=["ID","year","month","day","hour"], ignore_index=True, inplace=True)
selected_rentals.head()

Unnamed: 0,#_rentals,datetime,year,month,day,hour,ID,lat,lng
0,0,2024-01-01 08:00:00.000,2024,1,1,8,0,40.792327,-73.9383
1,0,2024-01-01 10:00:00.000,2024,1,1,10,0,40.792327,-73.9383
2,0,2024-01-01 12:00:00.000,2024,1,1,12,0,40.792327,-73.9383
3,0,2024-01-01 14:00:00.000,2024,1,1,14,0,40.792327,-73.9383
4,0,2024-01-01 16:00:00.000,2024,1,1,16,0,40.792327,-73.9383


In [14]:
# Creating POINT objects
import geopandas as gpd
from shapely import Point
selected_rentals['coordinates'] = selected_rentals.apply(lambda row: Point(row['lng'], row['lat']), axis=1)
selected_rentals.head()                                    

Unnamed: 0,#_rentals,datetime,year,month,day,hour,ID,lat,lng,coordinates
0,0,2024-01-01 08:00:00.000,2024,1,1,8,0,40.792327,-73.9383,POINT (-73.9383 40.7923272)
1,0,2024-01-01 10:00:00.000,2024,1,1,10,0,40.792327,-73.9383,POINT (-73.9383 40.7923272)
2,0,2024-01-01 12:00:00.000,2024,1,1,12,0,40.792327,-73.9383,POINT (-73.9383 40.7923272)
3,0,2024-01-01 14:00:00.000,2024,1,1,14,0,40.792327,-73.9383,POINT (-73.9383 40.7923272)
4,0,2024-01-01 16:00:00.000,2024,1,1,16,0,40.792327,-73.9383,POINT (-73.9383 40.7923272)


In [15]:
# Reducing columns
selected_rentals = selected_rentals[["#_rentals", "month", "day", "hour", "ID", "coordinates"]]
selected_rentals.head()

Unnamed: 0,#_rentals,month,day,hour,ID,coordinates
0,0,1,1,8,0,POINT (-73.9383 40.7923272)
1,0,1,1,10,0,POINT (-73.9383 40.7923272)
2,0,1,1,12,0,POINT (-73.9383 40.7923272)
3,0,1,1,14,0,POINT (-73.9383 40.7923272)
4,0,1,1,16,0,POINT (-73.9383 40.7923272)


In [19]:
# converting to GeoDataFrame
import geopandas as gpd
selected_rentals_sp = gpd.GeoDataFrame(selected_rentals, geometry = "coordinates", crs = "EPSG:4326")
print(type(selected_rentals_sp))
selected_rentals_sp.head()

<class 'geopandas.geodataframe.GeoDataFrame'>


Unnamed: 0,#_rentals,month,day,hour,ID,coordinates
0,0,1,1,8,0,POINT (-73.9383 40.79233)
1,0,1,1,10,0,POINT (-73.9383 40.79233)
2,0,1,1,12,0,POINT (-73.9383 40.79233)
3,0,1,1,14,0,POINT (-73.9383 40.79233)
4,0,1,1,16,0,POINT (-73.9383 40.79233)


### Proximity to other stations
Here, will need to figure out how to calculate and store distances of all *other* stations, so that <u>nearby stations</u> can be identified.

In [29]:
# station list
station_dict = {'station_id' : selected_rentals_sp.ID.unique()}
station_df = pd.DataFrame(station_dict)
station_df.head()

Unnamed: 0,station_id
0,0
1,9
2,54
3,55
4,62


In [31]:
# Adding coordinates
station_df['coord'] = None

for i in station_df.station_id:
    station_df.loc[station_df['station_id'] == i, 'coord'] = selected_rentals_sp.loc[selected_rentals_sp["ID"] == i, 'coordinates'].unique()
    
station_df.head()

Unnamed: 0,station_id,coord
0,0,POINT (-73.9383 40.7923272)
1,9,POINT (-73.94594 40.7817212)
2,54,POINT (-73.92743647098541 40.772768286288304)
3,55,POINT (-73.9225403 40.7774552)
4,62,POINT (-73.9349 40.8006721)


In [97]:
# Adding dist information
station_df['nearest_st_id'] = None
station_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df['nearest_st_id'] = None


Unnamed: 0,station_id,coord,nearest_st_id
0,0,POINT (-73.9383 40.7923272),
1,9,POINT (-73.94594 40.7817212),
2,54,POINT (-73.92743647098541 40.772768286288304),
3,55,POINT (-73.9225403 40.7774552),
4,62,POINT (-73.9349 40.8006721),


In [117]:
# Identifying nearest distance for station ID == 0
proximity_index = station_df.loc[station_df['station_id'] == 0, 'coord'][0].distance(station_df.iloc[:,1]).sort_values().index # sorted ascendingly by distance
station_df.loc[station_df['station_id'] == 0, 'nearest_st_id'] = station_df.iloc[proximity_index[1],0]
station_df.head()

Unnamed: 0,station_id,coord,nearest_st_id
0,0,POINT (-73.9383 40.7923272),901.0
1,9,POINT (-73.94594 40.7817212),
2,54,POINT (-73.92743647098541 40.772768286288304),
3,55,POINT (-73.9225403 40.7774552),
4,62,POINT (-73.9349 40.8006721),


In [118]:
# Verifying
station_df.iloc[0,1].distance(station_df.iloc[79,1])

0.0030330307647602234

In [142]:
# Initiating a loop to calculate nearest station ID for all stations
for j in list(station_df.station_id):
    proximity_index = station_df.loc[station_df['station_id'] == j, 'coord'].iloc[0].distance(station_df.iloc[:,1]).sort_values().index # sorted ascendingly by distance
    station_df.loc[station_df['station_id'] == j, 'nearest_st_id'] = station_df.iloc[proximity_index[1],0]
    
station_df.head()

Unnamed: 0,station_id,coord,nearest_st_id
0,0,POINT (-73.9383 40.7923272),901
1,9,POINT (-73.94594 40.7817212),66
2,54,POINT (-73.92743647098541 40.772768286288304),134
3,55,POINT (-73.9225403 40.7774552),1715
4,62,POINT (-73.9349 40.8006721),63


In [147]:
# Initiating a loop to calculate 2nd and 3rd nearest station ID for all stations
station_df['2nd_nearest_st_id'] = None
station_df['3rd_nearest_st_id'] = None

for j in list(station_df.station_id):
    proximity_index = station_df.loc[station_df['station_id'] == j, 'coord'].iloc[0].distance(station_df.iloc[:,1]).sort_values().index # sorted ascendingly by distance
    station_df.loc[station_df['station_id'] == j, '2nd_nearest_st_id'] = station_df.iloc[proximity_index[2],0]
    station_df.loc[station_df['station_id'] == j, '3rd_nearest_st_id'] = station_df.iloc[proximity_index[3],0]

station_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_df['2nd_nearest_st_id'] = None


Unnamed: 0,station_id,coord,nearest_st_id,2nd_nearest_st_id,3rd_nearest_st_id
0,0,POINT (-73.9383 40.7923272),901,894,115
1,9,POINT (-73.94594 40.7817212),66,1090,1088
2,54,POINT (-73.92743647098541 40.772768286288304),134,1541,103
3,55,POINT (-73.9225403 40.7774552),1715,83,77
4,62,POINT (-73.9349 40.8006721),63,911,905


### Creating spatiotemporal features
Now we can add lagged rentals of existing stations as well as three nearby stations.

In [153]:
# At the moment we only have lagged rentals for the station in question
selected_rentals = selected_rentals[["#_rentals", "month", "day", "hour", "ID"]] # removing coordinates
selected_rentals["#_rentals_lag_1"] = selected_rentals["#_rentals"].shift(1)
selected_rentals[:12]

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1
0,0,1,1,8,0,
1,0,1,1,10,0,0.0
2,0,1,1,12,0,0.0
3,0,1,1,14,0,0.0
4,0,1,1,16,0,0.0
5,0,1,1,18,0,0.0
6,0,1,1,20,0,0.0
7,3,1,2,8,0,0.0
8,0,1,2,10,0,3.0
9,6,1,2,12,0,0.0


In [155]:
station_df.loc[station_df['station_id'] == 0, 'nearest_st_id'].iloc[0]

901

In [181]:
# Introducing lagged rentals for nearest stations
selected_rentals['lagged_rentals_nearest'] = None

for i in list(station_df.station_id):
    id_to_search = station_df.loc[station_df['station_id'] == i, 'nearest_st_id'].iloc[0] # id of the nearest station
    selected_rentals.loc[selected_rentals['ID'] == i,'lagged_rentals_nearest'] = list(selected_rentals.loc[selected_rentals['ID'] == id_to_search, '#_rentals_lag_1']) 

selected_rentals[:15]

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,lagged_rentals_nearest
0,0,1,1,8,0,,0.0
1,0,1,1,10,0,0.0,0.0
2,0,1,1,12,0,0.0,0.0
3,0,1,1,14,0,0.0,0.0
4,0,1,1,16,0,0.0,0.0
5,0,1,1,18,0,0.0,0.0
6,0,1,1,20,0,0.0,0.0
7,3,1,2,8,0,0.0,0.0
8,0,1,2,10,0,3.0,2.0
9,6,1,2,12,0,0.0,0.0


In [187]:
# Verifying for 2 Jan @ 10am -- nearest lagged value is correct
print(selected_rentals.loc[(selected_rentals.hour == 8) & (selected_rentals.ID == 901) & (selected_rentals.day == 2) & (selected_rentals.month == 1), '#_rentals'])

# Verifying for 2 Jan @ 16 hrs -- nearest lagged value is correct
print(selected_rentals.loc[(selected_rentals.hour == 14) & (selected_rentals.ID == 901) & (selected_rentals.day == 2) & (selected_rentals.month == 1), '#_rentals'])

66920    2
Name: #_rentals, dtype: int64
66923    1
Name: #_rentals, dtype: int64


In [199]:
# Introducing lagged rentals for other nearby stations
selected_rentals['lagged_rentals_second_nearest'] = None
selected_rentals['lagged_rentals_third_nearest'] = None

for i in list(station_df.station_id):
    id_to_search = station_df.loc[station_df['station_id'] == i, '2nd_nearest_st_id'].iloc[0] # id of the 2nd nearest station
    selected_rentals.loc[selected_rentals['ID'] == i,'lagged_rentals_second_nearest'] = list(selected_rentals.loc[selected_rentals['ID'] == id_to_search, '#_rentals_lag_1'])
    id_to_search = station_df.loc[station_df['station_id'] == i, '3rd_nearest_st_id'].iloc[0] # id of the 3rd nearest station
    selected_rentals.loc[selected_rentals['ID'] == i,'lagged_rentals_third_nearest'] = list(selected_rentals.loc[selected_rentals['ID'] == id_to_search, '#_rentals_lag_1'])

selected_rentals.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,lagged_rentals_nearest,lagged_rentals_second_nearest,lagged_rentals_third_nearest
0,0,1,1,8,0,,0.0,4.0,0.0
1,0,1,1,10,0,0.0,0.0,0.0,0.0
2,0,1,1,12,0,0.0,0.0,3.0,0.0
3,0,1,1,14,0,0.0,0.0,0.0,1.0
4,0,1,1,16,0,0.0,0.0,0.0,2.0


In [200]:
# exporting this data with engineered features
selected_rentals.to_csv('rentals_with_engineered_features.csv', index=False)

### Predictive Performance with newly constructed features
Again Random forest is used as an architecture. However, we first need to make use of dummy variables to accommodate for station categories. Clean file containing information with newly constructed features can be [imported] (<u>rentals_with_engineered_features.csv</u>)

In [204]:
# creating dummies for ID
selected_rentals = pd.get_dummies(selected_rentals, columns = ["ID"], drop_first=False)

# MARCH: evaluating performance for the same data range as before

# training data
X_train = selected_rentals.loc[(selected_rentals["day"] <=25) & (selected_rentals["month"] == 3),list(selected_rentals.columns)[1:]]
y_train = selected_rentals.loc[(selected_rentals["day"] <=25) & (selected_rentals["month"] == 3), "#_rentals"]

X_test = selected_rentals.loc[(selected_rentals["day"] >25) & (selected_rentals["month"] == 3),list(selected_rentals.columns)[1:]]
y_test = selected_rentals.loc[(selected_rentals["day"] >25) & (selected_rentals["month"] == 3), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

3.615483285714286 0.48738090562806247


In [205]:
# FEBRUARY: evaluating performance for the same data range as before

# training data
X_train = selected_rentals.loc[(selected_rentals["day"] <=25) & (selected_rentals["month"] == 2),list(selected_rentals.columns)[1:]]
y_train = selected_rentals.loc[(selected_rentals["day"] <=25) & (selected_rentals["month"] == 2), "#_rentals"]

X_test = selected_rentals.loc[(selected_rentals["day"] >25) & (selected_rentals["month"] == 2),list(selected_rentals.columns)[1:]]
y_test = selected_rentals.loc[(selected_rentals["day"] >25) & (selected_rentals["month"] == 2), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

2.3332081249999996 0.36308286409604074


There is clearly some contribution to prediction performance. **The real question is how many nearby stations to be considered to attain max prediction performance**?

In [206]:
selected_rentals.head()

Unnamed: 0,#_rentals,month,day,hour,#_rentals_lag_1,lagged_rentals_nearest,lagged_rentals_second_nearest,lagged_rentals_third_nearest,ID_0,ID_9,...,ID_1879,ID_1881,ID_1884,ID_2010,ID_2017,ID_2062,ID_2063,ID_2064,ID_2065,ID_2074
0,0,1,1,8,,0.0,4.0,0.0,True,False,...,False,False,False,False,False,False,False,False,False,False
1,0,1,1,10,0.0,0.0,0.0,0.0,True,False,...,False,False,False,False,False,False,False,False,False,False
2,0,1,1,12,0.0,0.0,3.0,0.0,True,False,...,False,False,False,False,False,False,False,False,False,False
3,0,1,1,14,0.0,0.0,0.0,1.0,True,False,...,False,False,False,False,False,False,False,False,False,False
4,0,1,1,16,0.0,0.0,0.0,2.0,True,False,...,False,False,False,False,False,False,False,False,False,False


# Weighted Average
Up till now we have not been able to provide weights to the *lagged rentals* of <u>nearby stations</u>. As of now it is unclear how the random forest learner is prioritising the lagged rentals of nearby stations. <br><br> 
What should ideally be done is that the existing station should receive the *highest weight* while weights of nearby stations should decrease in proportion to the distance from existing station in question.
<br><br>
A more refined way to assign weights is to make use of **kernels** (such as Gaussian) in scenarios where huge numbers of nearby stations are considered.

In [2]:
# Importing rentals with engineered features
import pandas as pd
selected_rentals_avg = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 5 (TU Dresden)\Research Task - Spatial Modelling\Code\rentals_with_engineered_features.csv")
selected_rentals_avg.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,lagged_rentals_nearest,lagged_rentals_second_nearest,lagged_rentals_third_nearest
0,0,1,1,8,0,,0.0,4.0,0.0
1,0,1,1,10,0,0.0,0.0,0.0,0.0
2,0,1,1,12,0,0.0,0.0,3.0,0.0
3,0,1,1,14,0,0.0,0.0,0.0,1.0
4,0,1,1,16,0,0.0,0.0,0.0,2.0


### Custom weight assignment
Weights are applied to *lagged rentals* from existing stations as well as the nearby stations.
Weights are assigned as follows:
- existing station (w0): 6
- nearest station (w1): 3
- 2nd nearest st (w2): 2
- 3rd nearest st (w3): 1

Weighted average = 1/(w0+w1+w2+w3)*{(w0*r0_lag)+(w1*r1_lag)+(w2*r2_lag)+(w3*r3_lag)}

In [3]:
# Creating a custom function
def w_avg(current_st_lg, nearest_st_lg, second_nearest_st_lg, third_nearest_st_lg):
    weighting = (6*current_st_lg)+(3*nearest_st_lg)+(2*second_nearest_st_lg)+(1*third_nearest_st_lg)
    return weighting/(6+3+2+1)

In [4]:
# Introducing the weighted average column
selected_rentals_avg["wavg_lag_rentals"] = w_avg(selected_rentals_avg['#_rentals_lag_1'], selected_rentals_avg['lagged_rentals_nearest'], selected_rentals_avg['lagged_rentals_second_nearest'], selected_rentals_avg['lagged_rentals_third_nearest'])
selected_rentals_avg.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,lagged_rentals_nearest,lagged_rentals_second_nearest,lagged_rentals_third_nearest,wavg_lag_rentals
0,0,1,1,8,0,,0.0,4.0,0.0,
1,0,1,1,10,0,0.0,0.0,0.0,0.0,0.0
2,0,1,1,12,0,0.0,0.0,3.0,0.0,0.5
3,0,1,1,14,0,0.0,0.0,0.0,1.0,0.083333
4,0,1,1,16,0,0.0,0.0,0.0,2.0,0.166667


In [5]:
# reducing columns -- only retaining w.avg of lagged rentals
selected_rentals_avg = selected_rentals_avg[['#_rentals','month','day','hour','ID', 'wavg_lag_rentals']]
selected_rentals_avg.head()

Unnamed: 0,#_rentals,month,day,hour,ID,wavg_lag_rentals
0,0,1,1,8,0,
1,0,1,1,10,0,0.0
2,0,1,1,12,0,0.5
3,0,1,1,14,0,0.083333
4,0,1,1,16,0,0.166667


### Evaluating change in prediction performance
The same dataset (for February and March) is used to evaluate prediction performance using random forest. Since weighting is done based on distance, the expectation is that the performance should increase!

In [7]:
# creating dummies for ID
selected_rentals_avg = pd.get_dummies(selected_rentals_avg, columns = ["ID"], drop_first=False)

# FEBRUARY: evaluating performance for the same data range as before
# training data
X_train = selected_rentals_avg.loc[(selected_rentals_avg["day"] <=25) & (selected_rentals_avg["month"] == 2),list(selected_rentals_avg.columns)[1:]]
y_train = selected_rentals_avg.loc[(selected_rentals_avg["day"] <=25) & (selected_rentals_avg["month"] == 2), "#_rentals"]

X_test = selected_rentals_avg.loc[(selected_rentals_avg["day"] >25) & (selected_rentals_avg["month"] == 2),list(selected_rentals_avg.columns)[1:]]
y_test = selected_rentals_avg.loc[(selected_rentals_avg["day"] >25) & (selected_rentals_avg["month"] == 2), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

2.6313821785714286 0.281687568851392


The result is not appearing good. Consider an **alternative** weighting strategy!

### Alternative weighting strategy
The lagged rentals for the existing station is retained as usual. Weighted average is only calculated for the nearby stations

In [8]:
# Importing
selected_rentals_avg = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 5 (TU Dresden)\Research Task - Spatial Modelling\Code\rentals_with_engineered_features.csv")

# Creating a custom function
def w_avg_new(nearest_st_lg, second_nearest_st_lg, third_nearest_st_lg):
    weighting = (3*nearest_st_lg)+(2*second_nearest_st_lg)+(1*third_nearest_st_lg)
    return weighting/(3+2+1)

# Introducing the new weighted average column
selected_rentals_avg["wavg_new_lag_rentals"] = w_avg_new(selected_rentals_avg['lagged_rentals_nearest'], selected_rentals_avg['lagged_rentals_second_nearest'], selected_rentals_avg['lagged_rentals_third_nearest'])
selected_rentals_avg.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,lagged_rentals_nearest,lagged_rentals_second_nearest,lagged_rentals_third_nearest,wavg_new_lag_rentals
0,0,1,1,8,0,,0.0,4.0,0.0,1.333333
1,0,1,1,10,0,0.0,0.0,0.0,0.0,0.0
2,0,1,1,12,0,0.0,0.0,3.0,0.0,1.0
3,0,1,1,14,0,0.0,0.0,0.0,1.0,0.166667
4,0,1,1,16,0,0.0,0.0,0.0,2.0,0.333333


In [9]:
# Retaining columns to use
selected_rentals_avg = selected_rentals_avg[['#_rentals','month','day','hour','ID','#_rentals_lag_1','wavg_new_lag_rentals']]
selected_rentals_avg.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,wavg_new_lag_rentals
0,0,1,1,8,0,,1.333333
1,0,1,1,10,0,0.0,0.0
2,0,1,1,12,0,0.0,1.0
3,0,1,1,14,0,0.0,0.166667
4,0,1,1,16,0,0.0,0.333333


In [10]:
# Evaluating performance for this weighting strategy

# creating dummies for ID
selected_rentals_avg = pd.get_dummies(selected_rentals_avg, columns = ["ID"], drop_first=False)

# FEBRUARY: evaluating performance for the same data range as before
# training data
X_train = selected_rentals_avg.loc[(selected_rentals_avg["day"] <=25) & (selected_rentals_avg["month"] == 2),list(selected_rentals_avg.columns)[1:]]
y_train = selected_rentals_avg.loc[(selected_rentals_avg["day"] <=25) & (selected_rentals_avg["month"] == 2), "#_rentals"]

X_test = selected_rentals_avg.loc[(selected_rentals_avg["day"] >25) & (selected_rentals_avg["month"] == 2),list(selected_rentals_avg.columns)[1:]]
y_test = selected_rentals_avg.loc[(selected_rentals_avg["day"] >25) & (selected_rentals_avg["month"] == 2), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

2.424286857142857 0.338220270572288


The performance is okay-ish, but is not better compared to the case where lagged rentals for nearby stations were considered explicitly. Maybe taking **absolute average** should increase performance (since distances may not be that far-away!)

### Absolute average 
The weighted average approach does not seem to work yet because of **weighting choice**. While decent performance can be observed, it is not as good compared to the case where weighted average is not considered. <br><br> This approach tries to evaluate performance for the absolute average case. Again, the lagged rentals for the <u>existing station</u> are **not** included in the average!

In [11]:
# Importing
selected_rentals_avg = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 5 (TU Dresden)\Research Task - Spatial Modelling\Code\rentals_with_engineered_features.csv")

# Absolute Average
selected_rentals_avg["avg_new_lag_rentals"] = (1/3)*(selected_rentals_avg['lagged_rentals_nearest'] + selected_rentals_avg['lagged_rentals_second_nearest'] + selected_rentals_avg['lagged_rentals_third_nearest'])
selected_rentals_avg.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,lagged_rentals_nearest,lagged_rentals_second_nearest,lagged_rentals_third_nearest,avg_new_lag_rentals
0,0,1,1,8,0,,0.0,4.0,0.0,1.333333
1,0,1,1,10,0,0.0,0.0,0.0,0.0,0.0
2,0,1,1,12,0,0.0,0.0,3.0,0.0,1.0
3,0,1,1,14,0,0.0,0.0,0.0,1.0,0.333333
4,0,1,1,16,0,0.0,0.0,0.0,2.0,0.666667


In [12]:
# reducing columns
selected_rentals_avg = selected_rentals_avg[['#_rentals','month','day','hour','ID','#_rentals_lag_1','avg_new_lag_rentals']]
selected_rentals_avg.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,avg_new_lag_rentals
0,0,1,1,8,0,,1.333333
1,0,1,1,10,0,0.0,0.0
2,0,1,1,12,0,0.0,1.0
3,0,1,1,14,0,0.0,0.333333
4,0,1,1,16,0,0.0,0.666667


In [13]:
# Evaluating performance for this weighting strategy

# creating dummies for ID
selected_rentals_avg = pd.get_dummies(selected_rentals_avg, columns = ["ID"], drop_first=False)

# FEBRUARY: evaluating performance for the same data range as before
# training data
X_train = selected_rentals_avg.loc[(selected_rentals_avg["day"] <=25) & (selected_rentals_avg["month"] == 2),list(selected_rentals_avg.columns)[1:]]
y_train = selected_rentals_avg.loc[(selected_rentals_avg["day"] <=25) & (selected_rentals_avg["month"] == 2), "#_rentals"]

X_test = selected_rentals_avg.loc[(selected_rentals_avg["day"] >25) & (selected_rentals_avg["month"] == 2),list(selected_rentals_avg.columns)[1:]]
y_test = selected_rentals_avg.loc[(selected_rentals_avg["day"] >25) & (selected_rentals_avg["month"] == 2), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

2.4282113035714286 0.33714897858058257


In [15]:
# MARCH: evaluating performance for the same data range as before

# training data
X_train = selected_rentals_avg.loc[(selected_rentals_avg["day"] <=25) & (selected_rentals_avg["month"] == 3),list(selected_rentals_avg.columns)[1:]]
y_train = selected_rentals_avg.loc[(selected_rentals_avg["day"] <=25) & (selected_rentals_avg["month"] == 3), "#_rentals"]

X_test = selected_rentals_avg.loc[(selected_rentals_avg["day"] >25) & (selected_rentals_avg["month"] == 3),list(selected_rentals_avg.columns)[1:]]
y_test = selected_rentals_avg.loc[(selected_rentals_avg["day"] >25) & (selected_rentals_avg["month"] == 3), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

3.6862694166666663 0.47734453718839087


### Conclusion: Weighted Average
Finding out the correct weighting strategy can be very tricky! For the strategies that have been tried in this section, no improvement in prediction performance was seen. So, lagged rentals for nearby stations should be provided to the learner explicitly (not using any method for averaging).

# Combining spatiotemporal features
The day of week (name of day), lagged rentals for existing and three nearby stations are used as features. The **ID** column is expressed in terms of a *dummy variable* since it's a categorical variable.

### Creating features
The previous notebook showed the promise for *day_name*. This feature is being combined with the already existing information of lagged rentals, for existing and nearby stations.

In [18]:
# Importing
selected_rentals_eng_features = pd.read_csv(r"C:\Users\singh\Desktop\TUD (All Semesters)\Courses - Semester 5 (TU Dresden)\Research Task - Spatial Modelling\Code\rentals_with_engineered_features.csv")
selected_rentals_eng_features.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,lagged_rentals_nearest,lagged_rentals_second_nearest,lagged_rentals_third_nearest
0,0,1,1,8,0,,0.0,4.0,0.0
1,0,1,1,10,0,0.0,0.0,0.0,0.0
2,0,1,1,12,0,0.0,0.0,3.0,0.0
3,0,1,1,14,0,0.0,0.0,0.0,1.0
4,0,1,1,16,0,0.0,0.0,0.0,2.0


In [26]:
# Adding Name of day -- datetime object must be created
import datetime as dt
x = dt.datetime(2024,1,20)
pd.to_datetime(x).day_name()

'Saturday'

In [34]:
# Adding Name of day -- datetime object must be created
import datetime as dt

dt_object = list()
for i in range(len(selected_rentals_eng_features)):
    dt_object.append(dt.datetime(2024, selected_rentals_eng_features.iloc[i,1], selected_rentals_eng_features.iloc[i,2]))

print(len(dt_object), len(selected_rentals_eng_features))                     

169400 169400


In [35]:
# Adding Name of day -- using day_name() method on dt object
selected_rentals_eng_features['day_name'] = pd.Series([x.day_name() for x in pd.to_datetime(dt_object)])
selected_rentals_eng_features.head()

Unnamed: 0,#_rentals,month,day,hour,ID,#_rentals_lag_1,lagged_rentals_nearest,lagged_rentals_second_nearest,lagged_rentals_third_nearest,day_name
0,0,1,1,8,0,,0.0,4.0,0.0,Monday
1,0,1,1,10,0,0.0,0.0,0.0,0.0,Monday
2,0,1,1,12,0,0.0,0.0,3.0,0.0,Monday
3,0,1,1,14,0,0.0,0.0,0.0,1.0,Monday
4,0,1,1,16,0,0.0,0.0,0.0,2.0,Monday


### Evaluating Performance
After creating dummy variables for both **ID** and **day_name**, the prediction performance is evaluated using the same data.

In [37]:
# Add dummies for day_name and station ID
selected_rentals_eng_features = pd.get_dummies(selected_rentals_eng_features, columns = ["ID", "day_name"], drop_first=False)

# FEBRUARY: evaluating performance for the same data range as before
# training data
X_train = selected_rentals_eng_features.loc[(selected_rentals_eng_features["day"] <=25) & (selected_rentals_eng_features["month"] == 2),list(selected_rentals_eng_features.columns)[1:]]
y_train = selected_rentals_eng_features.loc[(selected_rentals_eng_features["day"] <=25) & (selected_rentals_eng_features["month"] == 2), "#_rentals"]

X_test = selected_rentals_eng_features.loc[(selected_rentals_eng_features["day"] >25) & (selected_rentals_eng_features["month"] == 2),list(selected_rentals_eng_features.columns)[1:]]
y_test = selected_rentals_eng_features.loc[(selected_rentals_eng_features["day"] >25) & (selected_rentals_eng_features["month"] == 2), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

2.194286142857143 0.4010056666237961


In [38]:
# MARCH: evaluating performance for the same data range as before

# training data
X_train = selected_rentals_eng_features.loc[(selected_rentals_eng_features["day"] <=25) & (selected_rentals_eng_features["month"] == 3),list(selected_rentals_eng_features.columns)[1:]]
y_train = selected_rentals_eng_features.loc[(selected_rentals_eng_features["day"] <=25) & (selected_rentals_eng_features["month"] == 3), "#_rentals"]

X_test = selected_rentals_eng_features.loc[(selected_rentals_eng_features["day"] >25) & (selected_rentals_eng_features["month"] == 3),list(selected_rentals_eng_features.columns)[1:]]
y_test = selected_rentals_eng_features.loc[(selected_rentals_eng_features["day"] >25) & (selected_rentals_eng_features["month"] == 3), "#_rentals"]

# training the model
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=2) 
forest.fit(X_train, y_train)

# testing performance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
y_test_pred = forest.predict(X_test)
print(mean_squared_error(y_test, y_test_pred),r2_score(y_test, y_test_pred))

3.5069851428571432 0.5027642486937676
