In [31]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.optimize as op
plt.style.use("dark_background") # Config plots for dark mode, delete if on light mode
plt.rcParams['figure.dpi'] = 150 # Hi-res plots

In [57]:
station_data = pd.read_csv("../data/santander_locations.csv")
station_data.head() # Load the station data and inspect the first 5 rows
class StationIdError(IndexError):
    """Called when we try and read a non-existing station Id"""
    pass

def get_station_name(in_id):
    """Get station name from bike_data for a given id, catching any exceptions"""
    try:
        return station_data[station_data["Station.Id"] == in_id].StationName.iloc[0]
    except IndexError:
        StationIdError("No station matching input ID")

## Normalising time data

In [10]:
bike_data = pd.read_csv("../data/processed_df.csv", index_col=0)
bike_data.head() # Load the processed bike data and inspect the first 5 rows

# Find minimum start time
x = bike_data.min()["start_time"]
t_min = (x // 86400) * 86400

# Substract t_min from start_time and end_time
bike_data["start_time"] = (bike_data["start_time"] - t_min) / 60
bike_data["end_time"] = (bike_data["end_time"] - t_min) / 60

# Introduce random perturbations to make continuous
bike_data["start_time"] = bike_data["start_time"] + np.random.rand(*bike_data["start_time"].shape)
bike_data["end_time"] = bike_data["end_time"] + np.random.rand(*bike_data["end_time"].shape)

bike_data["duration"] = bike_data.end_time - bike_data.start_time

test_time = 12*7*24*60
test_bike_data = bike_data[bike_data.start_time <= test_time]

test_bike_data.head()

Unnamed: 0,start_id,end_id,start_time,duration,end_time,dist
1,103,37,2.952965,5.868038,8.821003,1.458333
2,39,539,2.257208,2.73691,4.994118,0.545517
3,785,785,2.19102,5.280868,7.471888,0.0
4,341,159,3.909475,29.382493,33.291968,1.092775
5,708,573,3.040017,18.038967,21.078984,3.60729


In [13]:
test_sorted_stations = []
for st_id in test_bike_data.start_id.sort_values().unique():
    test_sorted_stations.append(test_bike_data[test_bike_data.start_id==st_id])

test_sorted_stations[0].head()

Unnamed: 0,start_id,end_id,start_time,duration,end_time,dist
1512,1,71,425.460293,8.479955,433.940248,1.718483
3498,1,3,498.720431,20.128481,518.848913,1.964612
5961,1,330,584.17691,28.755732,612.932642,4.997948
5977,1,433,585.811182,2.645814,588.456996,0.673108
6447,1,803,607.283786,15.666662,622.950448,2.768124


In [49]:
rates_dict = {}
test_sorted_stations[0]
for station in test_sorted_stations:
    time_elapsed = station.start_time.to_numpy()[-1] - station.start_time.to_numpy()[0]
    n_events = test_sorted_stations[0].size
    rate = n_events / time_elapsed

    rates_dict[station.start_id.unique()[0]]= rate

rates_dict


{1: 0.08485045970130896,
 2: 0.0845963145186222,
 3: 0.08497428386202958,
 4: 0.08496562120288456,
 5: 0.08487419647534118,
 6: 0.08493789712471712,
 7: 0.08465153968948422,
 8: 0.084554422516871,
 9: 0.08491432013622488,
 10: 0.08484526933224502,
 11: 0.08449560933161665,
 12: 0.0850007206627124,
 13: 0.0846408180971788,
 14: 0.08475403007165593,
 15: 0.08493089661291074,
 16: 0.08478698836907497,
 17: 0.08462140572267919,
 18: 0.08464474868867289,
 19: 0.08488635991045496,
 20: 0.08490535686272298,
 22: 0.08464258796836961,
 23: 0.08483090448392813,
 24: 0.08454608463774277,
 25: 0.08455982077232006,
 26: 0.08481067657309628,
 27: 0.0848784936278255,
 28: 0.08473326440071893,
 29: 0.0845795808345821,
 30: 0.08476497655566854,
 31: 0.08466829275278352,
 32: 0.08478468775810506,
 33: 0.0845294190392275,
 34: 0.0847728148548288,
 36: 0.0848328747721283,
 37: 0.08470185992537961,
 38: 0.08475354521003409,
 39: 0.08449920862116173,
 40: 0.08482076271223359,
 41: 0.08488548341366732,
 42: 

In [62]:
rates = list(rates_dict.values())
cutoff = np.mean(rates) + 1.5*np.std(rates) # clearly nothing at minus 1.5sigma
max_rate = max(rates)
max_keys = [key for key, val in rates_dict.items() if val >= cutoff]
outliers = [get_station_name(i) for i in max_keys]
outliers

['Lansdowne Way Bus Garage, Stockwell',
 'George Row, Bermondsey',
 'Tower Wharf, Bermondsey']