# Boris Bikes

In [None]:
import datetime

import h3 # uber geo package
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import json
import pickle
import os
import folium

import requests as requests
from geopandas import GeoDataFrame, points_from_xy
import movingpandas as mpd
from shapely.geometry import Point
from folium.plugins import TimestampedGeoJson

import re
import contextily as cx
import community
import networkx as nx


from tqdm.auto import tqdm
import pyproj


In [None]:
os.environ['PROJ_LIB'] = pyproj.datadir.get_data_dir()

!echo $PROJ_LIB

In [None]:
tqdm.pandas()

TODO: this setting doesn't persist on the initial notebook run; the cell has to be run again separately and the charts redrawn :( - fix

In [None]:
matplotlib.rcParams['figure.figsize'] = (20, 10)

# Introduction

Context


# Data description and cleaning

Explain the steps for the data description.

## Load processed data 

load reference data for bike station locations

In [None]:
# TODO update
RAW = "../data/BorisBikes_journeys_cleaned_data.pickle"

LOCATION_REF = "../data/BorisBikes_stations_coordinates.json"

In [None]:
station_locations_df = pd.read_json(LOCATION_REF).T

station_locations_df.head()

main data, loaded with pandas from the cleaned pickle file (not dask)

In [None]:
# load raw data
df = pd.read_pickle(RAW)

about 100M rows

In [None]:
# num rows
len(df)

some cleaning of bad dates

In [None]:
EARLIEST_DATE = datetime.datetime(2010, 1, 1)

In [None]:
%%time
# Keep only journeys that start after EARLIEST_DATE.
df = df[df["start_date"] > EARLIEST_DATE]
# Keep journeys whose end date is after EARLIEST_DATE or is missing (NA).
df = df[(df["end_date"] > EARLIEST_DATE) | df["end_date"].isna()]

# Keep only journeys whose start is strictly before their end, i.e. drop rows
# where the end date is not after the start date.
# NOTE(review): a comparison against a missing end date evaluates to False, so
# the NA end dates allowed above are presumably dropped here - confirm intended.
df = df[df["start_date"] < df["end_date"]]

# Recompute the duration from the cleaned start and end timestamps.
df["duration"] = df["end_date"]  - df["start_date"] 

we've lost a few!

In [None]:
len(df)

In [None]:
max(df['end_date'])

# Statistics on bike usage

How many bikes we have in our dataset

In [None]:
df["bike_id"].nunique()

Look at statistics of bikes

In [None]:
bike_groups = df.groupby("bike_id")

# Bike with the most trips.
# Count one column that has no nulls directly, instead of counting every
# column of the grouped frame and then discarding all but one of them.
group_counts = bike_groups["filename"].count()
b_id = group_counts.idxmax()
n_trips = group_counts.loc[b_id]

print(f"""
bike with most trips: {b_id}
did {n_trips} trips
""")

In [None]:
# Bike with the largest total riding time (sum of all its trip durations).
group_sums = bike_groups["duration"].sum()
b_id = group_sums.idxmax()
d_sum = group_sums.loc[b_id]

# d_sum is a Timedelta (duration = end_date - start_date), so convert it
# explicitly: the printed number is then really expressed in seconds.
print(f"""
bike with longest sum duration of trips: {b_id}
total of {d_sum.total_seconds():.0f} seconds
""")

how long are trips? (excluding outliers)

In [None]:

# Use total_seconds(): `.dt.seconds` is only the seconds *component* of a
# timedelta (it wraps around every 24h), so multi-day trips would wrongly be
# treated as short trips and pass the outlier filter below.
duration_seconds = df["duration"].dt.total_seconds()

fig, ax = plt.subplots()
# Exclude outliers: only trips shorter than 10000 seconds are plotted.
duration_seconds[duration_seconds < 10000].hist(bins=50, ax=ax)
ax.set_xlabel('Trip duration (seconds)')
ax.set_ylabel('Frequency')
ax.set_title('Trip duration')
plt.show()


## Long-lived bikes

In [None]:
trips_per_bike = bike_groups["filename"].count()
bike_start = bike_groups["start_date"].first()
bike_end = bike_groups["end_date"].last()

bike_lifetime = bike_end - bike_start

how long is the lifetime of a bike? (in days)

In [None]:

fig, ax = plt.subplots()
ax = bike_lifetime.dt.days.hist(bins=50)
plt.xlabel('Days')
plt.ylabel('Bikes')
plt.title('Lifetime of a bike')
plt.show()




what is the average utilisation of a bike? (total ride duration / lifetime)

This is the mean of the per-bike utilisation, i.e. each bike's usage divided by its total lifetime.

In [None]:
# Per-bike utilisation: total time spent on trips divided by the bike lifetime.
duration_sums = bike_groups["duration"].sum()
bike_utilisation = duration_sums / bike_lifetime

# Only the last expression of a cell is displayed, so report both statistics
# in a single result instead of a bare `.mean()` that is silently discarded.
bike_utilisation.agg(["mean", "max"])

In [None]:

# Histogram of per-bike utilisation. The labels previously described the
# lifetime plot (copy-paste); they now describe the quantity actually plotted.
fig, ax = plt.subplots()
bike_utilisation.hist(bins=500, ax=ax)
ax.set_xlim([0, 0.15])
ax.set_xlabel('Utilisation (fraction of lifetime spent on trips)')
ax.set_ylabel('Bikes')
ax.set_title('Utilisation of a bike')
plt.show()




### per month

- how many bikes are "alive" by month?

- how many stations are "alive" by month?

- what is bike utilisation by month? 

our utilisation measure here will be slightly different to previous figure. Previously we looked at per bike utilisation and averaged this. Now, we're looking at sum of use over the entire fleet and dividing this by the max possible usage per month (24/7 riding).

In [None]:
# The first and last months may be incomplete; they are excluded here and
# used again as a filter in later cells.
incomplete_months = df["start_date"].iloc[[0, -1]].dt.to_period("M")


# Create a monthly index that covers ALL months in the period.
complete_monthly_index = pd.date_range(start=df["start_date"].iloc[0], end=df["end_date"].iloc[-1], freq="M").to_period("M")
# Remove incomplete months with a boolean mask. Index.delete() expects integer
# positions, so passing it a boolean array depends on how the installed NumPy
# interprets booleans; plain boolean indexing is unambiguous.
complete_monthly_index = complete_monthly_index[~complete_monthly_index.isin(incomplete_months)]

In [None]:
# TODO should stations count as allive for next month rather than current?
def calc_alive_per_month(starts: pd.Series, ends: pd.Series, incomplete_months: pd.Series, complete_monthly_index: pd.PeriodIndex):
    """Count how many items (bikes, stations, ...) are "alive" in each month.

    An item is alive from the month of its first appearance up to and
    including the month of its last appearance; it stops being counted from
    the following month onwards.

    Parameters
    ----------
    starts : pd.Series
        Datetime of first appearance, one value per item.
    ends : pd.Series
        Datetime of last appearance, one value per item.
    incomplete_months : pd.Series
        Monthly periods that must be excluded from the result.
    complete_monthly_index : pd.PeriodIndex
        Months to report on.

    Returns
    -------
    pd.Series
        Float counts indexed by the months of ``complete_monthly_index`` that
        are not in ``incomplete_months``.
    """
    # Name the counts explicitly: the name value_counts() gives its result
    # differs between pandas versions ("start_date" vs "count").
    starts_per_month = starts.dt.to_period("M").value_counts().rename("start_date")
    ends_per_month = ends.dt.to_period("M").value_counts().rename("end_date")

    # Accumulate over EVERY observed month, not only the reported ones.
    # Otherwise items that appeared before the first reported month (e.g. in
    # the excluded first month) would never enter the running total.
    all_months = complete_monthly_index.union(starts_per_month.index).union(ends_per_month.index)
    if len(all_months) == 0:
        return pd.Series(dtype=float, index=complete_monthly_index)
    full_index = pd.period_range(start=all_months.min(), end=all_months.max(), freq="M")

    counts_df = pd.DataFrame(
        {
            "start_date": starts_per_month.reindex(full_index, fill_value=0),
            "end_date": ends_per_month.reindex(full_index, fill_value=0),
        },
        index=full_index,
    )
    # ending items should only be counted at the start of next month, so shift
    counts_df["end_date"] = counts_df["end_date"].shift(fill_value=0)

    alive_per_month = counts_df["start_date"].cumsum() - counts_df["end_date"].cumsum()
    alive_per_month = alive_per_month.reindex(complete_monthly_index).astype(float)

    return alive_per_month[~alive_per_month.index.isin(incomplete_months)]

In [None]:
alive_bikes_per_month = calc_alive_per_month(starts=bike_start, ends=bike_end, incomplete_months=incomplete_months, complete_monthly_index=complete_monthly_index)

In [None]:
duration_sums_per_month = df[["duration"]].groupby(df["start_date"].dt.to_period("M"))["duration"].sum()

duration_sums_per_month = duration_sums_per_month.to_frame()
duration_sums_per_month["max_possible_duration"] = duration_sums_per_month.index.map(lambda x: x.end_time - x.start_time)

utilisation_per_month = duration_sums_per_month["duration"] / duration_sums_per_month["max_possible_duration"] / alive_bikes_per_month

# remove incomplelte months
utilisation_per_month = utilisation_per_month[~utilisation_per_month.index.isin(incomplete_months)]

In [None]:
station_groups = df.groupby("start_station_id")

# relies on time ordering of df via rental_id
station_start = station_groups["start_date"].first()
station_end = station_groups["end_date"].last()

In [None]:
alive_stations_per_month = calc_alive_per_month(starts=station_start, ends=station_end,
                                                incomplete_months=incomplete_months, complete_monthly_index=complete_monthly_index)

In [None]:
# forward fill gaps
stats_df = complete_monthly_index.to_frame(name="date")\
    .join(alive_bikes_per_month.rename("alive_bikes"))\
    .join(alive_stations_per_month.rename("alive_stations"))\
    .join(utilisation_per_month.rename("utilisation"))\
    .fillna(method="ffill") 

In [None]:
stats_df.head()

First month seems to be unusual, look from march 2012

In [None]:
stats_df[1:].plot.area(subplots=True)
plt.xlabel('Years')


# chains

A "chain" is a sequence of trips for a given bike, where the start location matches the previous end location.

Run for some subset only. Try long lived bikes

In [None]:
top_ten_lived_bike_ids = bike_lifetime.sort_values()[-10:].index.values

In [None]:
top_ten_bike_subset = df[df["bike_id"].isin(top_ten_lived_bike_ids)].copy()

In [None]:
def add_chains(bike_id: int, bike_group: pd.DataFrame, df: pd.DataFrame = None) -> pd.Series:
    """Label every trip of one bike with the id of the chain it belongs to.

    A chain is a run of consecutive trips in which each trip starts at the
    station where the previous one ended. A new chain begins whenever the
    start station differs from the previous end station (a missing previous
    end station always begins a new chain).

    Parameters
    ----------
    bike_id : int
        Identifier used as the prefix of the chain ids.
    bike_group : pd.DataFrame
        Trips of that bike, in order, with ``start_station_id`` and
        ``end_station_id`` columns.
    df : pd.DataFrame, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    pd.Series
        One ``"<bike_id>_<n>"`` label per row of ``bike_group``, on the same
        index, with ``n`` starting at 1. Each row gets exactly one label: the
        previous implementation sliced with ``.loc[start:end]``, which is
        inclusive at both ends, so every chain-starting row was emitted twice.
    """
    # fillna(-1) so that a missing previous end station never matches a start
    previous_end = bike_group["end_station_id"].shift().fillna(-1)
    starts_new_chain = bike_group["start_station_id"] != previous_end

    # Running count of chain starts: the first row is always a start, so the
    # first chain is numbered 1, matching the labels non-break rows had before.
    chain_number = starts_new_chain.cumsum()
    return f"{bike_id}_" + chain_number.astype(str)

In [None]:
# Build the chain labels bike by bike. The group yielded by the loop already
# holds every trip of that bike, so it is used directly instead of being
# replaced by a second, costly lookup of the same rows in `bike_groups`.
chains = list()
for k, g in tqdm(top_ten_bike_subset.groupby("bike_id")):
    chains.append(add_chains(bike_id=k, bike_group=g, df=df))

In [None]:
top_ten_bike_subset = top_ten_bike_subset.join(pd.concat(chains).rename("chain_id"))

# Looking at the behaviour of individual bikes


- Take a look at the movement of some of these long lived bikes during a period of time

- Clean dataset for places with missing stations

In [None]:
def remove_missing_stations(df, stations):
    """Keep only the trips whose start AND end stations have known locations.

    Parameters
    ----------
    df : pd.DataFrame
        Trips with ``start_station_id`` and ``end_station_id`` columns.
    stations : dict
        Station reference data keyed by the station id as a string.

    Returns
    -------
    pd.DataFrame
        The matching rows, with a boolean ``check_stations_ids`` column (kept
        for compatibility with the previous output). The input frame itself is
        no longer modified, and a missing station id now counts as "unknown
        station" instead of raising on the integer conversion.
    """
    known_ids = set(stations.keys())

    def has_location(station_id):
        # Ids are stored as floats in the data but as strings in the reference
        if pd.isna(station_id):
            return False
        return str(int(station_id)) in known_ids

    start_known = df["start_station_id"].map(has_location).astype(bool)
    end_known = df["end_station_id"].map(has_location).astype(bool)
    both_known = start_known & end_known

    return df.assign(check_stations_ids=both_known)[both_known]

In [None]:
# Station reference data (coordinates keyed by station id). Reuse the
# LOCATION_REF path defined with the other data paths rather than repeating
# the literal, so the file location only has to be changed in one place.
with open(LOCATION_REF) as f:
    stations = json.load(f)

# Trips of the longest-lived bikes, restricted to stations with a known location
data = remove_missing_stations(top_ten_bike_subset, stations)
data.head()

- Let's make this a bit more formal, by building objects that represent bikes and trips

- A trip is a journey between two stations, we can use https://www.cyclestreets.net/ api to build the most probable journey between two
    point given the duration of that journey. 

In [None]:
class Trip:
    """A single journey of one bike between two docking stations.

    Parameters
    ----------
    data : pd.DataFrame
        Trips indexed by trip id, with the station, bike, date and duration
        columns read in ``__init__``.
    bike_id
        Identifier of the bike; used in the name of the cached route file.
    trip_id
        Index label of the trip inside ``data``.
    station_data : dict
        Station reference data keyed by station id as a string, each entry
        holding ``"lat"`` and ``"lon"``.

    Attributes
    ----------
    init_station, end_station : dict
        ``name``, ``id``, ``latitude`` and ``longitude`` of each station.
    duration
        Trip duration exactly as stored in ``data``.
    circular : bool
        True when the trip starts and ends at the same station.
    route : dict
        Route attributes filled in by ``get_route`` (empty until then, and
        for circular trips).
    """

    # Journey plans requested from the routing API, in the order they are tried
    PLANS = ("balanced", "fastest", "quietest", "shortest")
    # Seconds to wait for one routing request before giving up
    REQUEST_TIMEOUT = 30

    def __init__(self, data, bike_id, trip_id, station_data):
        df = data[data.index == trip_id]

        self.init_station = self._station_info(df, "start", station_data)
        self.end_station = self._station_info(df, "end", station_data)
        self.bike = df.bike_id.values[0]
        self.duration = df.duration.values[0]
        self.date = {
            "start": df.start_date.values[0],
            "end": df.end_date.values[0],
        }
        self.circular = self.init_station == self.end_station
        self.route = {}
        self.bike_id = bike_id
        self.trip_id = trip_id

    @staticmethod
    def _station_info(trip_rows, prefix, station_data):
        """Describe the ``prefix`` ("start" or "end") station of the trip."""
        station_id = trip_rows[f"{prefix}_station_id"].values[0]
        # Reference data is keyed by the integer id written as a string
        location = station_data[str(int(station_id))]
        return {
            "name": trip_rows[f"{prefix}_station_name"].values[0],
            "id": station_id,
            "latitude": location["lat"],
            "longitude": location["lon"],
        }

    def _duration_seconds(self):
        """Return the trip duration as a float number of seconds."""
        if isinstance(self.duration, (np.timedelta64, datetime.timedelta)):
            return pd.Timedelta(self.duration).total_seconds()
        return float(self.duration)

    def _fetch_closest_route(self, key):
        """Query every plan and keep the one whose time best matches the trip."""
        # The API reports times in seconds, so compare in seconds. Subtracting
        # an int from a numpy timedelta64 would treat the int as nanoseconds
        # and make the comparison meaningless.
        target_seconds = self._duration_seconds()
        start, end = self.init_station, self.end_station

        best_route = {}
        smallest_gap = None
        for plan in self.PLANS:
            url = (
                "https://www.cyclestreets.net/api/journey.json?key="
                f"{key}"
                "&itinerarypoints="
                f"{start['longitude']},{start['latitude']}"
                "|"
                f"{end['longitude']},{end['latitude']}"
                f"&plan={plan}"
            )
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            attributes = response.json()["marker"][0]["@attributes"]
            gap = abs(int(attributes["time"]) - target_seconds)
            if smallest_gap is None or gap < smallest_gap:
                smallest_gap = gap
                best_route = attributes
        return best_route

    def get_route(self, key, route_path= 'routes/'):
        """Load the route of this trip from the cache, or fetch and cache it.

        The route is stored in ``self.route`` and cached as
        ``<route_path>/<bike_id>_<trip_id>.json``. Circular trips get an empty
        route and trigger no request.

        Parameters
        ----------
        key : str
            API key for https://www.cyclestreets.net/.
        route_path : str
            Folder holding the cached route files; created if missing.
        """
        os.makedirs(route_path, exist_ok=True)
        route_file_path = os.path.join(
            route_path, f"{self.bike_id}_{self.trip_id}.json"
        )

        if os.path.isfile(route_file_path):
            with open(route_file_path, "r") as fp:
                self.route = json.load(fp)
            return

        if self.circular:
            self.route = {}
        else:
            self.route = self._fetch_closest_route(key)

        with open(route_file_path, "w") as fp:
            json.dump(self.route, fp)



A bike is identified by its ID, and its story contains all the trips recorded in the data and the routes obtained from https://www.cyclestreets.net/. 

In [None]:
class Bike:
    """A bike and its "story": its trips grouped into chains, with routes.

    Attributes
    ----------
    id
        Identifier of the bike.
    bike_rides : pd.DataFrame or None
        Trips of this bike; set by ``get_story``.
    chains : dict
        Maps each chain id to the list of ``Trip`` objects of that chain.
    """

    def __init__(self, id):
        self.id = id
        # Filled in by get_story(); initialised here so the attributes exist
        self.bike_rides = None
        self.chains = {}

    def get_chains(self, stations):
        """Build the ``Trip`` objects of every chain in ``self.bike_rides``.

        Parameters
        ----------
        stations : dict
            Station reference data passed on to each ``Trip``.
        """
        chains = {}
        # Group once per distinct chain. Iterating over the chain id of every
        # row rebuilt the same chain as many times as it has trips.
        for chain_id, chain_rides in self.bike_rides.groupby("chain_id", sort=False):
            chains[chain_id] = [
                Trip(chain_rides, self.id, trip_id, stations)
                for trip_id in chain_rides.index
            ]
        self.chains = chains

    def get_story(self, dataset, stations,key):
        """Select this bike's trips from ``dataset`` and fetch all their routes.

        Parameters
        ----------
        dataset : pd.DataFrame
            Trips with ``bike_id`` and ``chain_id`` columns.
        stations : dict
            Station reference data passed on to each ``Trip``.
        key : str
            Routing API key passed to ``Trip.get_route``.
        """
        self.bike_rides = dataset[dataset["bike_id"] == self.id]
        self.get_chains(stations)

        for chain in self.chains.values():
            for trip in chain:
                trip.get_route(key)


## A day of the life of a bike

We can visualise these journeys of a given bike on a map using folium and MovingPandas.

In [None]:
def get_colours(steps):
    """Return at least ``steps`` hex colours cycling through the "mako" palette.

    The base sequence is the reversed palette followed by the palette itself;
    it is doubled until it holds at least ``steps`` entries.
    """
    forward = sns.color_palette("mako").as_hex()
    backward = sns.color_palette("mako").as_hex()
    backward.reverse()

    palette = backward + forward
    while len(palette) < steps:
        palette = palette + palette
    return palette


def _route_files_for_bike(bike_id, route_folder):
    """List the cached route files of ``bike_id``, ordered by trip id."""
    prefix = f"{bike_id}_"

    def sort_key(filename):
        # Files are named "<bike_id>_<trip_id>.json": order numerically by
        # trip id so that e.g. trip 9 comes before trip 10.
        stem = os.path.splitext(filename[len(prefix):])[0]
        if stem.isdigit():
            return (0, int(stem), filename)
        return (1, 0, filename)

    # startswith(), not a substring test: "893_" is also contained in
    # "1893_5.json", which belongs to a different bike.
    own_files = [
        name for name in os.listdir(route_folder) if name.startswith(prefix)
    ]
    return sorted(own_files, key=sort_key)


def get_trajectory(bike_id, route_folder = "routes/"):
    """Chain all cached routes of a bike into one synthetic trajectory.

    Every coordinate of every route becomes one point. Points are spaced one
    second apart starting from the current time, so the timestamps only encode
    the order of the points, not the real trip times. All points of a route
    share one colour.

    Parameters
    ----------
    bike_id
        Bike whose cached ``<bike_id>_<trip_id>.json`` files are read.
    route_folder : str
        Folder containing the cached route files.

    Returns
    -------
    The first trajectory of the generalised trajectory collection.

    Raises
    ------
    ValueError
        If no cached route with coordinates exists for this bike.
    """
    route_files = _route_files_for_bike(bike_id, route_folder)
    many_colours = get_colours(len(route_files))

    times = []
    geometry = []
    colours = []

    for file_number, filename in enumerate(route_files):
        with open(os.path.join(route_folder, filename)) as f:
            route = json.load(f)
        if len(route) == 0:
            # trips without a route are cached as an empty object
            continue

        points = route["coordinates"].split(" ")
        geometry += [
            Point([float(value) for value in point.split(",")])
            for point in points
        ]
        # continue right after the last point of the previous route
        time_now = times[-1] if times else datetime.datetime.now()
        times += [
            time_now + datetime.timedelta(seconds=step + 1)
            for step in range(len(points))
        ]
        colours += [many_colours[file_number]] * len(points)

    if not geometry:
        raise ValueError(
            f"No cached route with coordinates found for bike {bike_id} "
            f"in {route_folder!r}"
        )

    df = pd.DataFrame(
        {
            "t": times,
            "trajectory_id": [1] * len(geometry),
            "sequence": list(range(1, len(geometry) + 1)),
            "colour": colours,
        }
    )

    gdf = GeoDataFrame(df, crs="EPSG:4326", geometry=geometry)
    gdf = gdf.set_index("t")

    trajs = mpd.TrajectoryCollection(gdf, "trajectory_id")
    trajs = mpd.MinTimeDeltaGeneralizer(trajs).generalize(
        tolerance=datetime.timedelta(seconds=1)
    )
    traj = trajs.trajectories[0]
    return traj

def draw_map(traj):
    """Return a folium map of London with the trajectory as an animated layer."""
    segments = traj_to_timestamped_geojson(traj)

    # Base map centred on central London
    london_centre = [51.506949, -0.122876]
    base_map = folium.Map(
        location=london_centre, zoom_start=12, tiles="cartodbpositron"
    )

    feature_collection = {
        "type": "FeatureCollection",
        "features": segments,
    }
    animation_layer = TimestampedGeoJson(
        feature_collection,
        period="PT1S",
        add_last_point=False,
        transition_time=10,
    )
    animation_layer.add_to(base_map)
    return base_map

def traj_to_timestamped_geojson(trajectory):
    """Convert a trajectory into timestamped GeoJSON line segments.

    Each pair of consecutive points of ``trajectory.df`` becomes one
    ``LineString`` feature carrying the two timestamps and the colour of the
    segment's end point.
    """
    frame = trajectory.df.copy()
    frame["previous_geometry"] = frame["geometry"].shift()
    frame["time"] = frame.index
    frame["previous_time"] = frame["time"].shift()

    def _lon_lat(point):
        return [point.xy[0][0], point.xy[1][0]]

    def _segment(row):
        return {
            "type": "Feature",
            "geometry": {
                "type": "LineString",
                "coordinates": [
                    _lon_lat(row["previous_geometry"]),
                    _lon_lat(row["geometry"]),
                ],
            },
            "properties": {
                "times": [
                    row["previous_time"].isoformat(),
                    row["time"].isoformat(),
                ],
                "style": {
                    "color": row["colour"],
                    "weight": 5,
                },
            },
        }

    # The first row has no predecessor, so segments start from the second row
    return [_segment(row) for _, row in frame.iloc[1:].iterrows()]


In [None]:
# API key for cyclestreets.net, kept out of the notebook in a local text file.
# Close the file deterministically and strip surrounding whitespace so that a
# trailing newline does not end up inside the request URLs.
with open("cycle_street_key.txt", "r") as key_file:
    key = key_file.read().strip()

In [None]:
bike_id = 893
selected_data = data[(data['start_date']> '2020-03-23') & (data['start_date']< '2020-05-14')]
bike = Bike(id=bike_id)
bike.get_story(selected_data, stations,key)

In [None]:
traj = get_trajectory(bike_id)
map_trajectory = draw_map(traj)

map_trajectory

In [None]:
bike_id = 3278
selected_data =data[(data['start_date']> '2021-03-23') & (data['start_date']< '2021-05-14')]
bike = Bike(id=bike_id)
bike.get_story(selected_data, stations,key)

traj = get_trajectory(bike_id)
map_trajectory = draw_map(traj)

map_trajectory

# Bike mobility patterns

- Bikes seem to stay on the areas unless they get moved by car, which is not uncommon
- Characterise the mobility patterns using network analysis.

- Describe the network.

In [None]:
def create_network_from_data(df, trip_count_threshold = 1e-5):
    """Build a directed station graph weighted by the number of trips.

    Parameters
    ----------
    df : pd.DataFrame
        Trips with ``start_station_id``, ``end_station_id`` and ``bike_id``.
    trip_count_threshold : float
        Minimum share of all trips a station pair must carry to be kept.

    Returns
    -------
    A directed graph with one edge per retained (start, end) pair and a
    ``trip_count`` edge attribute.
    """
    station_pair = ["start_station_id", "end_station_id"]

    # Trips per (start, end) pair, counted on the bike_id column
    edges = (
        df[station_pair + ["bike_id"]]
        .groupby(station_pair)
        .count()
        .reset_index()
        .rename(columns={"bike_id": "trip_count"})
        .sort_values("trip_count")
    )

    # Drop pairs carrying less than the requested share of all trips
    minimum_trips = trip_count_threshold * edges["trip_count"].sum()
    frequent_edges = edges[edges["trip_count"] >= minimum_trips]

    return nx.from_pandas_edgelist(
        frequent_edges,
        source="start_station_id",
        target="end_station_id",
        edge_attr="trip_count",
        create_using=nx.DiGraph,
    )



describe community detection

In [None]:
def network_community_detection(graph, edge_weight, random_state=None):
    """Detect station communities on the undirected version of ``graph``.

    Each pair of opposite directed edges is merged into one undirected edge
    whose weight is the sum of both directions; the result is partitioned
    with ``community.best_partition``.

    Parameters
    ----------
    graph
        Directed graph whose edges carry the ``edge_weight`` attribute.
    edge_weight : str
        Name of the edge attribute used as weight.
    random_state : optional
        Forwarded to ``community.best_partition`` when given, to make the
        partition reproducible. Left out of the call when ``None``.

    Returns
    -------
    pd.DataFrame
        One row per node, with columns ``id`` and ``partition``.
    """
    graph_undirected = nx.Graph()
    # Sorted order (not a set) keeps the insertion order reproducible
    for edge in sorted(graph.edges):
        source, target = edge[0], edge[1]
        combined_weight = graph.edges[edge][edge_weight]
        reverse_edge = (target, source)
        # A self-loop is its own reverse: do not count its weight twice
        if source != target and reverse_edge in graph.edges:
            combined_weight += graph.edges[reverse_edge][edge_weight]
        # Store the weight under the attribute name the partitioning reads;
        # a hard-coded name would be ignored for any other edge_weight.
        graph_undirected.add_edge(source, target, **{edge_weight: combined_weight})

    partition_kwargs = {"weight": edge_weight}
    if random_state is not None:
        partition_kwargs["random_state"] = random_state
    partition = community.best_partition(graph_undirected, **partition_kwargs)

    df_partition = pd.DataFrame(
        {"id": list(partition.keys()), "partition": list(partition.values())}
    )

    return df_partition

Visualise the network, describe what we want to visualise.

In [None]:


STATION_NAMES_FILE = "../data/BorisBikes_station_names.pickle"
STATION_COORDS_FILE = LOCATION_REF

LABEL_STATIONS = [
     "Belgrove Street",
     "Waterloo Station 3",
     "Hyde Park Corner",
     "Aquatic Centre",
     "Bethnal Green Road",
     "Natural History Museum",
     "Kennington Oval",
     "Mudchute DLR",
]


# Station-name tables already read from disk, keyed by file path
_STATION_NAMES_CACHE = {}


def _load_station_names(path):
    """Read the pickled station-name table once per path and reuse it."""
    if path not in _STATION_NAMES_CACHE:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use it on station-name files you produced yourself.
        with open(path, "rb") as f:
            _STATION_NAMES_CACHE[path] = pickle.load(f)
    return _STATION_NAMES_CACHE[path]


def get_station_name(id):
    """Return a short display name for the station ``id``.

    The alphabetically first of the names recorded for the station is taken
    and cut at the first ``;``, ``,`` or ``:``. The name table is read from
    ``STATION_NAMES_FILE`` on first use and cached afterwards, instead of
    unpickling the file on every call (this function is called once per node).
    """
    station_allnames = _load_station_names(STATION_NAMES_FILE)

    name = sorted(station_allnames[id])[0]
    name = re.split(";|,|:", name)[0].strip()
    return name




def get_node_info(graph):
    """Collect id, position, size and name of every node of ``graph``.

    Returns
    -------
    pd.DataFrame
        Columns ``id``, ``pos`` (a ``(lon, lat)`` tuple), ``size`` (degree
        weighted by ``trip_count``) and ``name``.
    """
    with open(STATION_COORDS_FILE, "r") as handle:
        coordinates_by_station = json.load(handle)

    node_ids = graph.nodes()

    positions = []
    for node in node_ids:
        # Coordinates are keyed by the integer station id written as a string
        location = coordinates_by_station[str(int(node))]
        positions.append((location["lon"], location["lat"]))

    weighted_degrees = [
        entry[1] for entry in list(graph.degree(weight="trip_count"))
    ]
    names = [get_station_name(int(node)) for node in node_ids]

    return pd.DataFrame(
        {
            "id": list(node_ids),
            "pos": positions,
            "size": weighted_degrees,
            "name": names,
        }
    )






def _scale_range(values, min_scaled, max_scaled):
    values = np.array(values)
    if min_scaled is not None:
        max_value = np.max(values)
        min_value = np.min(values)
        mult_coeff = (max_scaled - min_scaled) / (max_value - min_value)
        add_coeff = (max_value * min_scaled - min_value * max_scaled) / (
            max_value - min_value
        )
        scaled = mult_coeff * values + add_coeff
    else:
        max_value = np.max(values)
        scaled = max_scaled * values / max_value
    return scaled


def _drop_stations_without_location(graph):
    with open(STATION_COORDS_FILE, "r") as f:
        station_latlon = json.load(f)
    nodes = tuple(graph.nodes)
    stations_with_location = tuple(map(int, station_latlon.keys()))
    for n in nodes:
        if n not in stations_with_location:
            print(f"Removing node {n} because of missing location data.")
            graph.remove_node(n)
    return None


def create_network_and_map(
    df,
    label_stations= LABEL_STATIONS,
    allow_self_loops=False,
    arrows=True,
):
    """Build the station network of ``df`` and draw it over a basemap.

    Nodes are stations, coloured by detected community and sized by their
    ``trip_count``-weighted degree; edges are drawn with width and opacity
    derived from their ``trip_count``.

    Parameters
    ----------
    df : pd.DataFrame
        Trips passed to ``create_network_from_data``.
    label_stations : list of str
        Station names that get a text label on the map.
    allow_self_loops : bool
        If False, self-loop edges are removed from the drawn graph (the
        community detection still runs on the graph that includes them).
    arrows : bool
        Passed to ``nx.draw_networkx_edges``.

    Returns
    -------
    tuple
        ``(fig, ax, nodes_info)`` where ``nodes_info`` is a GeoDataFrame with
        one row per station, including its ``partition``.
    """
    community_graph = create_network_from_data(df)
    # Stations without coordinates cannot be placed on the map
    _drop_stations_without_location(community_graph)
    nodes_info = get_node_info(community_graph)
    # Separate copy for drawing, so edge removal does not affect community_graph
    visualisation_graph = community_graph.copy()
    if not allow_self_loops:
        visualisation_graph.remove_edges_from(
            nx.selfloop_edges(community_graph)
        )
    community_df = network_community_detection(community_graph, "trip_count")
    nodes_info = nodes_info.merge(community_df, on="id")
    # Largest stations first; sizes and colours below follow this row order
    nodes_info = nodes_info.sort_values(by="size", ascending=False)
    del community_df

    # "pos" holds (lon, lat) tuples; split them into separate columns
    nodes_info['lon'] = [p[0] for p in nodes_info["pos"]]
    nodes_info['lat'] = [p[1] for p in nodes_info["pos"]]

    nodes_info = GeoDataFrame(nodes_info,geometry=points_from_xy(nodes_info.lon, nodes_info.lat),crs="EPSG:4326")
    
    # Only the stations listed in label_stations get a text label
    labels = {
        id: name
        for id, name in zip(nodes_info["id"], nodes_info["name"])
        if name in label_stations
    }

    fig, ax = plt.subplots(
        1, 1, figsize=(20, 15)
    )
    # Plot the stations first, then add the basemap underneath on the same axes
    nodes_info.plot(ax=ax,markersize=1)
    # NOTE(review): depends on the Stamen provider being available in the
    # installed contextily - confirm it still resolves.
    cx.add_basemap(ax,crs=nodes_info.crs,source=cx.providers.Stamen.TonerLite)

    # Node id -> (lon, lat) mapping in the form networkx drawing expects
    xynps = [np.array([p[0] for p in nodes_info["pos"]]),
        np.array([p[1] for p in nodes_info["pos"]])]
    pos = {
        k: (xynps[0][i], xynps[1][i]) for i, k in enumerate(nodes_info["id"])
    }

    MAX_NODE_SIZE = 300.0
    MIN_NODE_SIZE = 5.0
    
    # Node sizes are stretched linearly between MIN_NODE_SIZE and MAX_NODE_SIZE
    sizes = _scale_range(nodes_info["size"], MIN_NODE_SIZE, MAX_NODE_SIZE)
    weights = np.array(
        [
            visualisation_graph.edges[e]["trip_count"]
            for e in visualisation_graph.edges
        ]
    )
    
    MAX_EDGE_WIDTH = 3.0
    MIN_EDGE_WIDTH = None
    
    
    # With a None minimum the values are only scaled so the maximum is MAX_EDGE_WIDTH
    weights = _scale_range(weights, MIN_EDGE_WIDTH, MAX_EDGE_WIDTH)
    
    MAX_EDGE_ALPHA = 0.9
    MIN_EDGE_ALPHA = None

    # Opacity is derived from the already scaled widths
    edge_alpha = _scale_range(weights, MIN_EDGE_ALPHA, MAX_EDGE_ALPHA)

    
    # Plots
    nx.draw_networkx_nodes(
        visualisation_graph,
        pos=pos,
        nodelist=nodes_info["id"],
        node_color=nodes_info["partition"],
        alpha=1.0,
        node_size=sizes,
        cmap="tab10",
        ax=ax,
    )
    nx.draw_networkx_edges(
        visualisation_graph,
        pos=pos,
        edge_color="#222222",
        width=weights,
        alpha=edge_alpha,
        arrows=arrows,
        ax=ax,
    )
    nx.draw_networkx_labels(
        visualisation_graph,
        pos=pos,
        labels=labels,
        font_size=12,
        ax=ax,
    )
    return fig, ax, nodes_info


In [None]:
# Journeys dated before this are treated as invalid
HARD_START_DATE = datetime.datetime(year=2010, month=1, day=1)

# Year analysed in the following cells: [start_date, end_date)
start_date = datetime.datetime(year=2021, month=1, day=1)
end_date = datetime.datetime(year=2022, month=1, day=1)

df = df[
    (df["start_date"] > HARD_START_DATE) & (df["end_date"] > HARD_START_DATE)
]
# Half-open window: a trip starting exactly at midnight on 1 January belongs
# to the year that begins at that instant, so the lower bound is inclusive.
df_year = df[(df["start_date"] >= start_date) & (df["start_date"] < end_date)]



Take a look at the network on a typical weekday morning in 2021.

In [None]:
print("Plotting mornings")
# Weekday (Mon-Fri) trips starting between 07:00 and 10:59. Filter the
# one-year selection `df_year`, not the full `df`: this section describes a
# typical morning of the selected year.
df_year_mornings = df_year[
    df_year["start_date"].dt.hour.isin([7, 8, 9, 10])
    & df_year["start_date"].dt.weekday.isin((0, 1, 2, 3, 4))
]
fig, ax, nodes_info = create_network_and_map(df_year_mornings)
num_communities = len(nodes_info["partition"].unique())
print(f"Number of communities: {num_communities}")
plt.title("Weekday mornings (7-10)")
plt.show()



In [None]:
Take a look at the network on a typical weekday afternoon in 2021.

In [None]:
print("Plotting afternoons")
# Weekday (Mon-Fri) trips starting between 15:00 and 19:59. Filter the
# one-year selection `df_year`, not the full `df`: this section describes a
# typical afternoon of the selected year.
df_year_afternoons = df_year[
    df_year["start_date"].dt.hour.isin([15, 16, 17, 18, 19])
    & df_year["start_date"].dt.weekday.isin((0, 1, 2, 3, 4))
]
fig, ax, nodes_info = create_network_and_map(df_year_afternoons)
num_communities = len(nodes_info["partition"].unique())
print(f"Number of communities: {num_communities}")
plt.title("Weekday afternoons (15-19)")
plt.show()



In [None]:
Take a look at the network on a typical weekend in 2021.

In [None]:
print("Plotting weekends")
# Saturday and Sunday trips. Filter the one-year selection `df_year`, not the
# full `df`: this section describes a typical weekend of the selected year.
df_year_weekends = df_year[df_year["start_date"].dt.weekday.isin((5, 6))]
fig, ax, nodes_info = create_network_and_map(
    df_year_weekends,
    allow_self_loops=True,
)
num_communities = len(nodes_info["partition"].unique())
print(f"Number of communities: {num_communities}")
plt.title("Weekends")
plt.show()


In [None]:
Let's see how the patterns have changed over time.

In [None]:
# One network map per selected year
for year in (2013, 2017, 2020, 2021):
    print(f"Plotting {year}")
    start_date = datetime.datetime(year=year, month=1, day=1)
    end_date = datetime.datetime(year=year + 1, month=1, day=1)
    # Half-open window [start_date, end_date): trips starting exactly at
    # midnight on 1 January are part of the year
    df_year = df[
        (df["start_date"] >= start_date) & (df["start_date"] < end_date)
    ]
    fig, ax, nodes_info = create_network_and_map(
        df_year,
        allow_self_loops=False,
        arrows=False,
    )
    num_communities = len(nodes_info["partition"].unique())
    print(f"Number of communities: {num_communities}")
    plt.title(f"Year {year}")
    plt.show()
    # Release the figure once shown so figures do not pile up across the loop
    plt.close(fig)
