In [1]:
import pandas as pd
from pathlib import Path
import os
import plotly.graph_objects as go
import json


# Input Data

In [2]:
# Data folders are resolved relative to the notebook's working directory:
# the "data" folder lives two levels above it.
DATA_DIR = Path.cwd().parents[1].joinpath("data")
FARE_PRICES_DIR = DATA_DIR.joinpath("fares")
STOCK_PRICES_DIR = DATA_DIR.joinpath("stocks")
STOCK_PRICES_DIR

PosixPath('/Users/andras.vekassy/localFolder/visualisations/tfl-price-visualisations/data/stocks')

In [3]:
# Raw TfL fare history; the frame is displayed in full as the cell output.
tflFares = pd.read_csv(FARE_PRICES_DIR.joinpath("raw", "TfLHistoricalFares2000to2025.csv"))
tflFares

Unnamed: 0,year,singleZ1toZ4Cash,singleZ1toZ4OysterPeak,busCash,singleBusOyster,capBusTram,travelcardZ1toZ4,capZ1toZ4PAYG,travelcard7DayZ1toZ4,weeklyBusAndTramPass
0,2000,2.6,,1.0,,,,,26.8,11.5
1,2001,2.7,,1.0,,,,,27.6,9.5
2,2002,2.7,,1.0,,,6.8,,28.1,8.5
3,2003,2.8,,1.0,,,7.0,,28.4,8.5
4,2004,3.0,2.8,1.0,0.7,,7.3,,29.2,9.5
5,2005,2.8,2.5,1.2,1.0,3.0,8.0,,30.4,11.0
6,2006,3.0,2.5,1.5,1.0,3.0,8.4,,31.6,13.5
7,2007,4.0,2.5,2.0,1.0,3.0,9.0,,33.2,14.0
8,2008,4.0,2.5,2.0,0.9,3.0,9.4,,34.6,13.0
9,2009,4.0,2.8,2.0,1.0,3.3,10.0,,36.8,13.8


# Charts

In [4]:
# UTILS

# Default scatter mode used by create_trace.
LINE_AND_MARKER = "lines+markers"

def create_trace(x, y, label: str, mode=LINE_AND_MARKER, line_shape=None, line_dash=None) -> go.Scatter:
    """Build a ``go.Scatter`` trace for one series.

    Args:
        x: Values for the x-axis.
        y: Values for the y-axis.
        label: Passed to plotly as the trace ``name``.
        mode: Scatter mode; defaults to ``LINE_AND_MARKER``.
        line_shape: Forwarded unchanged as ``line_shape`` (``None`` by default).
        line_dash: Forwarded unchanged as ``line_dash`` (``None`` by default).

    Returns:
        The configured ``go.Scatter`` object.
    """
    scatter_kwargs = dict(
        x=x,
        y=y,
        mode=mode,
        name=label,
        line_shape=line_shape,
        line_dash=line_dash,
    )
    return go.Scatter(**scatter_kwargs)

def create_layout(title: str, type=None):
    """Return the layout settings shared by the figures in this notebook.

    Args:
        title: Text stored under the ``title`` key.
        type: Stored as the x-axis ``type`` entry; ``None`` is passed through as is.

    Returns:
        A plain ``dict`` with a fixed 960x500 size, a white plot background,
        a visible legend and the x-axis styling.
    """
    x_axis = {
        "showgrid": False,
        "linecolor": "#7f7f7f",
        "linewidth": 2,
        "ticks": "outside",
        "type": type,
    }
    return {
        "title": title,
        "width": 960,
        "height": 500,
        "xaxis": x_axis,
        "showlegend": True,
        "plot_bgcolor": "white",
    }

def normalise_single_series_to_100(df):
    """Min-max scale the values so the minimum maps to 0 and the maximum to 100."""
    lowest = df.min()
    value_range = df.max() - lowest
    return (df - lowest) / value_range * 100

def normalise_series_to_first_value(df):
    """Divide every value by the first non-missing value.

    The first non-missing entry therefore becomes 1 and all other entries are
    expressed relative to it; leading missing values stay missing.

    Args:
        df: A pandas Series (the name is historical).

    Returns:
        A new Series with the same index. If the input contains no
        non-missing value at all (empty or all-NaN), a copy of the input is
        returned instead of raising ``IndexError``.
    """
    valid_values = df.dropna()
    if valid_values.empty:
        # Nothing to anchor on: keep the (all-missing) values rather than crash.
        return df.copy()
    return df / valid_values.iloc[0]

In [5]:
def normalise_df_to_first_value_per_col(df):
    """Return a copy of ``df`` with every column except "year" normalised.

    Each non-"year" column is passed through
    ``normalise_series_to_first_value``; the input frame is not modified.
    """
    value_columns = [name for name in df.columns if name != "year"]
    result = df.copy()
    for name in value_columns:
        result[name] = normalise_series_to_first_value(result[name])
    return result

## Tube Fares

In [6]:
# 2000 - 2.6, 2012 - 5.3 for paper and 3.6 for contactless, 2025 - 7 for paper and 4.6 for contactless -> +38.46%, +76.92% using contactless
# Average Weekly Earnings (AWE) comparison: 2000 - £305, 2012 - £457, 2025 - £711 -> 49.83%, 133.11%
# https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/earningsandworkinghours/timeseries/kab9/emp
# Median weekly earnings for full-time employees: 2012 - £506, 2025 - £728
# https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/earningsandworkinghours/bulletins/annualsurveyofhoursandearnings/2024

In [7]:
fig = go.Figure()
# TRACES: percentage change of each fare relative to its first value from row 10 onwards
for fare_column, trace_label in (
    ("singleZ1toZ4OysterPeak", "Contactless ticket"),
    ("singleZ1toZ4Cash", "Paper ticket"),
):
    relative_fare = normalise_series_to_first_value(tflFares[fare_column][10:])
    fig.add_trace(create_trace(x=tflFares['year'][10:], y=relative_fare * 100 - 100, label=trace_label))
# fig.add_trace(create_trace(x=tflFares['year'][10:], y=normalise_single_series_to_100(tflFares["singleZ1toZ4OysterPeak"][10:]), label="Contactless ticket"))
# fig.add_trace(create_trace(x=tflFares['year'][10:], y=normalise_single_series_to_100(tflFares["singleZ1toZ4Cash"][10:]), label="Paper ticket"))
# LAYOUT
layout = create_layout(title="Normalised TfL Tube Fares <br><sup>Zone 1-4 travel during peak hours</sup>")
fig.update_layout(layout)

fig.show()

In [8]:
# Get pay-as-you-go caps as well
# How many journeys are required to reach the cap? -> could put it as barchart underneath
# tflFares['capZ1to4PAYG']/tflFares['singleZ1to4OysterPeak']

# fig = go.Figure()
# # TRACES
# fig.add_trace(create_trace(x=tflFares['year'], y=tflFares['capZ1to4PAYG'], label="PAYG Cap"))
# # fig.add_trace(create_trace(x=tflFares['year'], y=tflFares['travelcard7DayZ1to4'], label="travelcard7DayZ1to4"))
# # LAYOUT
# layout = create_layout(title="TfL Tube Fares <br><sup>Zone 1-4 travel during peak hours</sup>")
# fig.update_layout(layout)

# fig.update_xaxes(range=[2012,2025])

# fig.show()

## FTSE100

In [9]:
# Month-end values are reported from 2000 Jan to 2025 March + mid April.
# The file is opened in a `with` block so the handle is closed after parsing.
with open(STOCK_PRICES_DIR / "ftse100from2000to2025.json") as ftse_file:
    data = json.load(ftse_file)
ftse100 = pd.DataFrame(data['data'])
ftse100

Unnamed: 0,_DATE_END,LOW_1,CLOSE_PRC,HIGH_1,OPEN_PRC
0,2000-01-31,6246.8,6268.54,6930.2,6930.2
1,2000-02-29,5972.7,6232.56,6450.9,6268.5
2,2000-03-31,6232.6,6540.22,6770.4,6232.6
3,2000-04-30,5915.2,6327.43,6586.3,6540.2
4,2000-05-31,5991.9,6359.35,6419.9,6327.4
...,...,...,...,...,...
299,2024-12-31,8002.34,8173.02,8388.37,8287.3
300,2025-01-31,8160.6,8673.96,8692.84,8173.02
301,2025-02-28,8520.2,8809.74,8820.93,8673.96
302,2025-03-31,8481.11,8582.81,8908.82,8809.74


In [10]:
# Parse the month-end dates and derive the calendar year used for grouping.
ftse100['_DATE_END'] = pd.to_datetime(ftse100['_DATE_END'])
ftse100['year'] = ftse100['_DATE_END'].dt.year
# ftse100['year'] = ftse100['_DATE_END'].apply(lambda x: int(x.split('-')[0]))

# NOTE: max()/min() are taken per column independently, so a yearly row mixes
# values from different months (including the _DATE_END column).
ftse_by_year = ftse100.groupby("year", as_index=False)
ftse_yearly_max_prices = ftse_by_year.max().sort_values("year")
ftse_yearly_min_prices = ftse_by_year.min().sort_values("year")

ftse_yearly_min_prices.head()

Unnamed: 0,year,_DATE_END,LOW_1,CLOSE_PRC,HIGH_1,OPEN_PRC
0,2000,2000-01-31,5915.2,6142.19,6419.9,6142.2
1,2001,2001-01-31,4219.8,4903.39,5279.8,4903.4
2,2002,2002-01-31,3609.9,3721.75,4197.5,3721.8
3,2003,2003-01-31,3277.5,3567.41,3747.0,3567.4
4,2004,2004-01-31,4283.0,4385.67,4487.9,4385.7


In [11]:
# fig = go.Figure()
# # TRACES
# fig.add_trace(create_trace(x=ftse_yearly_max_prices['year'][10:], y=ftse_yearly_max_prices['HIGH_1'].astype(float)[10:], label="Highest annual price"))
# fig.add_trace(create_trace(x=ftse_yearly_min_prices['year'][10:], y=ftse_yearly_min_prices['LOW_1'].astype(float)[10:], label="Lowest annual price"))
# # LAYOUT
# layout = create_layout(title="FTSE100<br><sup>UK's stock market's index innit</sup>", type="linear")
# fig.update_layout(layout)

# fig.show()

In [12]:
# fig = go.Figure()
# # TRACES
# fig.add_trace(create_trace(x=ftse_yearly_max_prices['year'][10:], y=normalise_single_series_to_100(ftse_yearly_max_prices['LOW_1'].astype(float)[10:]), label="Lowest annual price"))
# # LAYOUT
# layout = create_layout(title="Normalised FTSE100<br><sup>UK's stock market's index innit</sup>", type="linear")
# fig.update_layout(layout)
# # fig.update_xaxes(range=[2010,2025])
# fig.show()

## Combined Charts

In [13]:
fig = go.Figure()

# TODO - identify multiplier dynamically
# Both series are expressed relative to their first value from row 10 onwards.
fares_normalised = normalise_series_to_first_value(tflFares["singleZ1toZ4OysterPeak"][10:])
stocks_normalised = normalise_series_to_first_value(ftse_yearly_max_prices['HIGH_1'].astype(float)[10:])

# TRACES: FTSE100 first, then the fare, both rescaled with the same hard-coded factor
for trace_years, trace_values, trace_label in (
    (ftse_yearly_max_prices['year'][10:], stocks_normalised * 200 - 200, "FTSE100 highs"),
    (tflFares['year'][10:], fares_normalised * 200 - 200, "Contactless ticket"),
):
    fig.add_trace(create_trace(x=trace_years, y=trace_values, label=trace_label))
# fig.add_trace(create_trace(x=tflFares['year'][10:], y=normalise_series_to_first_value(tflFares["singleZ1toZ4Cash"][10:]) * 200 - 200, label="Paper ticket"))
# fig.add_trace(create_trace(x=ftse_yearly_min_prices['year'][10:], y=normalise_series_to_first_value(ftse_yearly_min_prices['LOW_1'].astype(float)[10:]) * 200 - 200, label="FTSE100 lows"))

# LAYOUT
layout = create_layout(title="TfL Tube Fares vs the FTSE100<br><sup>Prices are normalised between 0-100</sup>")
fig.update_layout(layout)
fig.update_xaxes(title_text="Year")
fig.update_yaxes(title_text="Relative price")

fig.show()


# TODO: add annotations for starting price
# Follow code here: https://plotly.com/python/line-charts/ -> section Label Lines with Annotations

In [14]:
# Multiplier: the largest relative value reached by either normalised series
m1, m2 = fares_normalised.max(), stocks_normalised.max()
m = max(m1, m2)
print(m)

stocks_normalised

1.4838709677419353


10    1.000000
11    1.014002
12    0.995945
13    1.141853
14    1.146709
15    1.182893
16    1.186229
17    1.278364
18    1.312555
19    1.283325
20    1.277044
21    1.238427
22    1.276646
23    1.336397
24    1.407368
25    1.479512
Name: HIGH_1, dtype: float64

In [15]:
# normalise_single_series_to_100(pd.concat([tflFares["singleZ1to4OysterPeak"][10:], ftse_yearly_max_prices['HIGH_1'].astype(float)[10:]], axis=1))
# I need the same starting point and for the max to be 100

In [16]:
# Draw the fare as a step function (the original note calls it non-decreasing)
# next to the FTSE100 annual highs drawn as a regular line.
# NOTE(review): the subtitle mentions a starting point of 1, but after
# "* 200 - 200" both series start at 0 — confirm the intended wording.
fig = go.Figure()

fares_normalised = normalise_series_to_first_value(tflFares["singleZ1toZ4OysterPeak"][10:]) * 200 - 200
stocks_normalised = normalise_series_to_first_value(ftse_yearly_max_prices['HIGH_1'].astype(float)[10:]) * 200 - 200

# TRACES
fare_trace = create_trace(
    x=tflFares['year'][10:],
    y=fares_normalised,
    label="Contactless ticket (PAYG)",
    line_shape="hv",
)
stock_trace = create_trace(
    x=ftse_yearly_max_prices['year'][10:],
    y=stocks_normalised,
    label="FTSE100 annual highs",
    # line_shape="hv"
)
fig.add_trace(fare_trace)
fig.add_trace(stock_trace)

# LAYOUT
layout = create_layout(title="TfL Tube Fares vs the FTSE100<br><sup>Prices are normalised to the same starting point of 1</sup>")
fig.update_layout(layout)

fig.show()

In [17]:
# Number of rows where the rescaled fare sits above the rescaled FTSE100 value
fares_normalised.sub(stocks_normalised).gt(0).sum()
# 11 out of 15 years the tube prices rose above the FTSE100

# Reselling your tube tickets would reap in more revenue than investing it in the UK economy
#  through a stock ISA. Sadiq, Rachel - wanna chip in?

np.int64(11)

### Plot for Streamlit Dashboard

In [18]:
# Time to plot dynamically. Variables
# - Transport modes selected
# - Indices selected
# - Year range
# How to distinguish between transport and stocks? Make transport costs a step function
combined_fares = pd.read_csv(FARE_PRICES_DIR.joinpath("processed", "combined_transport_fares.csv"))
combined_stocks = pd.read_csv(STOCK_PRICES_DIR.joinpath("combined_indices.csv"))

In [19]:
# Every column other than "year" is a selectable series.
indices_keys = list(combined_stocks.columns)
indices_keys.remove("year")
print(indices_keys)

transportation_keys = list(combined_fares.columns)
transportation_keys.remove("year")
print(transportation_keys)

['HangSeng_1985to2025', 'FTSE100_1985to2025', 'Nikkei225_1980to2025', 'SnP500_1980to2025', 'DAX_1987to2025']
['relative_bus_and_coach_fares', 'relative_rail_fares', 'singleZ1toZ4Cash', 'singleZ1toZ4OysterPeak', 'busCash', 'singleBusOyster', 'capBusTram', 'travelcardZ1toZ4', 'capZ1toZ4PAYG', 'travelcard7DayZ1toZ4', 'weeklyBusAndTramPass']


In [20]:
def normalise_series_to_stocks_anchor(series, stock_normed_df, years):
    """Rescale a fare series so its first valid point lands on the stock anchor.

    The anchor is the highest value across all non-"year" columns of
    ``stock_normed_df`` in the year of the series' first non-missing entry.

    Args:
        series: Fare values; may contain missing entries.
        stock_normed_df: Frame with a "year" column plus the stock columns.
        years: Year for every position of ``series`` (indexed by position).

    Returns:
        ``series / first_valid_value * anchor``. The anchor falls back to 1.0
        when the start year is absent from ``stock_normed_df`` or when every
        stock value for that year is missing (otherwise a NaN anchor would
        blank out the whole series). A series without any valid value is
        returned unchanged.
    """
    first_label = series.first_valid_index()
    if first_label is None:
        return series

    first_pos = series.index.get_loc(first_label)
    fare_start_year = years[first_pos]

    anchor = 1.0  # Fallback if the year (or all of its stock values) is missing
    stock_rows = stock_normed_df.loc[stock_normed_df['year'] == fare_start_year]
    if not stock_rows.empty:
        # Highest normalised stock value in the first matching row for that year
        candidate = stock_rows.drop('year', axis=1).max(axis=1).iloc[0]
        if pd.notna(candidate):
            anchor = candidate

    anchor_fare = series.iloc[first_pos]
    return (series / anchor_fare) * anchor
    
def normalise_fares_to_stock_anchor(fares_df, stock_normed_df):
    """Return a copy of ``fares_df`` with every fare column anchored to the stocks.

    Each column except "year" is passed through
    ``normalise_series_to_stocks_anchor`` together with the frame's "year"
    values; the input frame is left untouched.
    """
    fare_years = fares_df['year'].values
    anchored = fares_df.copy()
    fare_columns = [name for name in anchored.columns if name != "year"]
    for name in fare_columns:
        anchored[name] = normalise_series_to_stocks_anchor(anchored[name], stock_normed_df, fare_years)
    return anchored

In [22]:
def comparison_plot(selected_fares: list, selected_stocks: list, start_year: int, end_year: int):
    """Plot selected transport fares against selected stock indices.

    The module-level frames ``combined_stocks`` and ``combined_fares`` are
    filtered to ``start_year <= year <= end_year``. Stock columns are divided
    by their first non-missing value in that window. Fare columns are anchored
    via ``normalise_fares_to_stock_anchor``; the anchor looks at every stock
    column, not only the ones in ``selected_stocks``.

    Args:
        selected_fares: Column names of ``combined_fares`` to draw (step lines).
        selected_stocks: Column names of ``combined_stocks`` to draw (dashed
            lines); the legend label is the part of the name before the first "_".
        start_year: First year to include.
        end_year: Last year to include.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Restrict both frames to the requested (inclusive) year window.
    filtered_stocks = combined_stocks.loc[(combined_stocks.year >= start_year) & (combined_stocks.year <= end_year)]
    filtered_fares = combined_fares.loc[(combined_fares.year >= start_year) & (combined_fares.year <= end_year)]
    normalised_stocks = normalise_df_to_first_value_per_col(filtered_stocks)
    # normalised_fares = normalise_df_to_first_value_per_col(filtered_fares)
    normalised_fares = normalise_fares_to_stock_anchor(filtered_fares, normalised_stocks)

    fig = go.Figure()
    # TRACES
    # First, add stock indices
    for key in selected_stocks:
        fig.add_trace(create_trace(
            x=filtered_stocks['year'],
            y=normalised_stocks[key],
            label=key.split("_")[0],
            mode="lines",
            line_dash="dash"
        ))

    # Second, transportation fares
    for key in selected_fares:
        fig.add_trace(create_trace(
            x=filtered_fares['year'],
            y=normalised_fares[key],
            label=key,
            line_shape='hv'
        ))

    # LAYOUT (title spelling fixed: "Indecis" -> "Indices")
    layout = create_layout(title="Transportation Costs in the UK vs Global Stock Indices<br><sup>Prices are normalised to the same starting point of 1</sup>")
    fig.update_layout(layout)

    fig.show()

In [23]:
# First three fares vs first three indices, 2010-2020
selected_fares = transportation_keys[:3]
selected_stocks = indices_keys[:3]
comparison_plot(selected_fares=selected_fares, selected_stocks=selected_stocks, start_year=2010, end_year=2020)

In [27]:
# Fares from position 8 onwards vs first three indices, 2020-2025
selected_fares = transportation_keys[8:]
selected_stocks = indices_keys[:3]
comparison_plot(selected_fares=selected_fares, selected_stocks=selected_stocks, start_year=2020, end_year=2025)

In [26]:
# selected_stocks = indices_keys[2:]
# selected_fares = transportation_keys[7:10]
# comparison_plot(selected_fares, selected_stocks, 2000, 2025)

# Ideas, thoughts

In [None]:
# Create a plotly/streamlit chart where one can select the following from a dropdown:
# - Stock index -> get the S&P 500
# - Transportation mode -> we also have average rail and bus/coach fares (see Sources below)

# Also need a dropdown for selecting years (2000-2025), and need to deal with missing values
# Though fares go back to 1987, and so do most indices -> did a search in Perplexity

# Sources
# - Bus & coach: https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/docx/mm23
# - Train: https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/docw/mm23
# - FTSE100: downloaded from WSJ Markets
# - S&P 500: downloaded from WSJ Markets
# - DAX: downloaded from WSJ Markets
# - Nikkei 225: downloaded from WSJ Markets
# - Hang Seng: downloaded from WSJ Markets

In [None]:
# Could factor in inflation using CPIH
# CPIH stands for the Consumer Prices Index including owner occupiers' housing costs, which is the UK's leading measure of inflation
# Source: https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/l55o/mm23
# cpih = pd.read_csv(DATA_DIR / "CPIH-annual-rate-1989-2024-by-year.csv", skiprows=7)
# n_years = 2024 - 1989 + 1
# cpih.iloc[:n_years].tail()