In [1]:
import pandas as pd
import os
from pathlib import Path

DATA_DIR = Path(os.path.abspath('')).parents[1] / "data"
STOCK_PRICES_DIR_RAW = DATA_DIR / "stocks" / "raw"
TRANSPORT_FARES_DIR = DATA_DIR / "fares"
TRANSPORT_FARES_DIR

MIN_YEAR = 2000

# Transport Fares

In [2]:
bus_and_coach_fares = pd.read_csv(TRANSPORT_FARES_DIR / "raw" / "bus_and_coach_fares.csv", skiprows=7)
rail_fares = pd.read_csv(TRANSPORT_FARES_DIR / "raw" / "rail_fares.csv", skiprows=7)
tfl_fares = pd.read_csv(TRANSPORT_FARES_DIR / "raw" / "TfLHistoricalFares2000to2025.csv")

In [3]:
tfl_fares.columns

Index(['year', 'Single_Z1_to_Z4_Cash', 'Single_Z1_to_Z4_Oyster_Peak',
       'Bus_Cash', 'Single_Bus_Oyster', 'Cap_Bus_Tram', 'Travelcard_Z1_to_Z4',
       'Cap_Z1_to_Z4_PAYG', 'Travelcard_7_Day_Z1_to_Z4',
       'Weekly_Bus_And_Tram_Pass'],
      dtype='object')

In [4]:
n_rows = 2024 - int(rail_fares.iloc[0].year) + 1

rail_fares = rail_fares.iloc[:n_rows]
rail_fares["year"] = rail_fares["year"].astype(int)

bus_and_coach_fares = bus_and_coach_fares.iloc[:n_rows]
bus_and_coach_fares["year"] = bus_and_coach_fares["year"].astype(int)

In [5]:
combined_fares = bus_and_coach_fares.merge(rail_fares, on='year')
combined_fares = combined_fares.merge(tfl_fares, on='year', how='outer')
combined_fares.head()

Unnamed: 0,year,Bus_and_Coach,Rail,Single_Z1_to_Z4_Cash,Single_Z1_to_Z4_Oyster_Peak,Bus_Cash,Single_Bus_Oyster,Cap_Bus_Tram,Travelcard_Z1_to_Z4,Cap_Z1_to_Z4_PAYG,Travelcard_7_Day_Z1_to_Z4,Weekly_Bus_And_Tram_Pass
0,1987,103.4,100.6,,,,,,,,,
1,1988,110.6,107.6,,,,,,,,,
2,1989,119.3,117.4,,,,,,,,,
3,1990,125.9,127.7,,,,,,,,,
4,1991,143.6,141.0,,,,,,,,,


In [6]:
combined_fares = combined_fares.loc[combined_fares.year >= MIN_YEAR]

In [7]:
combined_fares.to_csv(TRANSPORT_FARES_DIR / "processed" / "combined_transport_fares.csv", index=False)

# Stock Indeces

In [8]:
def preprocess_stock_prices_dataset(fpath: Path) -> None:
    df = pd.read_csv(fpath)
    df["Date"] = pd.to_datetime(df["Date"])
    df["year"] = df.Date.dt.year

    df_processed = df.groupby("year").max().sort_values("year")
    df_processed.columns = [s.strip().lower() for s in df_processed.columns]

    if ("year" not in df_processed.columns):
        df_processed = df_processed.reset_index(drop=False)

    # df_processed.to_csv(fname[:-4] + "_processed.csv", index=False)
    new_fname =  fpath.name[:-4] + "_processed.csv"
    df_processed.to_csv(fpath.parents[1] / "processed" / new_fname, index=False)

    return df_processed

In [9]:
processed_indecis = {}
for fpath in STOCK_PRICES_DIR_RAW.iterdir():
    if ("csv" in str(fpath)):
        print(fpath.parent, fpath.name)
        processed_indecis[fpath.name[:-4]] = preprocess_stock_prices_dataset(fpath)

/Users/andras.vekassy/localFolder/visualisations/tfl-price-visualisations/data/stocks/raw HangSeng_1985to2025.csv
/Users/andras.vekassy/localFolder/visualisations/tfl-price-visualisations/data/stocks/raw FTSE100_1985to2025.csv


  df["Date"] = pd.to_datetime(df["Date"])
  df["Date"] = pd.to_datetime(df["Date"])


/Users/andras.vekassy/localFolder/visualisations/tfl-price-visualisations/data/stocks/raw Nikkei225_1980to2025.csv
/Users/andras.vekassy/localFolder/visualisations/tfl-price-visualisations/data/stocks/raw SnP500_1980to2025.csv


  df["Date"] = pd.to_datetime(df["Date"])
  df["Date"] = pd.to_datetime(df["Date"])


/Users/andras.vekassy/localFolder/visualisations/tfl-price-visualisations/data/stocks/raw DAX_1987to2025.csv


  df["Date"] = pd.to_datetime(df["Date"])


In [10]:
pd.DataFrame(processed_indecis['DAX_1987to2025'].year, columns=['year']).merge(processed_indecis['DAX_1987to2025'][['year', 'high']])

Unnamed: 0,year,high
0,1987,1000.0
1,1988,1340.41
2,1989,1790.37
3,1990,1968.55
4,1991,1715.8
5,1992,1811.57
6,1993,2266.68
7,1994,2271.11
8,1995,2394.15
9,1996,2909.91


In [11]:
# processed_indecis['DAX_1987to2025'].high
combined_indecis = pd.DataFrame(processed_indecis['DAX_1987to2025'].year, columns=['year'])
for key, df in processed_indecis.items():
    incoming_df = df[['year','high']].rename(columns={'high': key})
    combined_indecis = combined_indecis.merge(incoming_df, on='year', how='outer')

In [12]:
combined_indecis = combined_indecis.loc[combined_indecis.year >= MIN_YEAR]
combined_indecis

Unnamed: 0,year,HangSeng_1985to2025,FTSE100_1985to2025,Nikkei225_1980to2025,SnP500_1980to2025,DAX_1987to2025
20,2000,18301.69,6930.2,20833.2,1553.11,8064.97
21,2001,16163.99,6360.3,14556.1,1383.37,6795.14
22,2002,11974.61,5362.3,12081.4,1176.97,5462.55
23,2003,12594.42,4491.8,11238.6,1112.56,3965.16
24,2004,14266.38,4826.2,12195.7,1217.33,4261.79
25,2005,15508.57,5647.2,16445.56,1275.8,5469.96
26,2006,20049.03,6271.4,17563.37,1431.81,6629.33
27,2007,31958.41,6754.1,18300.39,1576.09,8151.57
28,2008,27853.6,6534.7,15156.66,1471.77,8100.64
29,2009,23099.57,5445.17,10767.0,1130.38,6026.69


In [17]:
combined_indecis.columns = [name.split("_")[0] for name in combined_indecis.columns]

In [18]:
combined_indecis.to_csv(STOCK_PRICES_DIR_RAW.parent / "combined_indices.csv", index=False)