In [None]:
import pandas as pd
import os
from pathlib import Path

# Data folders are located relative to the current working directory:
# "data" lives in the grandparent of the directory the kernel runs in.
DATA_DIR = Path.cwd().parents[1].joinpath("data")
STOCK_PRICES_DIR_RAW = DATA_DIR.joinpath("stocks", "raw")
TRANSPORT_FARES_DIR = DATA_DIR.joinpath("fares")
TRANSPORT_FARES_DIR

# Transport Fares

In [None]:
# Load the three raw fare tables from the "raw" sub-folder.
# skiprows=7 drops the first seven lines of the two files it is applied to
# before the header row is read (presumably a metadata preamble — confirm
# against the raw files). The TfL file is read from its first line.
bus_and_coach_fares = pd.read_csv(
    TRANSPORT_FARES_DIR.joinpath("raw", "bus_and_coach_fares.csv"),
    skiprows=7,
)
rail_fares = pd.read_csv(
    TRANSPORT_FARES_DIR.joinpath("raw", "rail_fares.csv"),
    skiprows=7,
)
tfl_fares = pd.read_csv(
    TRANSPORT_FARES_DIR.joinpath("raw", "TfLHistoricalFares2000to2025.csv")
)

In [None]:
# Last calendar year to keep from the fares tables.
LAST_FARE_YEAR = 2024

# Number of leading rows to keep: one per year from the first rail-fares year
# through LAST_FARE_YEAR (assumes one row per consecutive year — TODO confirm).
n_rows = LAST_FARE_YEAR - int(rail_fares["year"].iloc[0]) + 1

# .copy() detaches each trimmed frame from the frame it was sliced from, so the
# column assignment that follows does not raise SettingWithCopyWarning.
rail_fares = rail_fares.iloc[:n_rows].copy()
rail_fares["year"] = rail_fares["year"].astype(int)

# NOTE(review): n_rows is derived from rail_fares only; reusing it here assumes
# bus_and_coach_fares starts in the same year — confirm against the raw file.
bus_and_coach_fares = bus_and_coach_fares.iloc[:n_rows].copy()
bus_and_coach_fares["year"] = bus_and_coach_fares["year"].astype(int)

In [None]:
# Join the three fare tables on "year": bus/coach and rail are inner-joined
# (only years present in both survive), then TfL fares are outer-joined so
# every year from either side is kept.
combined_fares = (
    bus_and_coach_fares
    .merge(rail_fares, on="year")
    .merge(tfl_fares, on="year", how="outer")
)
combined_fares.head()

In [None]:
# Write the merged fares table. The output folder is created first so this
# cell also works when "processed" does not exist yet (e.g. a fresh checkout).
processed_fares_dir = TRANSPORT_FARES_DIR / "processed"
processed_fares_dir.mkdir(parents=True, exist_ok=True)
combined_fares.to_csv(processed_fares_dir / "combined_transport_fares.csv", index=False)

# Stock Indices

In [None]:
def preprocess_stock_prices_dataset(fpath: Path) -> pd.DataFrame:
    """Aggregate a stock-price CSV to one row per calendar year.

    The file is read, its ``Date`` column is parsed, and the maximum of every
    column is taken independently within each year (so a row holds each
    column's yearly peak, not the values of one particular date). Column names
    are stripped of surrounding whitespace and lower-cased.

    The result is also written to
    ``<fpath.parents[1]>/processed/<file stem>_processed.csv`` (without the
    index); the ``processed`` folder is created if it does not exist.

    Parameters
    ----------
    fpath : Path
        Path to the raw CSV file. It must contain a ``Date`` column and sit at
        least two directory levels deep (``parents[1]`` is used).

    Returns
    -------
    pd.DataFrame
        The per-year table, sorted by year, with ``year`` as a regular column.

    Raises
    ------
    KeyError
        If the file has no ``Date`` column.
    """
    fpath = Path(fpath)

    df = pd.read_csv(fpath)
    df["Date"] = pd.to_datetime(df["Date"])
    df["year"] = df["Date"].dt.year

    df_processed = df.groupby("year").max().sort_index()
    df_processed.columns = [s.strip().lower() for s in df_processed.columns]

    # After the groupby, "year" is the index rather than a column; move it
    # back into the columns so it survives to_csv(index=False).
    if "year" not in df_processed.columns:
        df_processed = df_processed.reset_index(drop=False)

    out_dir = fpath.parents[1] / "processed"
    out_dir.mkdir(parents=True, exist_ok=True)
    df_processed.to_csv(out_dir / f"{fpath.stem}_processed.csv", index=False)

    return df_processed

In [None]:
# Preprocess every CSV file in the raw stock-price folder, keyed by file stem.
processed_indecis = {}
# sorted() makes the processing order (and so the dict order) deterministic;
# iterdir() yields entries in arbitrary order.
for fpath in sorted(STOCK_PRICES_DIR_RAW.iterdir()):
    # Check the extension rather than searching the whole path string, which
    # would also match directory names or files such as "x.csv.bak".
    if fpath.is_file() and fpath.suffix.lower() == ".csv":
        print(fpath.parent, fpath.name)
        processed_indecis[fpath.stem] = preprocess_stock_prices_dataset(fpath)

In [None]:
# Quick look at the DAX table: its "year" column joined back onto its own
# year/high columns (merge uses the shared "year" column as the key).
pd.DataFrame(
    processed_indecis["DAX_1987to2025"]["year"],
    columns=["year"],
).merge(
    processed_indecis["DAX_1987to2025"][["year", "high"]]
)

In [None]:
# Build one wide table: a "year" column plus one column per dataset holding
# that dataset's "high" value for the year.
if not processed_indecis:
    raise ValueError("processed_indecis is empty; nothing to combine")

# Seed with the years of the first dataset instead of a hard-coded file name.
# Every dataset (including the seed) is outer-merged below, so the final set
# of years is the union across all datasets regardless of which one seeds it.
seed_df = next(iter(processed_indecis.values()))
combined_indecis = seed_df[["year"]].copy()
for key, df in processed_indecis.items():
    incoming_df = df[["year", "high"]].rename(columns={"high": key})
    combined_indecis = combined_indecis.merge(incoming_df, on="year", how="outer")

In [None]:
# Show the combined table as this cell's output for a visual check.
combined_indecis

In [None]:
# Save the combined table next to the "raw" folder (in its parent directory),
# without writing the DataFrame index.
combined_indices_path = STOCK_PRICES_DIR_RAW.parent.joinpath("combined_indices.csv")
combined_indecis.to_csv(combined_indices_path, index=False)