# 2026 The Best Ski Trip Prediction

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

## Data Parsing and Cleaning

In [None]:
# Read the excel data
data_fp = list(Path().rglob('../data/2025*.xlsx'))[0]
excel_sheets = pd.read_excel(data_fp, sheet_name=None)
# Unpack the excel sheets
info_data, visitation_data, climate_data = [v for _, v in excel_sheets.items()]

### Parsing the info sheet 
1. Station ID and location
2. Winter ski weeks and dates

In [44]:
location_id = info_data.iloc[21:26,1].copy()
location_id = pd.DataFrame(location_id.str.split(" - ").to_list(), columns=["station_id", "location"])
location_id

Unnamed: 0,station_id,location
0,71032,Thredbo AWS
1,71075,Perisher AWS
2,72161,Cabramurra SMHEA AWS
3,83024,Mount Buller
4,83084,Falls Creek


In [43]:
ski_season_dates = (info_data.iloc[35:, 1:].copy()
                    .rename(columns={"Unnamed: 1": "week", "Unnamed: 2": "dates"})
                    .reset_index(drop=True))
ski_season_dates.loc[:,"dates"] = ski_season_dates.loc[:,"dates"].dt.strftime("%d-%b")
ski_season_dates

 '04-Aug' '11-Aug' '18-Aug' '25-Aug' '01-Sep' '08-Sep' '15-Sep']' has dtype incompatible with datetime64[ns], please explicitly cast to a compatible dtype first.
  ski_season_dates.loc[:,"dates"] = ski_season_dates.loc[:,"dates"].dt.strftime("%d-%b")


Unnamed: 0,week,dates
0,Week 1,09-Jun
1,Week 2,16-Jun
2,Week 3,23-Jun
3,Week 4,30-Jun
4,Week 5,07-Jul
5,Week 6,14-Jul
6,Week 7,21-Jul
7,Week 8,28-Jul
8,Week 9,04-Aug
9,Week 10,11-Aug


### Parsing the climate data

In [60]:
# Check unclean data
climate_data.tail()

Unnamed: 0,Bureau of Meteorology station number,Year,Month,Day,Maximum temperature (Degree C),Minimum temperature (Degree C),Rainfall amount (millimetres)
39808,72161,2025,7,24,3.0,-2.7,4.2
39809,72161,2025,7,25,6.0,-2.4,0.2
39810,72161,2025,7,26,2.7,0.0,10.2
39811,72161,2025,7,27,2.5,1.3,15.6
39812,72161,2025,7,28,2.2,-0.1,54.4


In [None]:
# Renaming columns using dictionary
cd_col_names = [col.lower() for col in climate_data.columns]
cd_col_names[0] = "station_id"
cd_col_names[-3:] = ["max_temp_c", "min_temp_c", "rainfall_mm"]
cd_col_names = {col_old: col_new for col_old, col_new in zip(climate_data.columns, cd_col_names)}
climate_data_clean = climate_data.copy().rename(columns=cd_col_names)
climate_data_clean.tail()

Unnamed: 0,station_id,year,month,day,max_temp_c,min_temp_c,rainfall_mm
39808,72161,2025,7,24,3.0,-2.7,4.2
39809,72161,2025,7,25,6.0,-2.4,0.2
39810,72161,2025,7,26,2.7,0.0,10.2
39811,72161,2025,7,27,2.5,1.3,15.6
39812,72161,2025,7,28,2.2,-0.1,54.4
