# 00 – Setup & Configuration

Project: Weather-Based Ski Resort Attendance Analysis  
Student: Aneta Kalabric  

This notebook:
- sets up the project environment
- defines folder structure
- loads and validates raw datasets
- defines global configuration variables

Datasets used:
- resorts.csv (ski resorts metadata)
- snow.csv (snow & weather observations)

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# Report which interpreter, Python version and working directory this kernel uses,
# one item per line.
print(
    f"Python executable: {sys.executable}",
    f"Python version: {sys.version}",
    f"Current working directory: {Path.cwd()}",
    sep="\n",
)

Python executable: c:\Users\aneta\Desktop\ski-attendance-akalabric\ski-attendance-akalabric\2025-sci-prog\projects\ski-attendance-akalabric\.venv\Scripts\python.exe
Python version: 3.14.2 (tags/v3.14.2:df79316, Dec  5 2025, 17:18:21) [MSC v.1944 64 bit (AMD64)]
Current working directory: c:\Users\aneta\Desktop\ski-attendance-akalabric\ski-attendance-akalabric\2025-sci-prog\projects\ski-attendance-akalabric\notebooks


In [2]:
def find_project_root(start: Path, markers: tuple[str, ...] = ("README.md",)) -> Path:
    """Walk upwards from ``start`` and return the first directory holding a marker.

    Parameters
    ----------
    start : Path
        Directory where the search begins. A relative path is made absolute
        first; otherwise it would have no parents and the search could never
        leave the starting directory.
    markers : tuple of str, optional
        File or directory names that identify the project root. A directory
        matches as soon as it contains any one of them. Defaults to
        ``("README.md",)``.

    Returns
    -------
    Path
        The closest directory (``start`` itself or one of its ancestors) that
        contains at least one marker.

    Raises
    ------
    RuntimeError
        If no directory on the way up to the filesystem root contains a marker.
    """
    # absolute() leaves an already-absolute path untouched (no symlink resolution).
    origin = Path(start).absolute()
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    raise RuntimeError("Project root not found")

# Anchor every path in this notebook to the project root, located by searching
# upwards from the directory the kernel was started in.
PROJECT_ROOT = find_project_root(start=Path.cwd())

print(f"PROJECT_ROOT: {PROJECT_ROOT}")

PROJECT_ROOT: c:\Users\aneta\Desktop\ski-attendance-akalabric\ski-attendance-akalabric\2025-sci-prog\projects\ski-attendance-akalabric


In [3]:
# Folder layout: <project root>/data/raw for inputs, <project root>/data/processed for outputs.
DATA_DIR = PROJECT_ROOT.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Create both folders (and any missing parents); existing folders are left as they are.
RAW_DIR.mkdir(exist_ok=True, parents=True)
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

print(f"RAW_DIR exists: {RAW_DIR.exists()}")
print(f"PROCESSED_DIR exists: {PROCESSED_DIR.exists()}")

RAW_DIR exists: True
PROCESSED_DIR exists: True


In [4]:
def read_csv_safely(
    path: Path,
    encodings: tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin1"),
    **read_csv_kwargs,
) -> pd.DataFrame:
    """Read a CSV file, trying several text encodings in order.

    Parameters
    ----------
    path : Path
        Location of the CSV file.
    encodings : tuple of str, optional
        Encodings to try, in order; the first one that decodes the file wins.
        With the default list, ``latin1`` acts as a catch-all because it can
        decode any byte sequence.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed file, decoded with the first encoding that succeeded.

    Raises
    ------
    ValueError
        If ``encodings`` is empty.
    UnicodeDecodeError
        If none of the encodings can decode the file. The error is chained to
        the failure of the last encoding tried.
    """
    if not encodings:
        raise ValueError("At least one encoding must be provided")

    last_error = None
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, **read_csv_kwargs)
        except UnicodeDecodeError as exc:
            last_error = exc

    # UnicodeDecodeError takes five constructor arguments; passing only a message
    # raises TypeError instead. Reuse the details of the last failure and put
    # the file path into the reason text.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Cannot decode file: {path}",
    ) from last_error

In [5]:
resorts_path = RAW_DIR / "resorts.csv"

# RAW_DIR is created on demand, so it may exist but be empty: fail with a clear hint.
if not resorts_path.is_file():
    raise FileNotFoundError(f"Missing raw dataset: {resorts_path}")

df_resorts = read_csv_safely(resorts_path)

# normalize column names: trim whitespace, lower-case, spaces -> underscores
df_resorts.columns = (
    df_resorts.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_", regex=False)
)

# validate: the file must contain rows and the identifier / coordinate columns
resorts_required_cols = {"id", "resort", "latitude", "longitude"}
resorts_missing_cols = resorts_required_cols - set(df_resorts.columns)
assert not df_resorts.empty, "resorts.csv contains no rows"
assert not resorts_missing_cols, f"resorts.csv is missing columns: {sorted(resorts_missing_cols)}"

print("Resorts dataset shape:", df_resorts.shape)
df_resorts.head()

Resorts dataset shape: (499, 25)


Unnamed: 0,id,resort,latitude,longitude,country,continent,price,season,highest_point,lowest_point,...,snow_cannons,surface_lifts,chair_lifts,gondola_lifts,total_lifts,lift_capacity,child_friendly,snowparks,nightskiing,summer_skiing
0,1,Hemsedal,60.928244,8.383487,Norway,Europe,46,November - May,1450,620,...,325,15,6,0,21,22921,Yes,Yes,Yes,No
1,2,Geilosiden Geilo,60.534526,8.206372,Norway,Europe,44,November - April,1178,800,...,100,18,6,0,24,14225,Yes,Yes,Yes,No
2,3,Golm,47.05781,9.828167,Austria,Europe,48,December - April,2110,650,...,123,4,4,3,11,16240,Yes,No,No,No
3,4,Red Mountain Resort-Rossland,49.10552,-117.84628,Canada,North America,60,December - April,2075,1185,...,0,2,5,1,8,9200,Yes,Yes,Yes,No
4,5,Hafjell,61.230369,10.529014,Norway,Europe,45,November - April,1030,195,...,150,14,3,1,18,21060,Yes,Yes,Yes,No


In [6]:
snow_path = RAW_DIR / "snow.csv"

# RAW_DIR is created on demand, so it may exist but be empty: fail with a clear hint.
if not snow_path.is_file():
    raise FileNotFoundError(f"Missing raw dataset: {snow_path}")

df_snow = read_csv_safely(snow_path)

# normalize column names: trim whitespace, lower-case, spaces -> underscores
df_snow.columns = (
    df_snow.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_", regex=False)
)

# validate: the file must contain rows and the date / coordinate / snow columns
snow_required_cols = {"month", "latitude", "longitude", "snow"}
snow_missing_cols = snow_required_cols - set(df_snow.columns)
assert not df_snow.empty, "snow.csv contains no rows"
assert not snow_missing_cols, f"snow.csv is missing columns: {sorted(snow_missing_cols)}"

print("Snow dataset shape:", df_snow.shape)
df_snow.head()

Snow dataset shape: (820522, 4)


Unnamed: 0,month,latitude,longitude,snow
0,2022-12-01,63.125,68.875,95.28
1,2022-12-01,63.125,69.125,100.0
2,2022-12-01,63.125,69.375,100.0
3,2022-12-01,63.125,69.625,100.0
4,2022-12-01,63.125,69.875,100.0


In [7]:
# List the normalized column names of both datasets; the leading "\n" in the
# second label separates the two listings with a blank line.
print("Resorts columns:", df_resorts.columns.tolist(), sep="\n")
print("\nSnow columns:", df_snow.columns.tolist(), sep="\n")

Resorts columns:
['id', 'resort', 'latitude', 'longitude', 'country', 'continent', 'price', 'season', 'highest_point', 'lowest_point', 'beginner_slopes', 'intermediate_slopes', 'difficult_slopes', 'total_slopes', 'longest_run', 'snow_cannons', 'surface_lifts', 'chair_lifts', 'gondola_lifts', 'total_lifts', 'lift_capacity', 'child_friendly', 'snowparks', 'nightskiing', 'summer_skiing']

Snow columns:
['month', 'latitude', 'longitude', 'snow']


In [8]:
# Persist both frames (now with normalized column names) to data/processed;
# index=False keeps the row index out of the written files.
df_resorts.to_csv(PROCESSED_DIR.joinpath("resorts_clean.csv"), index=False)
df_snow.to_csv(PROCESSED_DIR.joinpath("snow_clean.csv"), index=False)

print("Clean datasets saved to processed/")

Clean datasets saved to processed/
