# Zdruzitev CSV datotek iz data/raw

Ta notebook:
- prebere vse CSV datoteke iz `data/raw`
- doda stolpec `station` (ime datoteke brez `.csv`)
- zdruzi vse vrstice v en DataFrame
- shrani rezultat v `data/raw/all_stations.csv`


In [11]:
from pathlib import Path
import pandas as pd

# Raise pandas' display limits: up to 100 rows and 200 columns per rendered frame.
for option_name, option_value in (
    ("display.max_rows", 100),
    ("display.max_columns", 200),
):
    pd.set_option(option_name, option_value)


In [12]:
def find_project_root(start: Path) -> Path:
    """Return the nearest directory that looks like the project root.

    The search begins at ``start`` (resolved to an absolute path) and walks
    upwards through its parents. A directory qualifies when it contains both
    a ``pyproject.toml`` entry and a ``data`` entry.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first qualifying directory, or the resolved ``start`` itself when
        no directory on the way up qualifies.
    """
    origin = start.resolve()
    marked_dirs = (
        folder
        for folder in (origin, *origin.parents)
        if (folder / "pyproject.toml").exists() and (folder / "data").exists()
    )
    # Fall back to the starting directory when no marker pair is found.
    return next(marked_dirs, origin)

# Resolve the project layout relative to the current working directory.
ROOT_DIR = find_project_root(Path.cwd())
RAW_DIR = ROOT_DIR / "data" / "raw"
# NOTE(review): the combined file is written into the same directory the
# inputs are read from. The location is kept as-is so existing consumers of
# this path keep working -- confirm whether a separate output directory is wanted.
OUT_DIR = ROOT_DIR / "data" / "raw"
OUT_PATH = OUT_DIR / "all_stations.csv"

OUT_DIR.mkdir(parents=True, exist_ok=True)
# Because OUT_PATH lives inside RAW_DIR, a combined file left by an earlier run
# matches the "*.csv" glob. It must not be read back as a station: it already
# has a "station" column, so inserting one again fails, and its rows would
# otherwise be duplicated in the result. Excluding it keeps the notebook
# re-runnable (Restart Kernel -> Run All).
files = sorted(path for path in RAW_DIR.glob("*.csv") if path != OUT_PATH)

print(f"ROOT_DIR: {ROOT_DIR}")
print(f"RAW_DIR: {RAW_DIR}")
print(f"CSV files found: {len(files)}")
# Stop early: every later cell needs at least one input file.
if not files:
    raise FileNotFoundError(f"No CSV files found in {RAW_DIR}")


ROOT_DIR: /home/benjamin/priprava_podatkov
RAW_DIR: /home/benjamin/priprava_podatkov/data/raw
CSV files found: 21


In [13]:
# Group the input files by their header row (column names, in order) so that
# schema differences are visible before the files are concatenated.
column_layouts = {}
for csv_path in files:
    # nrows=0 reads only the header line, not the data.
    header = pd.read_csv(csv_path, nrows=0).columns
    layout = tuple(header.tolist())
    if layout not in column_layouts:
        column_layouts[layout] = []
    column_layouts[layout].append(csv_path.name)

layout_count = len(column_layouts)
print(f"Unique column layouts: {layout_count}")

layout_no = 0
for layout, file_names in column_layouts.items():
    layout_no += 1
    print(f"Layout {layout_no}: {len(file_names)} files")
    print(list(layout))

# More than one layout means the files do not share a single schema.
if layout_count > 1:
    print("OPOZORILO: stolpci se razlikujejo med datotekami; uporabljen bo union stolpcev.")


Unique column layouts: 1
Layout 1: 21 files
['datetime', 'PM10', 'PM2.5', 'temperature', 'rain', 'pressure', 'precipitation', 'wind_speed', 'clouds', 'wind_direction']


In [14]:
# Read every station file and tag its rows with the station name
# (the file name without the ".csv" suffix) as the first column.
frames = []
for path in files:
    station = path.stem
    df = pd.read_csv(path)
    df.insert(0, "station", station)
    frames.append(df)

# One concat over the collected frames; sort=False keeps the column order as read.
combined = pd.concat(frames, ignore_index=True, sort=False)
if "datetime" in combined.columns:
    raw_datetime = combined["datetime"]
    combined["datetime"] = pd.to_datetime(raw_datetime, errors="coerce")
    # errors="coerce" turns unparseable values into NaT without any signal.
    # Count values that were present before parsing but are missing after it,
    # so bad timestamps do not slip into the saved file unnoticed.
    unparsed = int((raw_datetime.notna() & combined["datetime"].isna()).sum())
    if unparsed:
        print(f"OPOZORILO: {unparsed:,} vrednosti v stolpcu 'datetime' ni bilo mogoce pretvoriti v datum (NaT).")
    combined = combined.sort_values(["station", "datetime"]).reset_index(drop=True)

combined.to_csv(OUT_PATH, index=False)

print(f"Saved: {OUT_PATH}")
print(f"Rows: {len(combined):,}")
print(f"Columns: {len(combined.columns)}")
combined.head()


Saved: /home/benjamin/priprava_podatkov/data/raw/all_stations.csv
Rows: 296,100
Columns: 11


Unnamed: 0,station,datetime,PM10,PM2.5,temperature,rain,pressure,precipitation,wind_speed,clouds,wind_direction
0,E403,2024-05-02 20:00:00,3.0,2.0,11.0,0.0,1008.0,87.0,3.0,0.0,0.0
1,E403,2024-05-02 21:00:00,6.0,3.0,11.0,0.0,1008.0,87.0,3.0,0.0,0.0
2,E403,2024-05-02 22:00:00,7.0,3.0,11.0,0.4,1009.0,84.0,13.0,0.0,0.0
3,E403,2024-05-02 23:00:00,7.0,4.0,10.0,0.0,1009.0,95.0,4.0,0.0,0.0
4,E403,2024-05-03 00:00:00,7.0,3.0,10.0,0.0,1009.0,93.0,3.0,0.0,0.0


In [15]:
# Row count per station, as a two-column frame: "station" and "rows".
summary = combined.groupby("station").size().reset_index(name="rows")
summary


Unnamed: 0,station,rows
0,E403,14100
1,E404,14100
2,E405,14100
3,E407,14100
4,E408,14100
5,E409,14100
6,E410,14100
7,E411,14100
8,E412,14100
9,E413,14100
