In [36]:
import csv

In [37]:
with open("../../data/out/main.csv") as f:
    data = list(csv.reader(f))

In [39]:
data

[['ID',
  'Art',
  'RingString',
  'Ablesung',
  'Datum',
  'Ort',
  'Gruppengröße',
  'Bemerkung',
  'Melder',
  'gemeldet'],
 ['0', 'Nilgans', 'HE2', 'HE2', '5/19/11', 'Alte Brücke', '45', '', '', '577'],
 ['1',
  'Kanadagans',
  '290379',
  '290 379',
  '2/24/19',
  'Ostpark',
  '',
  '',
  'IR',
  ' '],
 ['2',
  'Graugans',
  '274848',
  '274848',
  '4/17/19',
  'Ostpark',
  '',
  '',
  'IR',
  '?'],
 ['3',
  'Graugans',
  '290334',
  '290 334',
  '6/29/20',
  'Höchst Stadtpark',
  '',
  'flugfähig',
  'IR',
  '?'],
 ['4',
  'Graugans',
  '290335',
  '290 335',
  '7/19/19',
  'Höchst Stadtpark',
  '',
  '',
  'Usinger, H.',
  '?'],
 ['5',
  'Graugans',
  '290344',
  '290 3(44)',
  '6/29/20',
  'Höchst Stadtpark',
  '',
  'flugfähig',
  'IR',
  '?'],
 ['6',
  'Höckerschwan',
  '120830',
  '120830',
  '1/10/23',
  'Bodenheim/Laubenheim, Hochwasserpolder',
  '28',
  '120830/120878; ornitho.de, Foto',
  'Dürk, T.',
  '?'],
 ['7',
  'Höckerschwan',
  '120878',
  '120878',
  '1/10/23',
 

In [38]:
import sys

sys.getsizeof(data) / (1024 * 1024)  # Convert bytes to MB

0.05696868896484375

In [10]:
len(data)

6640

In [18]:
import pickle

In [12]:
with open("../../data/out/main.pkl", "wb+") as f:
    pickle.dump(data[1:], f)

In [19]:
with open("../../data/out/main.pkl", "rb") as f:
    data = pickle.load(f)

EOFError: Ran out of input

In [27]:
from pydantic import BaseModel, field_validator
from datetime import date


class Sighting(BaseModel):
    id: str
    species: str | None = None
    ring: str | None = None
    reading: str
    date: date
    place: str | None = None
    group_size: int | None = None
    comment: str | None = None
    melder: str | None = None
    melded: bool = False

    @field_validator("date", mode="before")
    @classmethod
    def parse_date(cls, value):
        if isinstance(value, str):
            try:
                # Convert mm/dd/yy to yyyy-mm-dd
                month, day, year = value.split("/")
                year = "20" + year if int(year) < 50 else "19" + year
                return date.fromisoformat(f"{year}-{month.zfill(2)}-{day.zfill(2)}")
            finally:
                return date(year=2000, month=1, day=1)
        return value


from enum import Enum


class SightingCols(Enum):
    id = 0
    species = 1
    ring = 2
    reading = 3
    date = 4
    place = 5
    group_size = 6
    comment = 7
    melder = 8

In [51]:
models = [Sighting(**{c.name: d[c.value] for c in SightingCols if d[c.value] != ""}) for d in data[1:]]

In [58]:
import sys

sys.getsizeof(models) / (1024 * 1024)  # Convert bytes to MB

0.05696868896484375

In [60]:
import sys
import json

sys.getsizeof(json.dumps([m.model_dump() for m in models])) / (1024 * 1024)  # Convert bytes to MB

TypeError: Object of type date is not JSON serializable

In [55]:
import sys

sys.getsizeof(data[20:]) / (1024 * 1024)  # Convert bytes to MB

0.05055999755859375

In [61]:
with open("../../data/out/sightings.pyd", "wb+") as f:
    pickle.dump(models, f)

In [16]:
tdata = [tuple(e) for e in data]

In [18]:
with open("../../data/out/sightings.tpl", "wb+") as f:
    pickle.dump(tdata, f)

In [8]:
from db.cols import SightingCols
import pickle


S3_BUCKET = "vogelring-data"
DATA_FILE = "main.pkl"

In [35]:
%%timeit

with open("../../data/out/sightings.pyd", "rb") as f:
    data = pickle.load(f)

7.29 ms ± 144 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
%%timeit

with open("../../data/out/sightings.tpl", "rb") as f:
    data = pickle.load(f)

    models = [Sighting(**{c.name: d[c.value] for c in SightingCols if d[c.value] != ""}) for d in data[1:]]

25.4 ms ± 947 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
%%timeit

with open("../../data/out/main.csv", "r") as f:
    data = list(csv.reader(f))

3.83 ms ± 24.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
