In [None]:
%load_ext line_profiler
import polars as pl
import pandas as pd
import numpy as np

In [None]:
import time


def relax():
    """No-op placeholder: performs no work and returns ``None``."""
    return None


def bottleneck():
    """Pause the caller for 0.001 seconds via ``time.sleep``; returns ``None``."""
    pause_seconds = 0.001
    time.sleep(pause_seconds)


def some_function():
    """Build a list of squares, call ``bottleneck``, double the list, call ``relax``.

    Nothing is returned. This function is the target of the ``%lprun`` and
    ``%%pyinstrument`` cells that follow it in the notebook.
    """
    base_values = list(range(1_000))
    squares = [value**2 for value in base_values]
    bottleneck()
    # In-place self-concatenation: the list ends up twice as long.
    squares += squares
    relax()

In [None]:
%lprun -f some_function some_function()

In [None]:
%load_ext pyinstrument

In [None]:
%%pyinstrument
some_function()

In [None]:
%load_ext memory_profiler

In [None]:
# Import the decorator in this cell so it runs on a fresh kernel; elsewhere in
# the notebook `profile` is only imported in a later cell.
from memory_profiler import profile


@profile
def some_function():
    """Memory-profiled variant of ``some_function``.

    Rebinds the name ``some_function`` to a version wrapped by
    ``memory_profiler.profile``. The body builds a list of 1,000 squares,
    calls ``bottleneck``, doubles the list in place, then calls ``relax``.
    Nothing is returned.

    NOTE(review): memory_profiler looks up the function's source to report
    per-line usage; confirm this works for functions defined in a notebook cell.
    """
    nums = [num for num in range(1_000)]
    bigger_nums = [num**2 for num in nums]
    bottleneck()
    # Extending a list with itself doubles its length.
    bigger_nums.extend(bigger_nums)
    relax()

In [None]:
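# Disabled: memory_profiler registers the `%memit` and `%mprun` magics; there is no `%%memory_profiler` magic.
#%%memory_profiler some_function()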

In [None]:
from memory_profiler import profile

import polars as pl
import pandas as pd
import numpy as np
def avgVelocity():
    """Eagerly compute the mean velocity per spectral type.

    Reads ``synthetic_stars.csv`` with ``pl.read_csv``, keeps rows whose
    ``temperature`` is greater than 20000, averages ``velocity`` within each
    ``spectral_type`` group, and sorts the groups by that average in
    descending order.

    Returns:
        The aggregated frame with the columns ``spectral_type`` and
        ``avg_velocity``. (Previously the result was computed and dropped.)
    """
    stars = pl.read_csv("synthetic_stars.csv")
    above_threshold = stars.filter(pl.col("temperature") > 20000)
    return (
        above_threshold.group_by("spectral_type")
        .agg(pl.col("velocity").mean().alias("avg_velocity"))
        .sort("avg_velocity", descending=True)
    )


In [None]:
%lprun -f avgVelocity avgVelocity()

In [None]:
%%pyinstrument
avgVelocity()

In [None]:
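# Disabled. NOTE(review): `%mprun` appears to require the profiled function to live in a file on disk
# rather than in a notebook cell; confirm before re-enabling.
#%mprun -f avgVelocity avgVelocity()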

In [None]:
def avgVelocity():
    """Lazily compute the mean velocity per spectral type, then collect it.

    Builds a lazy query with ``pl.scan_csv`` over ``synthetic_stars.csv``:
    keep rows whose ``temperature`` is greater than 20000, average
    ``velocity`` within each ``spectral_type`` group, and sort by that average
    in descending order. The query only runs when ``collect()`` is called.

    Returns:
        The collected frame with the columns ``spectral_type`` and
        ``avg_velocity``. (Previously the result was computed and dropped.)
    """
    query = (
        pl.scan_csv("synthetic_stars.csv")
        .filter(pl.col("temperature") > 20000)
        .group_by("spectral_type")
        .agg(pl.col("velocity").mean().alias("avg_velocity"))
        .sort("avg_velocity", descending=True)
    )
    return query.collect()

In [None]:
%lprun -f avgVelocity avgVelocity()

In [None]:
%%pyinstrument
avgVelocity()

In [None]:
def avgVelocity():
    """Print the query plan for the lazy average-velocity query.

    The query scans ``synthetic_stars.csv``, keeps rows whose ``temperature``
    is greater than 20000, averages ``velocity`` per ``spectral_type`` and
    sorts by that average in descending order. Returns ``None``.
    """
    lazy_query = pl.scan_csv("synthetic_stars.csv").filter(
        pl.col("temperature") > 20000
    )
    mean_velocity = pl.col("velocity").mean().alias("avg_velocity")
    lazy_query = (
        lazy_query.group_by("spectral_type")
        .agg([mean_velocity])
        .sort("avg_velocity", descending=True)
    )
    # Deliberately no collect(): only the textual plan is wanted here.
    print(lazy_query.explain())


avgVelocity()

In [None]:
import polars as pl
from polars.testing.parametric import dataframes
from polars import NUMERIC_DTYPES
from hypothesis import given

@given(dataframes(cols=5, allow_null=True, allowed_dtypes=NUMERIC_DTYPES))
def test_star_numeric_columns(df: pl.DataFrame):
    """Property test: every column of a generated frame has a numeric dtype.

    The strategy requests five columns restricted to ``NUMERIC_DTYPES`` with
    nulls allowed.
    """
    column_is_numeric = [df[name].dtype.is_numeric() for name in df.columns]
    assert all(column_is_numeric)

In [None]:
import polars as pl
from polars.testing.parametric import column, dataframes
import hypothesis.strategies as st
from hypothesis import given

@given(
    dataframes(
        cols=[
            column("id", strategy=st.text(min_size=3, max_size=6)),
            column(
                "spectral_type",
                strategy=st.sampled_from(["O", "B", "A", "F", "G", "K", "M"]),
            ),
            column(
                "temperature",
                strategy=st.integers(min_value=2500, max_value=40000),
            ),
            column(
                "velocity",
                strategy=st.floats(min_value=-1000.0, max_value=1000.0),
            ),
            column(
                "magnitude",
                strategy=st.floats(min_value=-10.0, max_value=20.0),
            ),
        ],
        min_size=5,
        lazy=True,
    )
)
def test_star_lazyframe_properties(lf: pl.LazyFrame):
    """Property test: a generated lazy frame collects to exactly five columns.

    The strategy declares five named columns (``id``, ``spectral_type``,
    ``temperature``, ``velocity``, ``magnitude``), each with its own value
    strategy, and asks for lazy frames.
    """
    collected = lf.collect()
    _, column_count = collected.shape
    # One column per `column(...)` declared in the strategy above.
    assert column_count == 5

In [None]:
import polars as pl
from polars.testing.parametric import column, dataframes, lists
import hypothesis.strategies as st
from hypothesis import given

@st.composite
def star_observation_pairs(draw: st.DrawFn):
    """Composite strategy that yields a list of sorted value pairs.

    Two values are drawn from the same ``lists(pl.UInt16, size=2)`` strategy,
    zipped together element by element, and each resulting pair is sorted.

    NOTE(review): if the installed Polars version lets ``lists`` produce nulls,
    ``sorted`` would fail on a pair that mixes ``None`` with an int; confirm.
    """
    pair_source = lists(pl.UInt16, size=2)
    first_draw = draw(pair_source)
    second_draw = draw(pair_source)
    return [sorted(pair) for pair in zip(first_draw, second_draw)]

@given(
    dataframes(
        cols=[
            column(range_name, strategy=star_observation_pairs())
            for range_name in ("temperature_range", "velocity_range", "magnitude_range")
        ],
        min_size=3,
        max_size=3,
    )
)
def test_star_range_lists(df: pl.DataFrame):
    """Property test: the generated frame contains all three ``*_range`` columns."""
    required = {"temperature_range", "velocity_range", "magnitude_range"}
    assert required.issubset(df.columns)

In [None]:
import polars as pl
from polars.testing import assert_frame_equal

def transform_pipeline(df: pl.DataFrame) -> pl.DataFrame:
    """Average ``velocity`` per ``spectral_type`` for rows with ``temperature`` > 20000.

    Args:
        df: Frame with the columns ``temperature``, ``velocity`` and
            ``spectral_type``.

    Returns:
        One row per remaining ``spectral_type`` with its mean velocity in
        ``avg_velocity``, sorted ascending by ``avg_velocity``.
    """
    above_threshold = df.filter(pl.col("temperature") > 20000)
    mean_velocity = pl.col("velocity").mean().alias("avg_velocity")
    per_type = above_threshold.group_by("spectral_type").agg(mean_velocity)
    return per_type.sort("avg_velocity")

def test_transform_pipeline():
    """Check ``transform_pipeline`` against a small hand-computed example.

    Only the two ``"O"`` rows have ``temperature`` above 20000, so the expected
    output is a single ``"O"`` row whose ``avg_velocity`` is the mean of 70.0
    and 120.0.

    Raises:
        AssertionError: If the pipeline output differs from the expected frame.
    """
    input_df = pl.DataFrame({
        "temperature": [15000, 22000, 30000, 18000],
        "velocity": [50.0, 70.0, 120.0, 40.0],
        "spectral_type": ["G", "O", "O", "K"]
    })

    expected_df = pl.DataFrame({
        "spectral_type": ["O"],
        "avg_velocity": [95.0]
    })

    result_df = transform_pipeline(input_df)
    # assert_frame_equal signals failure by raising and returns nothing, so
    # wrapping it in print() only ever emitted "None".
    assert_frame_equal(result_df, expected_df)
