# Chapter 5: Eager and Lazy APIs

In [None]:
import polars as pl
# Print version information for the installed Polars so results can be
# compared against the book, which is built with Polars version 1.13.1.
pl.show_versions()

## Eager API: DataFrame

In [None]:
%%time
trips = pl.read_parquet("data/taxi/yellow_tripdata_*.parquet")  
sum_per_vendor = trips.group_by("VendorID").sum()  

income_per_distance_per_vendor = sum_per_vendor.select(
    "VendorID",
    income_per_distance=pl.col("total_amount") / pl.col("trip_distance"),
)

top_three = income_per_distance_per_vendor.sort(  
    by="income_per_distance", descending=True
).head(3)

top_three

## Lazy API: LazyFrame

In [None]:
# This raises a SchemaError because `.str.slice` is a string operation and `age` is an integer column:
# names_lf = pl.LazyFrame({"name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]})

# erroneous_query = names_lf.with_columns(
#     sliced_age=pl.col("age").str.slice(1, 3)
# )

# result_df = erroneous_query.collect()

## Performance Differences

In [None]:
%%time
trips = pl.scan_parquet("data/taxi/yellow_tripdata_*.parquet")
sum_per_vendor = trips.group_by("VendorID").sum()

income_per_distance_per_vendor = sum_per_vendor.select(
    "VendorID",
    income_per_distance=pl.col("total_amount") / pl.col("trip_distance"),
)

top_three = income_per_distance_per_vendor.sort(
    by="income_per_distance", descending=True
).head(3)

top_three.collect()

In [None]:
lf = pl.LazyFrame(
    {
        "col1": [1, 2, 3],
        "col2": [4, 5, 6],
    }
)

# ... Some heavy computation ...

# collect() is called twice below: once on `lf` itself, and once on a query
# that adds 1 to col1 on top of `lf`.
print(lf.collect())

print(
    lf.with_columns(pl.col("col1") + 1).collect()
)

## Functionality Differences

### Attributes

### Aggregation Methods

### Computation Methods

### Descriptive Methods

### Group By Methods

### Exporting Methods

### Manipulation and Selection Methods

### Miscellaneous Methods

## Tips and Tricks

### Going from LazyFrame to DataFrame and Vice Versa

### Joining a DataFrame With a LazyFrame

In [None]:
# This raises a TypeError because a LazyFrame cannot be joined directly with a DataFrame:
# big_sales_data = pl.LazyFrame(
#     {"sale_id": [101, 102, 103], "amount": [250, 150, 300]}
# )
#
# sales_metadata = pl.DataFrame(
#     {"sale_id": [101, 102, 103], "category": ["A", "B", "A"]}
# )
#
# big_sales_data.join(sales_metadata, on="sale_id").collect()

In [None]:
# The sales records are held in a LazyFrame ...
big_sales_data = pl.LazyFrame(
    {
        "sale_id": [101, 102, 103],
        "amount": [250, 150, 300],
    }
)

# ... while the metadata is an eager DataFrame.
sales_metadata = pl.DataFrame(
    {
        "sale_id": [101, 102, 103],
        "category": ["A", "B", "A"],
    }
)

# Convert the DataFrame with .lazy() so both sides of the join are LazyFrames,
# then materialize the joined result.
big_sales_data.join(
    sales_metadata.lazy(),
    on="sale_id",
).collect()

### Caching Intermediate Results

In [None]:
lf = pl.LazyFrame(
    {
        "col1": [1, 2, 3],
        "col2": [4, 5, 6],
    }
)

# ... Some heavy computation ...

# Materialize the query once, then wrap the resulting DataFrame in a new
# LazyFrame; the queries below are built on that materialized data.
lf = lf.collect().lazy()
print(lf.collect())

print(
    lf.with_columns(pl.col("col1") + 1).collect()
)

## Takeaways