In [None]:
import polars as pl
import altair as alt
import pandas as pd

# Step 1: Assemble a small synthetic botany dataset.
# Every keyword below becomes one column; position i across the lists
# describes the same observation (ten observations in total).
data = pl.DataFrame(
    dict(
        species=[
            "Oak", "Cactus", "Lotus", "Pine", "Cactus",
            "Lotus", "Oak", "Pine", "Oak", "Cactus",
        ],
        leaf_area=[
            120.5, 30.1, 85.7, 95.3, 28.9,
            88.0, 122.0, 97.5, 115.0, 31.2,
        ],
        petal_length=[
            5.3, 1.2, 6.0, 3.5, 1.0,
            6.2, 5.1, 3.8, 5.7, 1.1,
        ],
        stem_diameter=[
            2.4, 1.5, 2.1, 2.3, 1.4,
            2.0, 2.5, 2.2, 2.6, 1.3,
        ],
        moisture_content=[
            45.0, 18.2, 70.0, 40.1, 17.9,
            72.5, 46.0, 42.3, 47.2, 19.0,
        ],
        habitat=[
            "Forest", "Desert", "Wetland", "Forest", "Desert",
            "Wetland", "Forest", "Forest", "Forest", "Desert",
        ],
    )
)

# Show the header line followed by the first rows of the frame.
print("=== Data Preview ===", data.head(), sep="\n")


In [None]:
# Overall descriptive statistics for every column of the dataset.
print("\n=== Summary Statistics ===")
print(data.describe())

# Per-species means of moisture content and stem diameter.
# group_by() does not guarantee the order of the groups by default, so the
# result is sorted on the key to keep the printed table identical between runs.
print("\n=== Average Moisture and Stem Diameter by Species ===")
agg = (
    data.group_by("species")
    .agg([
        pl.col("moisture_content").mean().alias("avg_moisture"),
        pl.col("stem_diameter").mean().alias("avg_stem_diameter"),
    ])
    .sort("species")
)
print(agg)


In [None]:
# Box plot of petal length for each species; extent='min-max' makes the
# whiskers span the full range of every group.
# NOTE(review): `data` is handed to Altair as-is (no pandas conversion happens
# in this cell) - confirm the installed Altair version accepts Polars frames.
petal_x = alt.X('species:N', title='Species')
petal_y = alt.Y('petal_length:Q', title='Petal Length')

petal_boxplot = (
    alt.Chart(data)
    .mark_boxplot(extent='min-max')
    .encode(x=petal_x, y=petal_y)
    .properties(
        title='Petal Length Distribution by Species',
        width=500,
        height=300,
    )
)

# Last expression of the cell, so the notebook renders the chart.
petal_boxplot


In [None]:
# Interactive scatter plot of leaf area against moisture content, with one
# colour per habitat and a tooltip listing the key fields of each point.
scatter_tooltip = ['species', 'leaf_area', 'moisture_content', 'habitat']
scatter_x = alt.X('leaf_area:Q', title='Leaf Area')
scatter_y = alt.Y('moisture_content:Q', title='Moisture Content')

leaf_vs_moisture = (
    alt.Chart(data)
    .mark_circle(size=100)
    .encode(
        x=scatter_x,
        y=scatter_y,
        color='habitat:N',
        tooltip=scatter_tooltip,
    )
    .properties(
        title='Leaf Area vs Moisture Content by Habitat',
        width=500,
        height=300,
    )
    .interactive()
)

# Last expression of the cell, so the notebook renders the chart.
leaf_vs_moisture


In [None]:
# Data cleaning example: handle missing values, numbers stored as text, and timestamps

In [None]:
# Deliberately messy input showing three common problems:
#   * "leaf_area (cm2)": a missing value (None) and the placeholder 9999
#   * "moisture": numbers stored as text, with "NaN" and "" standing in for gaps
#   * "logged_time": timestamps stored as text, one of them empty
raw = pl.DataFrame({
    "leaf_area (cm2)": [120, None, 30, 9999, 85],
    "moisture": ["45", "NaN", "18.2", "", "70"],
    "logged_time": ["2024-06-01 08:00:00", "2024-06-01 09:00:00", "", "2024-06-01 11:00:00", "2024-06-01 12:00:00"]
})

# NOTE(review): 9999 is assumed to be a missing-value placeholder rather than a
# real measurement - confirm with whoever produced the data.
LEAF_AREA_SENTINEL = 9999

leaf_area_raw = pl.col("leaf_area (cm2)")

# Clean it
clean = (
    raw
    .with_columns([
        # Null out the placeholder first so it cannot inflate the mean used for
        # imputation, and cast to float so the imputed mean keeps its
        # fractional part instead of being forced into an integer column.
        pl.when(leaf_area_raw != LEAF_AREA_SENTINEL)
        .then(leaf_area_raw)
        .otherwise(None)
        .cast(pl.Float64)
        .fill_null(strategy="mean")
        .alias("leaf_area"),
        # Blank out "NaN", trim whitespace, then parse as float; anything that
        # cannot be parsed becomes null and is replaced by 0.0.
        pl.col("moisture").str.replace_all("NaN", "").str.strip_chars().cast(pl.Float64, strict=False).fill_null(0.0),
        # Non-strict parsing turns unparseable entries (such as "") into null
        # instead of raising; the original text column is kept alongside.
        pl.col("logged_time").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S", strict=False).alias("timestamp"),
    ])
    # The cleaned values now live in "leaf_area", so drop the raw column.
    .drop(["leaf_area (cm2)"])
)

print(clean)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import polars as pl

# Example Polars DataFrame: three numeric features plus a binary target.
df = pl.DataFrame({
    "leaf_area": [120, 30, 85, 95, 28, 88, 122, 97],
    "stem_diameter": [2.4, 1.5, 2.1, 2.3, 1.4, 2.0, 2.5, 2.2],
    "moisture_content": [45, 18, 70, 40, 17, 72, 46, 42],
    "drought_tolerant": [1, 1, 0, 0, 1, 0, 0, 0]  # binary target
})

# Fixed seed so the shuffled split (and everything derived from it) is the
# same on every Restart & Run All.
SEED = 42

# Train-test split: every column except the target is a feature.
X = df.drop("drought_tolerant")
y = df["drought_tolerant"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SEED
)

# Fit logistic regression directly with Polars DataFrame
model = LogisticRegression()
model.fit(X_train, y_train)
print(model.get_params())

# Use the held-out rows that the split set aside. With 8 rows and
# test_size=0.25 this is only two samples, so treat the number as a smoke
# test rather than a real performance estimate.
print(f"Test accuracy: {model.score(X_test, y_test):.2f}")

