In [None]:
# Install polars and visualization libraries
!pip install polars matplotlib seaborn

# Polars Tutorial - 5-Minute Guide

**Polars** is a lightning-fast DataFrame library built in Rust with Python bindings. It offers:
- **Speed**: 10-100x faster than pandas for many operations
- **Memory efficiency**: Lower memory usage, especially when queries are built with lazy evaluation
- **Expressive API**: Clean, consistent syntax
- **Parallel processing**: Built-in parallelization

Perfect for large datasets, ETL pipelines, and performance-critical data processing.

In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns

# Read the customer CSV eagerly into an in-memory Polars DataFrame
df = pl.read_csv("synthetic_customers_120k.csv")

# Other readers follow the same pattern (left commented as a reference):
#   df = pl.read_json('data.json')        # JSON files
#   df = pl.read_parquet('data.parquet')  # Parquet files (recommended for speed)
#   df = pl.read_excel('data.xlsx')       # Excel files
#
# A DataFrame can also be built straight from a dictionary:
#   data = {'name': ['Alice', 'Bob'], 'age': [25, 30]}
#   df = pl.DataFrame(data)

# First look at the data: dimensions, column names, then a small preview
print("Dataset shape:", df.shape)
print("\nColumn names:", df.columns, sep="\n")
print("\nFirst 3 rows:")
df.head(3)  # last expression -> rendered as the cell output

In [None]:
# Overview: list the dtype of every column, then show summary statistics
print("Data types:", df.dtypes, sep="\n")
print("\nSummary statistics:")
df.describe()  # last expression -> rendered as the cell output

## 📊 Data Visualization

In [None]:
# Convert to pandas so the pandas/matplotlib plotting helpers below can be used
df_pd = df.to_pandas()

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Distribution of purchase values
df_pd['avg_purchase_value'].hist(bins=50, ax=axes[0,0])
axes[0,0].set_title('Purchase Value Distribution')
axes[0,0].set_xlabel('Average Purchase Value')
axes[0,0].set_ylabel('Frequency')

# Customers by state (top 10).
# group_by() returns groups in arbitrary order and sorting on the count alone
# leaves ties unresolved, so a secondary sort on the state keeps the selected
# top 10 (and the bar order) identical from run to run.
state_counts = (
    df.group_by('state')
    .len()
    .sort(['len', 'state'], descending=[True, False])
    .head(10)
)
state_counts_pd = state_counts.to_pandas()
axes[0,1].bar(state_counts_pd['state'], state_counts_pd['len'])
axes[0,1].set_title('Top 10 States by Customers')
axes[0,1].set_xlabel('State')
axes[0,1].set_ylabel('Customers')
axes[0,1].tick_params(axis='x', rotation=45)

# Scatter: purchases vs spending
axes[1,0].scatter(df_pd['num_purchases'], df_pd['total_spent'], alpha=0.5)
axes[1,0].set_xlabel('Number of Purchases')
axes[1,0].set_ylabel('Total Spent')
axes[1,0].set_title('Purchases vs Spending')

# Category preferences.
# Sorted for the same reason as above: without it the wedge order (and hence
# the colour assigned to each category) could differ between runs.
category_counts = (
    df.group_by('preferred_category')
    .len()
    .sort(['len', 'preferred_category'], descending=[True, False])
)
category_counts_pd = category_counts.to_pandas()
axes[1,1].pie(category_counts_pd['len'], labels=category_counts_pd['preferred_category'])
axes[1,1].set_title('Category Preferences')

plt.tight_layout()
plt.show()

## 🔍 Data Filtering & Selection

In [None]:
# Predicates are ordinary Polars expressions, so they can be named and reused
in_california = pl.col('state') == 'CA'
california_customers = df.filter(in_california)
print(f"California customers: {len(california_customers)}")

# Combine several conditions with & (both must hold for a row to be kept)
spent_over_1000 = pl.col('total_spent') > 1000
is_active = pl.col('account_status') == 'Active'
high_value = df.filter(spent_over_1000 & is_active)
print(f"High-value active customers: {len(high_value)}")

# Keep only a handful of columns and preview the first five rows
customer_summary = df.select(
    'first_name',
    'last_name',
    'state',
    'total_spent',
).head(5)
print("\nCustomer summary:")
customer_summary  # last expression -> rendered as the cell output

## 📈 GroupBy Analysis

In [None]:
# Per-state metrics. Each keyword argument names one output column, and the
# columns appear in the order the keywords are written.
state_stats = df.group_by('state').agg(
    customer_count=pl.col('customer_id').len(),
    avg_spent=pl.col('total_spent').mean().round(2),
    total_revenue=pl.col('total_spent').sum().round(2),
    avg_purchase_value=pl.col('avg_purchase_value').mean().round(2),
    avg_loyalty=pl.col('loyalty_points').mean().round(2),
)

print("Top 10 states by total revenue:")
top_states_by_revenue = state_stats.sort('total_revenue', descending=True)
top_states_by_revenue.head(10)  # last expression -> rendered as the cell output

In [None]:
# Per-category metrics: group size plus two-decimal means of spend, purchase
# count and loyalty points. Keyword names become the output column names.
category_analysis = df.group_by('preferred_category').agg(
    customers=pl.col('customer_id').len(),
    avg_spent=pl.col('total_spent').mean().round(2),
    avg_purchases=pl.col('num_purchases').mean().round(2),
    avg_loyalty=pl.col('loyalty_points').mean().round(2),
)

print("Category performance:")
# Highest average spend first; shown as the cell output
category_analysis.sort('avg_spent', descending=True)

## 💾 Data Export

In [None]:
# Write the filtered high-value customers (built in the filtering cell) to CSV
high_value_csv = 'high_value_customers_polars.csv'
high_value.write_csv(high_value_csv)
print(f"Exported {len(high_value)} high-value customers to CSV")

# Write the per-state aggregates to Parquet (recommended for performance)
state_stats_parquet = 'state_analysis_polars.parquet'
state_stats.write_parquet(state_stats_parquet)
print("Exported state analysis to Parquet")

# Further writers, left commented as a reference:
#   df.write_json('data.json')      # JSON
#   df.write_excel('data.xlsx')     # Excel (requires xlsxwriter)
#   df.write_ndjson('data.ndjson')  # Newline-delimited JSON

## ⚡ Essential Operations

In [None]:
# Data cleaning - count the nulls in every column (one-row result)
print("Missing values per column:")
null_counts = df.select([
    pl.all().null_count()
])
print(null_counts)

# Create new calculated columns.
# purchase_efficiency: spend per purchase. Rows with zero purchases would
# otherwise divide by zero and produce inf/NaN, which distorts later
# aggregates, so they get null instead.
# is_premium: True when total_spent is above the 75th percentile of the column.
df_enhanced = df.with_columns([
    pl.when(pl.col('num_purchases') > 0)
    .then(pl.col('total_spent') / pl.col('num_purchases'))
    .otherwise(None)
    .alias('purchase_efficiency'),
    (pl.col('total_spent') > pl.col('total_spent').quantile(0.75)).alias('is_premium')
])

# Quick value counts: group, count rows per group, largest group first
print("\nAccount status distribution:")
print(df.group_by('account_status').len().sort('len', descending=True))

print("\nTop 5 cities:")
print(df.group_by('city').len().sort('len', descending=True).head(5))

In [None]:
# Pairwise correlations between the numeric customer metrics.
# The selection is converted to pandas because DataFrame.corr() there returns
# a labelled square matrix that seaborn can annotate directly.
numeric_cols = ['num_purchases', 'avg_purchase_value', 'total_spent', 'loyalty_points']
correlation_data = df.select(numeric_cols).to_pandas().corr()

# Explicit figure/axes pair instead of the implicit pyplot "current axes"
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_data, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Feature Correlations')
plt.show()

print("Polars analysis complete! ⚡")

## 🚀 Polars Advantages Showcase

In [None]:
# Lazy evaluation: scan_csv() only describes the read, and every following
# step extends the query plan. Nothing is executed until collect() is called.
active_only = pl.col('account_status') == 'Active'

lazy_query = (
    pl.scan_csv('synthetic_customers_120k.csv')
    .filter(active_only)
    .group_by('state')
    .agg(
        revenue=pl.col('total_spent').sum(),
        customers=pl.col('customer_id').len(),
    )
    .sort('revenue', descending=True)
    .head(5)
)

# Show the plan first, then run the query and print its result
print("Lazy query plan:", lazy_query.explain(), sep="\n")
print("\nQuery results:")
print(lazy_query.collect())

In [None]:
# String expressions, built as named pieces before being selected together.
# domain: capture group 1 of the regex, i.e. everything after the first '@'.
email_domain = pl.col('email').str.extract(r'@(.+)', 1).alias('domain')
# full_name: first name, a literal space, last name.
full_name = pl.concat_str(
    [pl.col('first_name'), pl.lit(' '), pl.col('last_name')]
).alias('full_name')

email_analysis = df.select('email', email_domain, full_name).head(5)

print("String operations:")
print(email_analysis)

print("\nPolars is ready for high-performance data processing! 🔥")