In [None]:
import polars as pl 

# Read the electricity-consumption CSV (no date parsing is requested here).
df = pl.read_csv("electricity_consumption.csv")

# Preview the first ten rows.
first_rows = df.head(10)
print(first_rows)

In [None]:
import polars as pl

# Read the CSV, letting polars attempt to parse date columns.
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True)

# Mean consumption, one row per city.
mean_consumption = pl.col("Consumption (kWh)").mean().alias("Avg Consumption (kWh)")
average_consumption = df.group_by("City").agg(mean_consumption)

# Show the per-city averages.
print(average_consumption)


In [None]:
import polars as pl

# Load the dataset. No week column is derived here: this summary groups by
# city only, so a week number would be computed and never used.
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True)

# Per city: mean consumption and the highest peak-hours usage.
city_stats = df.group_by("City").agg([
    pl.col("Consumption (kWh)").mean().alias("Avg Consumption (kWh)"),
    pl.col("Peak Hours Usage (kWh)").max().alias("Max Peak Hour Usage (kWh)")
])

# Display the result
print(city_stats)


In [None]:
import polars as pl

# Read the CSV, letting polars attempt to parse date columns.
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True)

# NOTE(review): "째C" looks like a mis-decoded degree sign ("°C"); confirm the
# column name against the CSV header before changing any of these strings.
temperature = pl.col("Temperature (째C)")

# Per city: mean consumption, highest peak-hours usage, and temperature range.
city_stats = df.group_by("City").agg(
    pl.col("Consumption (kWh)").mean().alias("Avg Consumption (kWh)"),
    pl.col("Peak Hours Usage (kWh)").max().alias("Max Peak Hour Usage (kWh)"),
    temperature.min().alias("Min Temperature (째C)"),
    temperature.max().alias("Max Temperature (째C)"),
)

# Show the per-city summary.
print(city_stats)


In [None]:
import polars as pl

# Load dataset
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True)


def calculate_peak_hour_load_factor(df):
    """Compute the peak-hour load factor for one group of rows.

    The factor is total consumption divided by
    (number of distinct dates * maximum peak-hours usage).

    Args:
        df: DataFrame with the rows of a single group. It must contain the
            "Consumption (kWh)", "Peak Hours Usage (kWh)" and "Date" columns.
            When a "City" column is present, its first value labels the result.

    Returns:
        A one-row DataFrame with "City" (only if the input has it) and
        "Peak Hour Load Factor" as Float64. The factor is null when the total
        or the maximum peak usage is missing, or the maximum is not positive.
    """
    total_consumption = df["Consumption (kWh)"].sum()
    max_peak_hour_usage = df["Peak Hours Usage (kWh)"].max()
    num_days = df["Date"].n_unique()  # Count unique days in the data

    # max() yields None for an all-null column; check that before comparing.
    if (
        total_consumption is not None
        and max_peak_hour_usage is not None
        and max_peak_hour_usage > 0
    ):
        peak_hour_load_factor = total_consumption / (num_days * max_peak_hour_usage)
    else:
        peak_hour_load_factor = None

    # Pin the dtype so a null factor still stacks with the other groups' floats.
    columns = [
        pl.lit(peak_hour_load_factor, dtype=pl.Float64).alias("Peak Hour Load Factor")
    ]
    # Keep the group label: group_by does not guarantee group order, so the
    # factor alone could not be matched back to its city.
    if "City" in df.columns:
        columns.insert(0, pl.col("City").first())
    return df.select(columns)


# Apply the custom function to each city's rows using map_groups
city_stats = df.group_by("City").map_groups(calculate_peak_hour_load_factor)

# Display results
print(city_stats)


In [None]:
import polars as pl

# Read the CSV and tag every row with the week number of its date.
# NOTE(review): the week number carries no year, so if the data spans more
# than one year, same-numbered weeks end up in one group — confirm the range.
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True).with_columns(
    pl.col("Date").dt.week().alias("Week_Number")
)

# Per city and week: mean consumption and the highest peak-hours usage.
city_stats = df.group_by(["City", "Week_Number"]).agg(
    pl.col("Consumption (kWh)").mean().alias("Avg Consumption (kWh)"),
    pl.col("Peak Hours Usage (kWh)").max().alias("Max Peak Hour Usage (kWh)"),
)

# Show the weekly per-city summary.
print(city_stats)



In [None]:
# Lookup table mapping each city to its state.
states = pl.DataFrame({
    "City": [
        "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
        "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"
    ],
    "State": [
        "New York", "California", "Illinois", "Texas", "Arizona",
        "Pennsylvania", "Texas", "California", "Texas", "California"
    ]
})
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True)

# Perform an INNER JOIN on "City": only rows whose city appears in both
# frames are kept (this is not a full outer join).
merged_df = df.join(states, on="City", how="inner")

# Display the result
print(merged_df)


In [None]:
import polars as pl

# Read both CSV files and report how many rows each one holds.
df_2025 = pl.read_csv("electricity_consumption_2025.csv")
print("2025 dataset row count" , df_2025.height)
df_previous = pl.read_csv("electricity_consumption.csv")
print("2024 dataset row count" , df_previous.height)

# Stack the rows of the two frames, 2025 first.
frames_to_stack = [df_2025, df_previous]
concatenated_df = pl.concat(frames_to_stack)

# Report the row count of the combined frame.
print("concatenated df row count" , concatenated_df.height)



In [None]:
import polars as pl

# Three small frames that share the same "City" key.
city_names = ["New York", "Los Angeles", "Chicago"]
df1 = pl.DataFrame({"City": city_names, "Consumption": [5000, 4500, 6000]})
df2 = pl.DataFrame({"City": city_names, "Temperature": [30, 28, 25]})
df3 = pl.DataFrame({"City": city_names, "Peak Hours Usage": [2500, 2200, 2800]})

# Inner-join them one after another on "City".
merged_df = df1.join(df2, on="City", how="inner")
merged_df = merged_df.join(df3, on="City", how="inner")

# Show the combined frame.
print(merged_df)


In [None]:
import polars as pl

city_names = ["New York", "Los Angeles", "Chicago"]

# Readings for 2025-01-01 .. 2025-01-05, one column per day.
first_five_days = {
    "City": city_names,
    "2025-01-01": [500, 450, 600],
    "2025-01-02": [520, 460, 620],
    "2025-01-03": [530, 470, 630],
    "2025-01-04": [540, 480, 640],
    "2025-01-05": [550, 490, 650],
}
df1 = pl.DataFrame(first_five_days)

# Readings for 2025-01-06 .. 2025-01-10, same layout.
next_five_days = {
    "City": city_names,
    "2025-01-06": [560, 500, 660],
    "2025-01-07": [570, 510, 670],
    "2025-01-08": [580, 520, 680],
    "2025-01-09": [590, 530, 690],
    "2025-01-10": [600, 540, 700],
}
df2 = pl.DataFrame(next_five_days)

# Append the second frame's day columns to the first; its "City" column is
# dropped first so the key is not duplicated.
df2_day_columns = df2.drop("City")
hstacked_df = df1.hstack(df2_day_columns)

# Show the widened frame.
print(hstacked_df)


In [None]:
import polars as pl

# Read the CSV, letting polars attempt to parse date columns.
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True)

# Reshape to wide form: one row per city, one column per date, cells hold consumption.
pivot_df = df.pivot(on="Date", index="City", values="Consumption (kWh)")

# Show the wide frame, then persist it as CSV.
print(pivot_df)
pivot_df.write_csv("pivoted_electricity_consumption.csv")


In [None]:


# Back to long form. `pivot_df` must already exist in the session; every
# column after the first ("City") is treated as a date column.
date_columns = pivot_df.columns[1:]
melted_df = pivot_df.unpivot(
    on=date_columns,                 # columns folded into rows
    index=["City"],                  # column kept as-is
    variable_name="Date",            # receives the former column names
    value_name="Consumption (kWh)",  # receives the cell values
)

# Show the long-form frame.
print(melted_df)



In [None]:
import polars as pl

# Read the CSV, letting polars attempt to parse date columns.
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True)

# The frame is sorted by "Date" before the dynamic grouping below.
df_sorted = df.sort("Date")

# Sum consumption inside consecutive one-week windows.
weekly_total = pl.col("Consumption (kWh)").sum()
weekly_consumption = df_sorted.group_by_dynamic("Date", every="1w").agg(weekly_total)

# Show the weekly totals.
print(weekly_consumption)


In [None]:
# Order rows chronologically, then interpolate the consumption column.
# with_columns replaces the column in place, so the row count is unchanged.
df_sorted = df.sort("Date")
interpolated_consumption = pl.col("Consumption (kWh)").interpolate()
df_upsampled = df_sorted.with_columns(interpolated_consumption)


In [None]:
# Display (rows, columns) of the interpolated frame.
df_upsampled.shape

In [None]:
# Display (rows, columns) of the date-sorted frame.
df_sorted.shape

In [None]:
import polars as pl
import numpy as np

# Load the dataset and parse dates
df = pl.read_csv("electricity_consumption.csv", try_parse_dates=True)
print("original dataframe", df.shape)

# Sort the DataFrame by Date before any operation
df_sorted = df.sort("Date")

# Pick row positions with a seeded generator so the same dates are removed on
# every run; the sample size is capped so short frames do not raise.
rng = np.random.default_rng(42)
sample_size = min(50, df_sorted.height)
random_indices = rng.choice(df_sorted.height, size=sample_size, replace=False)

# Drop every row whose Date matches one of the sampled rows' dates
df_without_random_dates = df_sorted.filter(~pl.col("Date").is_in(df_sorted["Date"][random_indices]))

print("modified dataframe with random samples removed",df_without_random_dates)
# Upsample to a daily frequency per city, then forward-fill all columns
# (values are carried forward, not interpolated)
df_upsampled = df_without_random_dates.upsample(time_column="Date",every="1d",group_by ="City",maintain_order=True).select(pl.all().forward_fill())

# Display the upsampled data
print("upsampled dataframe" , df_upsampled)


In [None]:
import polars as pl
import numpy as np

# Small sample frame: one consumption reading per city.
sample_rows = {
    "City": ["New York", "Los Angeles", "Chicago", "Houston"],
    "Consumption (kWh)": [450, 600, 350, 500],
}
df = pl.DataFrame(sample_rows)

# Add a column holding the consumption multiplied by 1.2.
scaled_consumption = (pl.col("Consumption (kWh)") * 1.2).alias("Scaled Consumption")
df_transformed = df.with_columns(scaled_consumption)

print(df_transformed)


In [None]:
# Sample frame with a single "City" column.
df = pl.DataFrame({"City": ["New York", "Los Angeles", "Chicago", "Houston"]})

# City name -> state code lookup.
city_to_state = {
    "New York": "NY",
    "Los Angeles": "CA",
    "Chicago": "IL",
    "Houston": "TX"
}

# Look up each city element-wise with map_elements; cities missing from the
# dictionary are labelled "Unknown".
state_code = pl.col("City").map_elements(
    lambda city: city_to_state.get(city, "Unknown"),
    return_dtype=pl.Utf8,
).alias("State")
df_with_states = df.with_columns(state_code)

print(df_with_states)


In [None]:
# Sample frame with a single string "City" column.
df = pl.DataFrame({"City": ["New York", "Los Angeles", "Chicago", "Houston"]})
print(df.dtypes," <--- default data types")

# Add a categorical copy of "City" next to the original column.
city_as_category = pl.col("City").cast(pl.Categorical).alias("City Categorical")
df_categorized = df.with_columns(city_as_category)

print(df_categorized.dtypes," <--- Categorical data types")

# One-hot encode. Called without arguments, to_dummies() converts every
# column, so both "City" and "City Categorical" are expanded.
df_one_hot = df_categorized.to_dummies()

print(df_one_hot)


In [None]:
import polars as pl

# Sample stock prices on irregularly spaced dates.
price_rows = {
    "Date": [
        "2024-01-01", "2024-01-03", "2024-01-05", "2024-01-06", "2024-01-08",
        "2024-01-10", "2024-01-12", "2024-01-15", "2024-01-17", "2024-01-20"
    ],
    "Stock_Price": [100, 102, 98, 105, 110, 108, 112, 115, 118, 120],
}

# Parse the date strings into a Date column.
df = pl.DataFrame(price_rows).with_columns(pl.col("Date").str.to_date())

# Rolling mean over a window of 3 rows. The window counts observations, not
# calendar days, so with these gaps it is not a strict 3-day average even
# though the output column is named "Rolling_Avg_3d".
rolling_avg = pl.col("Stock_Price").rolling_mean(window_size=3).alias("Rolling_Avg_3d")
df = df.with_columns(rolling_avg)

print(df)


In [None]:
import polars as pl

# Ten consecutive days of stock prices, with an explicit schema.
df = pl.DataFrame({
    "Date": ["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04", "2024-01-05",
             "2024-01-06", "2024-01-07", "2024-01-08", "2024-01-09", "2024-01-10"],
    "Stock_Price": [100, 102, 98, 105, 110, 108, 112, 115, 118, 120]
},schema={"Date": pl.Date, "Stock_Price": pl.Int32})

# 3-day rolling mean: Expr.rolling() builds the window from the "Date"
# column with period="3d" instead of counting rows.
rolling_avg = pl.mean("Stock_Price").rolling(index_column="Date", period="3d").alias("Rolling_Avg_3d")
df = df.with_columns(rolling_avg)

print(df)



In [None]:
import polars as pl


# Sample car-production figures per date (dates are kept as strings).
production_rows = {
    "Date": [
        "2024-01-01", "2024-01-03", "2024-01-05", "2024-01-06", "2024-01-08",
        "2024-01-10", "2024-01-12", "2024-01-15", "2024-01-17", "2024-01-20"
    ],
    "Cars_Produced": [200, 250, 180, 300, 320, 400, 350, 370, 390, 420],
}
df = pl.DataFrame(production_rows)

# Running total of cars produced, in row order.
running_total = pl.col("Cars_Produced").cum_sum().alias("Total_Cars_Produced")
df = df.with_columns(running_total)

print(df)


In [None]:
import polars as pl

# Sample patient visits; the visit dates are parsed from strings.
visit_rows = {
    "Hospital": ["H1", "H1", "H1", "H2", "H2", "H2"],
    "Patient_ID": [101, 102, 103, 201, 202, 203],
    "Visit_Date": ["2024-02-01", "2024-02-03", "2024-02-05", "2024-02-02", "2024-02-04", "2024-02-06"],
}
df = pl.DataFrame(visit_rows).with_columns(pl.col("Visit_Date").str.to_date())

# Rank each visit date within its hospital, ordering the window by date.
visit_rank = (
    pl.col("Visit_Date")
    .rank()
    .over("Hospital", order_by="Visit_Date")
    .alias("Visit_Rank")
)
df = df.with_columns(visit_rank)

print(df)


In [None]:
import polars as pl

# Sample hourly power usage for two cities on a single day.
usage_rows = {
    "City": ["NY", "NY", "NY", "LA", "LA", "LA"],
    "Date": ["2024-03-01", "2024-03-01", "2024-03-01", "2024-03-01", "2024-03-01", "2024-03-01"],
    "Hour": [1, 2, 3, 1, 2, 3],
    "Power_Usage": [500, 600, 550, 700, 750, 720],
}
df = pl.DataFrame(usage_rows).with_columns(pl.col("Date").str.to_date())

# Attach to every row the maximum usage within its (City, Date) window.
peak_usage = pl.col("Power_Usage").max().over("City", "Date").alias("Peak_Hourly_Usage")
df = df.with_columns(peak_usage)

print(df)


In [None]:
import polars as pl
import numpy as np

# Build 50 synthetic cities with seeded random usage values in [100, 1500).
np.random.seed(42)  # For reproducibility
cities = [f"City_{i}" for i in range(1, 51)]
power_usage = np.random.randint(100, 1500, size=50)

# Overwrite the last value with an extreme reading to act as an outlier.
power_usage[-1] = 15000

df = pl.DataFrame({"City": cities, "Power_Usage": power_usage})

# Quartiles and interquartile range of the usage column.
usage = df["Power_Usage"]
Q1 = usage.quantile(0.25)
Q3 = usage.quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# A row is an outlier when it falls below the lower or above the upper fence.
# fill_null(False) keeps a null comparison flagged as False, matching a
# when/otherwise chain that falls through to False.
below_fence = pl.col("Power_Usage") < lower_bound
above_fence = pl.col("Power_Usage") > upper_bound
df = df.with_columns(
    (below_fence | above_fence).fill_null(False).alias("Is_Outlier")
)

print(df)


In [None]:
import polars as pl

# Parse timestamp strings into a Datetime column named "Date".
timestamps = ["2024-01-01 12:30:00", "2024-01-02 14:45:00", "2024-01-03 16:15:00"]
df = pl.DataFrame({"Timestamp": timestamps}).with_columns(
    pl.col("Timestamp").str.strptime(pl.Datetime).alias("Date")
)

# Break the datetime into year, month, day and hour columns.
parsed = pl.col("Date")
df = df.with_columns(
    Year=parsed.dt.year(),
    Month=parsed.dt.month(),
    Day=parsed.dt.day(),
    Hour=parsed.dt.hour(),
)

print(df)


In [None]:
import polars as pl

# Parse timestamps that carry a +05:30 offset; the format string includes %z.
offset_timestamps = [
    "2024-01-01 12:30:00+05:30",
    "2024-01-02 14:45:00+05:30",
    "2024-01-03 16:15:00+05:30",
]
parse_with_offset = pl.col("Timestamp").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S%z").alias("Date")
df = pl.DataFrame({"Timestamp": offset_timestamps}).with_columns(parse_with_offset)

# Add a column with the same instants expressed in UTC.
date_in_utc = pl.col("Date").dt.convert_time_zone("UTC").alias("Date_UTC")
df = df.with_columns(date_in_utc)

print(df)


In [None]:
import polars as pl

# Sample ages to be binned.
df = pl.DataFrame({"Age": [18, 25, 40, 55, 60, 72, 85, 90, 32, 41]})

# Break points separating the age ranges.
bins = [0, 20, 40, 60, 80, 100]

# Assign each age to the bin delimited by those break points.
age_bucket = pl.col("Age").cut(breaks=bins).alias("Age_Bucket")
df = df.with_columns(age_bucket)

print(df)


In [None]:
# Sample salaries, split at the 25% and 75% quantiles into three labelled bands.
df = pl.DataFrame({"salary": [30000, 45000, 60000, 75000, 100000, 120000]})
salary_band = pl.col("salary").qcut(
    quantiles=[0.25, 0.75],
    labels=["low", "med", "high"],
).alias("qcut")
# The last expression is the cell's displayed output; `df` itself is not reassigned.
df.with_columns(salary_band)


In [None]:
import polars as pl

# Sample frame with a struct column ("Weather") and a list column
# ("Hourly_Temperature").
nested_rows = {
    "City": ["New York", "Los Angeles", "Chicago"],
    "Weather": [
        {"Temperature": 25, "Humidity": 65},
        {"Temperature": 28, "Humidity": 70},
        {"Temperature": 20, "Humidity": 75}
    ],
    "Hourly_Temperature": [
        [22, 23, 24, 25],
        [27, 28, 29, 30],
        [19, 20, 21, 22]
    ],
}
df = pl.DataFrame(nested_rows)
print("Original DF")
print(df)

# Copy the struct's fields into top-level columns; "Weather" itself is kept.
weather = pl.col("Weather").struct
df_flattened = df.with_columns(
    weather.field("Temperature").alias("Temperature"),
    weather.field("Humidity").alias("Humidity"),
)

# Turn each list element of "Hourly_Temperature" into its own row.
df_exploded = df_flattened.explode("Hourly_Temperature")

# Show the flattened frame.
print("flattened DF")
print(df_exploded)
