In [None]:
import polars as pl

# Load the JSON document into a Polars DataFrame
trades_json_path = 'trades.json'
df = pl.read_json(trades_json_path)

# Preview the leading rows (head() defaults to 5)
print(df.head(5))

In [None]:
import polars as pl

# Load the newline-delimited JSON file into a Polars DataFrame
trades_ndjson_path = 'tradesnd.json'
df = pl.read_ndjson(trades_ndjson_path)

# Preview the leading rows (head() defaults to 5)
print(df.head(5))

In [None]:
import polars as pl
import os

# Location of the Parquet dataset
file_path = 'credit_card_transactions.parquet'

# Load the Parquet file into a Polars DataFrame
df = pl.read_parquet(file_path)

# Size of the file on disk, in bytes
file_size = os.path.getsize(file_path)
file_size_mb = file_size / (1024 * 1024)

# Report a preview, the row count and the on-disk size in MB
print(df.head())
print("row count is = ", df.height)
print(f"File size: {file_size_mb:.2f} MB")

In [None]:
# Row count of the frame currently bound to df (shape is (rows, columns))
print(df.shape[0])

In [None]:
import polars as pl

# Location of the pipe-delimited CSV export
file_path = 'credit_card_transactions.csv'

# Load the CSV into a Polars DataFrame, skipping the first four lines of the file
df = pl.read_csv(
    file_path,
    separator='|',
    skip_rows=4,
)

# Preview the leading rows (head() defaults to 5)
print(df.head(5))

In [None]:
import polars as pl
# Load worksheet "Sheet1" of the workbook into a Polars DataFrame and preview it
excel_path = "Excel Pillar Data.xlsx"
df = pl.read_excel(excel_path, sheet_name="Sheet1")
print(df.head(5))

In [None]:
import polars as pl
import random
from datetime import datetime, timedelta

# Function to generate random traffic data
def generate_traffic_data(rows, seed=None, reference_time=None):
    """Build a Polars DataFrame of randomly generated traffic observations.

    Args:
        rows: Number of rows to generate.
        seed: Optional seed. When given, values are drawn from a private
            ``random.Random(seed)`` so repeated calls yield the same data.
            When omitted, the module-level ``random`` generator is used,
            exactly as before.
        reference_time: Optional ``datetime`` that timestamps are offset
            back from. Defaults to ``datetime.now()``, read once per call.

    Returns:
        A ``pl.DataFrame`` with the columns "Timestamp", "Traffic Volume",
        "Road Condition", "Speed Limit" and "Vehicle Type".
    """
    vehicle_types = ["Car", "Bus", "Motorbike", "Truck", "Auto"]
    road_conditions = ["Clear", "Congested", "Accident", "Construction", "Heavy Rain"]
    speed_limits = [40, 50, 60, 70, 80]  # km/h

    # The random module exposes the same randint/choice API as a Random
    # instance, so the unseeded path keeps using the global generator.
    rng = random if seed is None else random.Random(seed)

    # Read the clock once so every row is offset from the same instant;
    # calling datetime.now() per row lets the base time drift between rows.
    now = datetime.now() if reference_time is None else reference_time

    # Columns are generated one after another (not row by row) so the order
    # in which random numbers are consumed matches the original code.
    data = {
        "Timestamp": [now - timedelta(minutes=rng.randint(1, 1000)) for _ in range(rows)],
        "Traffic Volume": [rng.randint(50, 3000) for _ in range(rows)],  # Vehicles per hour
        "Road Condition": [rng.choice(road_conditions) for _ in range(rows)],
        "Speed Limit": [rng.choice(speed_limits) for _ in range(rows)],
        "Vehicle Type": [rng.choice(vehicle_types) for _ in range(rows)],
    }

    return pl.DataFrame(data)

# Generate a 100-row traffic sample and preview the leading rows
df = generate_traffic_data(rows=100)
print(df.head(5))

In [None]:
# Export to CSV
csv_path = "bangalore_traffic_data.csv"
df.write_csv(csv_path)
print("CSV file saved.")

# Export to JSON
json_path = "bangalore_traffic_data.json"
df.write_json(json_path)
print("JSON file saved.")

# Export to NDJSON (one JSON object per line)
ndjson_path = "bangalore_traffic_data.ndjson"
df.write_ndjson(ndjson_path)
print("NDJSON file saved.")


In [None]:
# Export to an Excel workbook; the call stays the last expression of the cell
df.write_excel(workbook="bangalore_traffic_data.xlsx")

In [None]:
import polars as pl
import random
from datetime import datetime

# Sample electronics-store sales: ten transactions with fixed products and
# store locations, random quantities/prices, and today's date as a string.
products = [
    "Laptop", "Phone", "Tablet", "Laptop", "Headphones",
    "Phone", "Tablet", "Laptop", "Phone", "Headphones",
]
store_locations = [
    "Bangalore", "Chennai", "Mumbai", "Bangalore", "Delhi",
    "Chennai", "Mumbai", "Bangalore", "Delhi", "Chennai",
]

data = {
    "Transaction ID": [f"TXN{i:05d}" for i in range(1, 11)],  # TXN00001 .. TXN00010
    "Product": products,
    "Quantity": [random.randint(1, 5) for _ in range(10)],
    "Price per Unit": [random.randint(500, 1500) for _ in range(10)],
    "Date of Sale": [datetime.now().strftime("%Y-%m-%d") for _ in range(10)],
    "Store Location": store_locations,
}

df = pl.DataFrame(data)
print("Original DataFrame:")
print(df)


In [None]:
# Pull one column out of the frame as a Series
price_column = df.get_column("Price per Unit")
print("Selected 'Price per Unit' column:")
print(price_column)

# Keep a subset of columns; the result is a new DataFrame
selected_columns = df.select("Product", "Quantity", "Price per Unit")
print("Selected multiple columns (Product, Quantity, Price per Unit):")
print(selected_columns)


In [None]:
# Rename selected columns via an {old name: new name} mapping
column_renames = {
    "Transaction ID": "TransactionID",
    "Price per Unit": "Unit Price",
}
df_renamed = df.rename(column_renames)
print("DataFrame with renamed columns:")
print(df_renamed)


In [None]:
# Derive "Total Sales" as Quantity * Price per Unit using column expressions
total_sales_expr = (pl.col("Quantity") * pl.col("Price per Unit")).alias("Total Sales")
df_with_total = df.with_columns(total_sales_expr)
print("DataFrame with 'Total Sales' column:")
print(df_with_total)

In [None]:
# Keep only the rows whose Quantity exceeds 2
filtered_df = df.filter(pl.col("Quantity") > 2)
print("Filtered DataFrame where Quantity > 2:")
print(filtered_df)


In [None]:
# Sorting by 'Quantity' in descending order (largest quantities first).
# The variable name and the printed label both promise a descending sort,
# so the sort direction is requested explicitly instead of relying on the
# ascending default.
sorted_df_desc = df.sort("Quantity", descending=True)
print("DataFrame sorted by 'Quantity' in descending order:")
print(sorted_df_desc)

# Multi-key sort: order by Product first, then by Quantity within each product
sorted_df_multiple = df.sort("Product", "Quantity")
print("DataFrame sorted by 'Product' and 'Quantity':")
print(sorted_df_multiple)


In [None]:
# Derive "Total Sales" as Quantity * Price per Unit
total_sales = pl.col("Quantity") * pl.col("Price per Unit")
df_with_total_sales = df.with_columns(total_sales.alias("Total Sales"))
print(df_with_total_sales)

# Add a boolean "High Value" flag: True where Total Sales is above 3000
is_high_value = pl.col("Total Sales").gt(3000)
df_with_value_category = df_with_total_sales.with_columns(is_high_value.alias("High Value"))
print(df_with_value_category)


In [None]:
# Summary statistics as produced by DataFrame.describe()
summary = df.describe()
print(summary)

# Group-wise aggregation per product: total sales, total quantity sold and
# average unit price. maintain_order=True keeps the groups in order of first
# appearance, so the printed table is the same on every run (the default
# group order is not guaranteed).
grouped = df_with_value_category.group_by("Product", maintain_order=True).agg(
    [
        pl.col("Total Sales").sum().alias("Total Sales Per Product"),
        pl.col("Quantity").sum().alias("Total Quantity Sold"),
        pl.col("Price per Unit").mean().alias("Average Price per Unit"),
    ]
)
print(grouped)


In [None]:
# Build a single all-null row. Column names and dtypes are taken from df so
# the row always matches the frame it is stacked onto; columns built from a
# bare [None] carry no string/integer type of their own.
null_row = pl.DataFrame(
    {column: [None] for column in df.columns},
    schema=df.schema,
)

# Append the null row to the existing dataframe.
# NOTE: this rebinds df, so re-running the cell appends one more null row.
df = df.vstack(null_row)
print("dataframe with null")
print(df)

# Identifying missing data: rows where 'Quantity' is null
missing_quantity = df.filter(pl.col("Quantity").is_null())
print(missing_quantity)

# Handling missing values: fill with a per-column default
# (0 for 'Quantity', 1000 for 'Price per Unit'); other columns keep their nulls.
df_filled = df.with_columns(
    [
        pl.col("Quantity").fill_null(0),
        pl.col("Price per Unit").fill_null(1000),
    ]
)
print(df_filled)

# Drop rows with missing values (optional)
df_dropped = df.drop_nulls()
print(df_dropped)


In [None]:
# Remove duplicate rows based on all columns.
# maintain_order=True keeps the surviving rows in their original order so the
# output is the same on every run.
df_no_duplicates = df.unique(maintain_order=True)
print("unique rows with all columns for selection")
print(df_no_duplicates)

# Remove duplicate rows based on specific columns ('Product' and 'Store Location').
# keep="first" makes the surviving row explicit: the first occurrence of each
# (Product, Store Location) pair, rather than an arbitrary one.
df_no_duplicates_specific = df.unique(
    subset=["Product", "Store Location"],
    keep="first",
    maintain_order=True,
)
print("unique rows with two columns for selection")
print(df_no_duplicates_specific)


In [None]:
# Normalise column names to snake_case: lower-case, spaces become underscores
snake_case_names = {}
for original_name in df.columns:
    snake_case_names[original_name] = original_name.lower().replace(" ", "_")

df_standardized_columns = df.rename(snake_case_names)
print(df_standardized_columns)

In [None]:
# Clean the Product strings in three steps, applied in this order:
#   1. trim leading/trailing whitespace
#   2. convert to lowercase
#   3. collapse each run of whitespace into a single space
cleaned_product = (
    pl.col("Product")
    .str.strip_chars()
    .str.to_lowercase()
    .str.replace_all(r"\s+", " ")
    .alias("Cleaned Product")
)
df_cleaned_strings = df.with_columns(cleaned_product)
print("cleaned column")
print(df_cleaned_strings)

# Split each Product string on single spaces into a list of words.
# The result is stored as a new column; the frame keeps one row per product.
product_words = pl.col("Product").str.split(" ").alias("Exploded Product")
df_exploded = df.with_columns(product_words)
print("exploded column")
print(df_exploded)

# Extract a substring using regular expressions: capture group 1 of r"(\w)"
# is the first word character of each Product value. The column is named
# "First Char" because a single character is captured, not a whole word.
df_regex_extracted = df.with_columns([
    pl.col("Product").str.extract(r"(\w)", 1).alias("First Char")
])
print("Regex Extraction")
print(df_regex_extracted)
