In [None]:
""" MERGING DATAFRAMES """

In [15]:
import pandas as pd
import numpy as np

# Sample DataFrames
orders = pd.DataFrame({
    "order_id": [1, 2, 3],
    "customer_id": [101, 102, 103],
    "amount": [50, 100, 75]
})

customers = pd.DataFrame({
    "customer_id": [101, 102, 104],
    "name": ["Alice", "Bob", "Charlie"],
    "email": ["a@test.com", "b@test.com", "c@test.com"]
})

# Types of Joins

# Inner Join (default)
pd.merge(orders, customers, on="customer_id")

# Left Join (keep all orders)
pd.merge(orders, customers, on="customer_id", how="left")

# Right join (keep all customers)
pd.merge(orders, customers, on="customer_id", how="right")

# Outer join (keep all records)
pd.merge(orders, customers, on="customer_id", how="outer")



Unnamed: 0,order_id,customer_id,amount,name,email
0,1.0,101,50.0,Alice,a@test.com
1,2.0,102,100.0,Bob,b@test.com
2,3.0,103,75.0,,
3,,104,,Charlie,c@test.com


In [None]:
""" ADVANCED DATETIME OPERATIONS """

In [None]:
# TIME-BASED GROUPING

# Create date range with time
date_rng = pd.date_range(start="2023-01-01", end="2023-12-31", freq="D")
df = pd.DataFrame(date_rng, columns=["date"])
df["value"] = np.random.randint(1, 100, size=(len(date_rng)))

# Resample to monthly sums
monthly = df.set_index("date").resample("ME").sum()

# Extract datetime components
df["year"] = df["date"].dt.year
df["quarter"] = df["date"].dt.quarter
df["day_name"] = df["date"].dt.day_name()

# TIME DIFFERENCE
df["days_since"] = (pd.to_datetime("today") - df["date"]).dt.days

In [None]:
""" MEMORY OPTIMIZATION """

In [24]:
# REDUCE DATE SIZE

# Before Optimization
print(df.memory_usage(deep=True))

# Downcast Numeric Columns
df["value"] = pd.to_numeric(df["value"], downcast="integer")

# Convert to categorical
df["day_name"] = df["day_name"].astype("category")

# After Optimization
print(df.memory_usage(deep=True))

Index           132
date           2920
value          2920
year           1460
quarter        1460
day_name      20491
days_since     2920
dtype: int64
Index          132
date          2920
value          365
year          1460
quarter       1460
day_name      1058
days_since    2920
dtype: int64


In [None]:
""" ADVANCE FILTERING with QUERY() """

In [None]:
# Traditional Filtering
df[(df["value"] > 50) & (df["day_name"].isin(["Monday", "Friday"]))]

# Using query() - more readable
df.query("value > 50 and day_name in ['Monday', 'Friday']")

In [None]:
""" HANDLING LARGE DATASETS """

In [None]:
# CHUNK PROCESSING

# Process CSV in chunks
chunk_size = 10_000
for chunk in pd.read_csv("large_file.csv", chunksize=chunk_size):
    process(chunk) # Your Custom Function

In [None]:
# Dask Alternative
import dask.dataframe as dd
ddf = dd.read_csv("very_large_file.csv")
result = ddf.groupby("category").sum().compute()

In [None]:
# PRACTICE PROJECT: Sales Analysis Pro

In [60]:
# MAIN SALES DATA
sales = pd.DataFrame({
    "Date": pd.date_range("2023-01-01", periods=10),
    "Product": ["Laptop", "Phone", "Tablet", "Laptop", "Phone", 
                "Tablet", "Laptop", "Phone", "Tablet", "Laptop"],
    "Price": [999, 699, 399, 1099, 799, 449, 899, 749, 499, 1299],
    "Units": [5, 8, 12, 4, 9, 7, 6, 10, 5, 3],
    "Region": ["West", "East", "North", "South", "East", 
               "West", "North", "South", "East", "West"]
})
sales["Total"] = sales["Price"] * sales["Units"]  # Revenue calculation

# DISCOUNT DATA
discounts = pd.DataFrame({
    "Product": ["Laptop", "Phone", "Tablet"],
    "Discount": [0.1, 0.15, 0.05]  # 10%, 15%, 5% discounts
})

# RETURNS DATA
returns = pd.DataFrame({
    "Date": pd.to_datetime(["2023-01-05", "2023-01-12"]),  # Matching sales dates
    "Product": ["Laptop", "Phone"],
    "Units": [1, 2]  # Returned quantities
})

# Task 1: Merge df.discounts into df.returns data
merged_data = pd.merge(sales, discounts, on="Product", how="left")

# Task: 2: Calculate net revenue after discounts
merged_data["Net_Revenue"] = merged_data["Total"] * (1 - merged_data["Discount"])

# Task 3: Substract returned items from sales
 # Step 1: Calculate return amounts (requires price lookup)
returns_with_price = pd.merge(returns, sales[["Date","Product", "Price"]],
                               on=["Date", "Product"],
                               how="left")
returns_with_price["Return_Amount"] = returns_with_price["Price"] * returns_with_price["Units"]
 
 # Step 2: Aggregate returns by product
product_returns = returns_with_price.groupby("Product")["Return_Amount"].sum()
 
 # Step 3: Substract from net revenue
final_revenue = merged_data.groupby("Product")["Net_Revenue"].sum() - product_returns
print("\nFINAL REVENUE (AFTER DISCOUNTS & RETURNS):")
print(final_revenue.round(2))
 
# Task 4: Weekly revenue trends
weekly_revenue = merged_data.set_index("Date").resample("W")["Net_Revenue"].sum()
print(weekly_revenue)


FINAL REVENUE (AFTER DISCOUNTS & RETURNS):
Product
Laptop    16813.80
Phone     17232.05
Tablet         NaN
dtype: float64
Date
2023-01-01     4495.50
2023-01-08    33577.50
2023-01-15     5877.55
Freq: W-SUN, Name: Net_Revenue, dtype: float64
