# Advanced Topics - Solutions

This notebook covers rolling and expanding windows, advanced MultiIndex operations, memory optimization, and the query() and eval() methods.

## Question 1
Create a time series DataFrame and apply a rolling window calculation (e.g., 3-period moving average).

In [None]:
import pandas as pd
import numpy as np

# Build a reproducible 20-day series: a seeded random walk shifted up by 100.
np.random.seed(42)
dates = pd.date_range('2023-01-01', periods=20, freq='D')
values = 100 + np.random.randn(20).cumsum()

# Assemble the frame and promote 'Date' to the index in a single expression.
ts_df = pd.DataFrame({'Date': dates, 'Price': values}).set_index('Date')

print("Original time series:")
print(ts_df.head(10))

# Rolling statistics: a window of k periods yields NaN for the first k-1 rows.
price = ts_df['Price']
three_period = price.rolling(window=3)  # shared by the 3-period mean and std
ts_df['MA_3'] = three_period.mean()
ts_df['MA_5'] = price.rolling(window=5).mean()
ts_df['Rolling_Std'] = three_period.std()

print("\nWith rolling statistics:")
print(ts_df.head(10))

## Question 2
Use an expanding() window to calculate cumulative statistics over time.

In [None]:
# Expanding window calculations: one expanding object over 'Price' feeds
# all four cumulative statistics.
growing_window = ts_df['Price'].expanding()
ts_df['Expanding_Mean'] = growing_window.mean()
ts_df['Expanding_Std'] = growing_window.std()
ts_df['Expanding_Min'] = growing_window.min()
ts_df['Expanding_Max'] = growing_window.max()

summary_columns = ['Price', 'Expanding_Mean', 'Expanding_Std', 'Expanding_Min', 'Expanding_Max']
print("Expanding window statistics:")
print(ts_df[summary_columns].head(10))

## Question 3
Create a DataFrame with MultiIndex and perform advanced indexing operations (cross-section, index slicing).

In [None]:
# Create MultiIndex DataFrame: the three levels are the full Cartesian
# product of Group x Category x Period, in that order.
indices = pd.MultiIndex.from_product(
    [['A', 'B'], ['X', 'Y'], [1, 2]],
    names=['Group', 'Category', 'Period'],
)

row_count = len(indices)
multi_df = pd.DataFrame(
    {
        'Value1': np.random.randn(row_count),
        'Value2': np.random.randn(row_count),
    },
    index=indices,
)

print("MultiIndex DataFrame:")
print(multi_df)

# Cross-section (xs): select one label on a named level.
group_a = multi_df.xs('A', level='Group')
print("\nCross-section for Group 'A':")
print(group_a)

category_x = multi_df.xs('X', level='Category')
print("\nCross-section for Category 'X':")
print(category_x)

# Index slicing: a tuple key addresses the leading levels in order.
print("\nSlicing Group 'A', Category 'X':")
print(multi_df.loc[('A', 'X'), :])

# Multiple level selection: IndexSlice builds the same (list, label) key.
idx = pd.IndexSlice
print("\nSelect multiple groups:")
print(multi_df.loc[idx[['A', 'B'], 'X'], :])

## Question 4
Use the query() method to filter data with complex conditions instead of boolean indexing.

In [None]:
# Create sample data: two float columns, one label column, one small-int column.
sample_size = 100
query_df = pd.DataFrame(
    {
        'A': np.random.randn(sample_size),
        'B': np.random.randn(sample_size),
        'C': np.random.choice(['X', 'Y', 'Z'], sample_size),
        'D': np.random.randint(1, 10, sample_size),
    }
)

print("Sample data:")
print(query_df.head())

# Complex query conditions combined with `and` inside one expression string.
result1 = query_df.query("A > 0 and B < 0 and C == 'X'")
print("\nQuery: A > 0 and B < 0 and C == 'X':")
print(f"Rows found: {len(result1)}")
print(result1.head())

# Query with variables: `@name` refers to a Python variable from the
# surrounding scope.
threshold = 0.5
result2 = query_df.query("A > @threshold and D >= 5")
print(f"\nQuery: A > {threshold} and D >= 5:")
print(f"Rows found: {len(result2)}")
print(result2.head())

# Membership test with the `in` operator (the query-string counterpart of
# isin), combined with a modulo condition.
valid_categories = ['X', 'Y']
result3 = query_df.query("C in @valid_categories and D % 2 == 0")
print(f"\nQuery with isin and modulo: {len(result3)} rows")
print(result3.head())

## Question 5
Optimize memory usage of a DataFrame by converting object columns to categorical and downcasting numeric types.

In [None]:
# Create DataFrame with inefficient types
row_total = 10000
optim_df = pd.DataFrame(
    {
        'int_col': np.random.randint(0, 100, row_total),  # Can be int8
        'float_col': np.random.randn(row_total),  # Can be float32
        'category_col': np.random.choice(['A', 'B', 'C'], row_total),  # Should be categorical
        'big_int': np.random.randint(0, 1000, row_total),  # Can be int16
    }
)

# deep=True also counts the memory held by Python objects in the columns.
bytes_before = optim_df.memory_usage(deep=True).sum()

print("Original memory usage:")
print(optim_df.dtypes)
print(f"Memory usage: {bytes_before / 1024:.2f} KB")

# Optimize memory on a copy so the original frame stays available for comparison.
optim_df_copy = optim_df.copy()

# Downcast integers to the smallest integer dtype that holds the values.
for int_name in ('int_col', 'big_int'):
    optim_df_copy[int_name] = pd.to_numeric(optim_df_copy[int_name], downcast='integer')

# Downcast floats
optim_df_copy['float_col'] = pd.to_numeric(optim_df_copy['float_col'], downcast='float')

# Convert the repeated labels to categorical
optim_df_copy['category_col'] = optim_df_copy['category_col'].astype('category')

bytes_after = optim_df_copy.memory_usage(deep=True).sum()

print("\nOptimized memory usage:")
print(optim_df_copy.dtypes)
print(f"Memory usage: {bytes_after / 1024:.2f} KB")

memory_saved = bytes_before - bytes_after
print(f"Memory saved: {memory_saved / 1024:.2f} KB ({memory_saved/bytes_before*100:.1f}%)")

## Question 6
Implement a custom rolling window function that calculates the range (max - min) over a window.

In [None]:
# Custom rolling function
def rolling_range(series):
    """Return the spread of one window: its largest value minus its smallest.

    Parameters
    ----------
    series
        The values of a single window; must provide ``max()`` and ``min()``.

    Returns
    -------
    The difference ``series.max() - series.min()``.
    """
    highest, lowest = series.max(), series.min()
    return highest - lowest

# Apply custom function
# raw=False passes each 5-row window to rolling_range as a Series.
ts_df['Rolling_Range'] = ts_df['Price'].rolling(window=5).apply(rolling_range, raw=False)

print("Custom rolling range calculation:")
print(ts_df[['Price', 'Rolling_Range']].head(10))

# Compare with built-in functions
ts_df['Rolling_Max'] = ts_df['Price'].rolling(window=5).max()
ts_df['Rolling_Min'] = ts_df['Price'].rolling(window=5).min()
ts_df['Range_Manual'] = ts_df['Rolling_Max'] - ts_df['Rolling_Min']

print("\nVerification (should be equal):")
# Both columns are NaN for the first window-1 rows. NaN never compares equal
# to NaN, so an element-wise `==` followed by .all() reports False even when
# every computed value matches. equal_nan=True treats NaNs in the same
# position as equal.
print(np.allclose(ts_df['Rolling_Range'], ts_df['Range_Manual'], equal_nan=True))

## Question 7
Work with MultiIndex DataFrames: swap levels, sort by multiple levels, and reset specific levels.

In [None]:
print("Original MultiIndex DataFrame:")
print(multi_df)

# Every operation below returns a new frame, so each one starts from the
# same untouched multi_df.

# Swap levels
swapped_df = multi_df.swaplevel(i='Group', j='Category')

# Sort by multiple levels
sorted_df = multi_df.sort_index(level=['Category', 'Group'])

# Reset specific level: 'Period' becomes an ordinary column
reset_df = multi_df.reset_index(level='Period')

# Reorder levels
reordered_df = multi_df.reorder_levels(order=['Category', 'Group', 'Period'])

# Report the results in the order they were computed; only the reordered
# frame is truncated to its head.
for caption, frame in (
    ("\nAfter swapping Group and Category levels:", swapped_df),
    ("\nSorted by Category then Group:", sorted_df),
    ("\nAfter resetting Period level:", reset_df),
    ("\nReordered levels:", reordered_df.head()),
):
    print(caption)
    print(frame)

## Question 8
Use the eval() method for efficient evaluation of expressions on large DataFrames.

In [None]:
# Create large DataFrame: three independent standard-normal columns.
eval_df = pd.DataFrame({label: np.random.randn(10000) for label in ('A', 'B', 'C')})

print("Using eval() for efficient computation:")

# Traditional approach: column arithmetic written out in Python.
col_a, col_b, col_c = eval_df['A'], eval_df['B'], eval_df['C']
traditional_result = col_a + col_b * col_c

# Using eval(): the same expression passed as a string.
eval_result = eval_df.eval('A + B * C')

print(f"Results are equal: {(traditional_result == eval_result).all()}")

# More complex expressions: `name = expr` with inplace=True adds a column.
for assignment in (
    'D = A + B',
    'E = A * B + C',
    'F = (A > 0) & (B > 0)',
):
    eval_df.eval(assignment, inplace=True)

print("\nNew columns created with eval():")
print(eval_df[['A', 'B', 'C', 'D', 'E', 'F']].head())

# Query combined with eval
filtered = eval_df.query('A > 0 and B > 0')
print(f"\nFiltered DataFrame shape: {filtered.shape}")

## Question 9
Apply rolling correlation between two time series to detect changing relationships.

In [None]:
# Create two related time series
np.random.seed(42)
n_periods = 100
dates = pd.date_range('2023-01-01', periods=n_periods, freq='D')

# Create series with changing correlation
series1 = np.random.randn(n_periods).cumsum()
tracking_half = series1[:50] + np.random.randn(50) * 0.2  # High correlation first half
independent_half = np.random.randn(50).cumsum()  # Low correlation second half
series2 = np.concatenate([tracking_half, independent_half])

corr_df = pd.DataFrame({'Series1': series1, 'Series2': series2}, index=dates)

print("Time series data:")
print(corr_df.head())

# Calculate rolling correlation: each row correlates the trailing
# window_size observations of the two series.
window_size = 20
series1_window = corr_df['Series1'].rolling(window=window_size)
corr_df['Rolling_Correlation'] = series1_window.corr(corr_df['Series2'])

print(f"\nRolling correlation (window={window_size}):")
print(corr_df[['Series1', 'Series2', 'Rolling_Correlation']].head(25))

# Summary statistics of rolling correlation
print(f"\nRolling correlation statistics:")
print(corr_df['Rolling_Correlation'].describe())

# Find periods of high/low correlation; rows whose correlation is NaN
# fail both comparisons and land in neither subset.
is_high = corr_df['Rolling_Correlation'].gt(0.8)
is_low = corr_df['Rolling_Correlation'].lt(0.2)
high_corr = corr_df[is_high]
low_corr = corr_df[is_low]

print(f"\nPeriods with high correlation (>0.8): {len(high_corr)}")
print(f"Periods with low correlation (<0.2): {len(low_corr)}")

## Question 10
Create a custom aggregation function using apply() with rolling windows for complex calculations.

In [None]:
# Custom aggregation function: calculate percentage of values above mean
def pct_above_mean(series):
    """Return the percentage of values in ``series`` strictly above its mean.

    Parameters
    ----------
    series
        The values of a single window.

    Returns
    -------
    A percentage between 0 and 100, or NaN when ``series`` is empty.
    """
    n_values = len(series)
    if n_values == 0:
        return np.nan
    exceeds_mean = series > series.mean()
    return exceeds_mean.sum() / n_values * 100

# Custom function: volatility (coefficient of variation)
def coefficient_of_variation(series):
    """Return the standard deviation as a percentage of the absolute mean.

    Parameters
    ----------
    series
        The values of a single window.

    Returns
    -------
    ``std / |mean| * 100``, or NaN when ``series`` is empty or its mean is
    exactly zero (the ratio is undefined in both cases).
    """
    if len(series) == 0:
        return np.nan
    center = series.mean()
    if center == 0:
        return np.nan
    return series.std() / abs(center) * 100

# Apply custom functions to rolling windows: both statistics share one
# 7-row window over 'Price', and raw=False passes each window as a Series.
seven_period = ts_df['Price'].rolling(window=7)
ts_df['Pct_Above_Mean'] = seven_period.apply(pct_above_mean, raw=False)
ts_df['CoV'] = seven_period.apply(coefficient_of_variation, raw=False)

custom_columns = ['Price', 'Pct_Above_Mean', 'CoV']
print("Custom rolling aggregations:")
print(ts_df[custom_columns].head(15))

# Multiple custom functions at once
def rolling_stats(series):
    """Summarize ``series`` with four statistics.

    Parameters
    ----------
    series
        Values to summarize; must provide ``mean()``, ``std()``, ``skew()``,
        ``max()`` and ``min()``.

    Returns
    -------
    pandas.Series
        Entries ``custom_mean``, ``custom_std``, ``custom_skew`` and
        ``custom_range`` (max minus min), in that order.
    """
    lowest, highest = series.min(), series.max()
    summary = {
        'custom_mean': series.mean(),
        'custom_std': series.std(),
        'custom_skew': series.skew(),
        'custom_range': highest - lowest,
    }
    return pd.Series(summary)

# Apply multiple custom functions
# Rolling.apply() needs the callback to return a single scalar per window;
# a callback that returns a pd.Series raises TypeError. Run one scalar-valued
# apply() per statistic and assemble the results into a DataFrame.
price_window = ts_df['Price'].rolling(window=5)
custom_stats = pd.DataFrame({
    'Q25': price_window.apply(lambda x: x.quantile(0.25), raw=False),
    'Q75': price_window.apply(lambda x: x.quantile(0.75), raw=False),
})
custom_stats['IQR'] = custom_stats['Q75'] - custom_stats['Q25']

print("\nMultiple custom statistics:")
print(custom_stats.head(10))

# Alternative approach for multiple stats
# The built-in Rolling.quantile() takes one quantile per call (a float, not
# a list), so each quantile gets its own call.
ts_df['Q25_5'] = price_window.quantile(0.25)
ts_df['Q75_5'] = price_window.quantile(0.75)
ts_df['IQR_5'] = ts_df['Q75_5'] - ts_df['Q25_5']

print("\nQuantile-based rolling statistics:")
print(ts_df[['Price', 'Q25_5', 'Q75_5', 'IQR_5']].head(10))