# UK Inflation Forecasting - Exploratory Data Analysis

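This notebook explores the data behind the UK inflation forecasting project: it collects the raw data, summarises it, and examines CPI inflation through time-series, correlation, distribution, seasonal, and trend views, ending with a preview of lag features.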

In [None]:
# Add ../src to the module search path so the project modules imported below
# can be found. The relative path is resolved against the kernel's working
# directory.
import sys
sys.path.append('../src')

# Third-party analysis and plotting libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project modules, presumably located in ../src (see the path entry above).
# NOTE(review): DataPreprocessor and Visualizer are not referenced by any cell
# visible in this notebook - confirm whether they are still needed.
from data_collection import DataCollector
from data_preprocessing import DataPreprocessor
from visualization import Visualizer

# Render figures inline and set notebook-wide plotting defaults.
%matplotlib inline
sns.set_style('whitegrid')
# Default figure size; cells that pass their own figsize override it.
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Data Collection

In [None]:
# Run the project's collector; the resulting frame is used as `df` by every
# later cell in this notebook.
collector = DataCollector(
    output_dir='../data/raw',
)
df = collector.collect_all_data()

# Report the dimensions, then preview the first rows as the cell output.
print("Data shape:", df.shape)
df.head()

## 2. Data Overview

In [None]:
# Descriptive statistics from DataFrame.describe(), shown as the cell output.
summary_stats = df.describe()
summary_stats

In [None]:
# Count the missing entries in each column; the counts are the cell output.
print("Missing values:")
missing_counts = df.isna().sum()
missing_counts

## 3. Time Series Visualization

In [None]:
# Plot CPI inflation over time.
# Parse the dates explicitly: if 'date' was loaded as strings, Matplotlib
# would treat each value as a category (one tick per row, evenly spaced
# regardless of the real time gaps) instead of drawing a time axis.
inflation_dates = pd.to_datetime(df['date'])

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(inflation_dates, df['cpi_inflation'], linewidth=2)
ax.set_xlabel('Date')
ax.set_ylabel('CPI Inflation (%)')
ax.set_title('UK CPI Inflation Over Time')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Plot every numeric column of df in its own panel.
# Parse the dates once so each panel gets a real time axis even if 'date'
# was loaded as strings.
indicator_dates = pd.to_datetime(df['date'])
columns = df.select_dtypes(include=[np.number]).columns

# Size the grid from the number of columns: a fixed 3x3 grid silently drops
# any column beyond the ninth and leaves blank panels when there are fewer.
n_grid_cols = 3
n_grid_rows = max(1, int(np.ceil(len(columns) / n_grid_cols)))
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols,
    figsize=(16, 4 * n_grid_rows),  # 16x12 for three rows, as before
    squeeze=False,                  # always 2-D, so flatten() is safe
)
axes = axes.flatten()

for ax, col in zip(axes, columns):
    ax.plot(indicator_dates, df[col], linewidth=1.5)
    ax.set_title(col.replace('_', ' ').title())
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

# Hide the panels that were not needed instead of showing empty axes.
for ax in axes[len(columns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns of df.
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df.loc[:, numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# Correlation of every numeric column with the target variable (inflation),
# sorted from most positive to most negative.
target_corr = correlation_matrix['cpi_inflation'].sort_values(ascending=False)

# Remove the target's correlation with itself by label, not by position:
# slicing off the first sorted entry drops the wrong row whenever another
# feature ties with it at 1.0.
feature_corr = target_corr.drop('cpi_inflation')

fig, ax = plt.subplots(figsize=(10, 6))
feature_corr.plot(kind='barh', ax=ax)
ax.set_xlabel('Correlation with CPI Inflation')
ax.set_title('Feature Correlations with CPI Inflation')
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

## 5. Distribution Analysis

In [None]:
# Distribution of inflation: histogram (left) and boxplot (right).
# Drop missing values first: Matplotlib's boxplot does not skip NaN, so a
# single missing observation turns the quartiles into NaN and the box is not
# drawn. A plain array is passed so both plots see identical data.
inflation_clean = df['cpi_inflation'].dropna().to_numpy()

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(inflation_clean, bins=30, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('CPI Inflation (%)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of CPI Inflation')
ax_hist.grid(True, alpha=0.3, axis='y')

ax_box.boxplot(inflation_clean)
ax_box.set_ylabel('CPI Inflation (%)')
ax_box.set_title('Boxplot of CPI Inflation')
ax_box.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
plt.show()

## 6. Seasonal Patterns

In [None]:
# Derive calendar month and year columns from a single parse of 'date'.
# Both columns are stored on df; the yearly-average cell reads df['year'].
parsed_dates = pd.to_datetime(df['date'])
df['month'] = parsed_dates.dt.month
df['year'] = parsed_dates.dt.year

# Mean inflation for each calendar month across all years.
monthly_avg = df.groupby('month')['cpi_inflation'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    monthly_avg.index,
    monthly_avg.values,
    marker='o',
    linewidth=2,
    markersize=8,
)
ax.set_xlabel('Month')
ax.set_ylabel('Average CPI Inflation (%)')
ax.set_title('Average Inflation by Month (Seasonal Pattern)')
ax.set_xticks(range(1, 13))  # one tick per calendar month
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Trend Analysis

In [None]:
# Mean inflation per calendar year, grouped on the 'year' column of df.
yearly_avg = df['cpi_inflation'].groupby(df['year']).mean()

line_style = dict(marker='o', linewidth=2, markersize=8)
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(yearly_avg.index, yearly_avg.values, **line_style)
ax.set_xlabel('Year')
ax.set_ylabel('Average CPI Inflation (%)')
ax.set_title('Average Inflation by Year (Trend)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 8. Feature Engineering Preview

In [None]:
# Create lagged copies of CPI inflation and check how strongly each one
# correlates with the current value.

# shift() moves values by row position, so put the rows in chronological
# order first (a stable sort on the parsed dates; already-sorted data is left
# unchanged). Lags are counted in rows - months if the data is monthly
# (TODO confirm the sampling frequency).
chronological = (
    pd.to_datetime(df['date'])
    .reset_index(drop=True)
    .sort_values(kind='mergesort')
    .index
    .to_numpy()
)
df_with_lags = df.iloc[chronological].copy()

lag_feature_cols = []
for lag in [1, 3, 6, 12]:
    lag_name = f'inflation_lag_{lag}'
    df_with_lags[lag_name] = df_with_lags['cpi_inflation'].shift(lag)
    lag_feature_cols.append(lag_name)

# Check correlation of lag features.
# List the columns created above explicitly instead of matching the substring
# 'lag', which would also pick up unrelated columns (for example one named
# 'flag').
lag_cols = ['cpi_inflation'] + lag_feature_cols
lag_corr = df_with_lags[lag_cols].corr()['cpi_inflation'].sort_values(ascending=False)

print("Correlation of lag features with current inflation:")
print(lag_corr)

## 9. Summary Statistics

In [None]:
# Headline figures: date coverage, row count and basic inflation statistics.
inflation_series = df['cpi_inflation']
first_date, last_date = df['date'].min(), df['date'].max()

print("Dataset Summary:")
print(f"Time Period: {first_date} to {last_date}")
print(f"Total Observations: {len(df)}")
print("\nInflation Statistics:")

# Each statistic is printed with two decimals and a percent sign.
inflation_stats = (
    ("Mean", inflation_series.mean()),
    ("Median", inflation_series.median()),
    ("Std Dev", inflation_series.std()),
    ("Min", inflation_series.min()),
    ("Max", inflation_series.max()),
)
for label, value in inflation_stats:
    print(f"{label}: {value:.2f}%")

## Conclusion

This exploratory analysis provides insights into:
- The temporal patterns in UK inflation
- Relationships between economic indicators
- Seasonal and trend components
- Feature correlations for model building

Next steps:
1. Feature engineering and preprocessing
2. Model training and evaluation
3. Explainable AI (XAI) analysis for model interpretability