# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import numpy as np
import os
from kaggle import KaggleApi

# # Instantiate Kaggle API
# api = KaggleApi()
# api.authenticate()

# # Download and extract the U.S. public debt vs. GDP dataset
# dataset = "thedevastator/u-s-public-debt-vs-gdp-from-1947-2020"
# folder = 'data'
# try:
#     api.dataset_download_files(dataset, path=folder, unzip=True)
# except Exception as e:
#     print(f"Error downloading dataset: {e}")
#     exit(1)

# import os
# from kaggle import KaggleApi

# # Instantiate Kaggle API
# api = KaggleApi()
# api.authenticate()

# # Download and extract the U.S. economic indicators dataset
# dataset = "alfredkondoro/u-s-economic-indicators-1974-2024"
# folder = 'data'
# try:
#     api.dataset_download_files(dataset, path=folder, unzip=True)
# except Exception as e:
#     print(f"Error downloading dataset: {e}")
#     exit(1)

# # Step 1 & 2: Read the datasets, give them consistent column names, and
# # convert their date columns to datetime format
def _read_dated_csv(path, column_names, date_column):
    """Load one CSV, rename its columns positionally, and parse its date column.

    Args:
        path: Location of the CSV file to read.
        column_names: Replacement names, one per column, in file order.
        date_column: Name (after renaming) of the column to convert with
            ``pd.to_datetime``.

    Returns:
        The loaded DataFrame with renamed columns and a datetime date column.
    """
    frame = pd.read_csv(path)
    frame.columns = column_names
    frame[date_column] = pd.to_datetime(frame[date_column])
    return frame


cpi_data = _read_dated_csv("data/cpi_data.csv", ["DATE", "CPIAUCSL"], "DATE")
gdp_data = _read_dated_csv("data/gdp_data.csv", ["DATE", "GDP"], "DATE")
unemployment_data = _read_dated_csv(
    "data/unemployment_data.csv", ["DATE", "UNRATE"], "DATE"
)
us_gdp_debt = _read_dated_csv(
    "data/US GDP vs Debt.csv",
    ["index", "Quarter", "Gross Domestic Product ($mil)", "Total Public Debt ($mil)"],
    "Quarter",
)

# # Step 3: Converting Monthly Data to Quarterly Data
def _quarterly_mean(frame, value_column):
    """Tag each row of *frame* with its quarter and average *value_column* per quarter.

    The 'Quarter' period column is added to *frame* in place, so the caller's
    DataFrame keeps it afterwards.

    Args:
        frame: DataFrame with a datetime 'DATE' column.
        value_column: Name of the column to average within each quarter.

    Returns:
        A DataFrame with one row per quarter and the columns 'Quarter' and
        *value_column*.
    """
    frame['Quarter'] = frame['DATE'].dt.to_period('Q')
    per_quarter = frame.groupby('Quarter')[value_column].mean()
    return per_quarter.reset_index()


cpi_quarterly = _quarterly_mean(cpi_data, 'CPIAUCSL')                    # CPI
gdp_quarterly = _quarterly_mean(gdp_data, 'GDP')                         # GDP
unemployment_quarterly = _quarterly_mean(unemployment_data, 'UNRATE')    # Unemployment

# # Step 4: Merge the datasets
# Inner-join CPI, unemployment, and GDP on 'Quarter' so only quarters present
# in all three series are kept.
merged_quarterly = (
    cpi_quarterly
    .merge(unemployment_quarterly, on='Quarter', how='inner')
    .merge(gdp_quarterly, on='Quarter', how='inner')
)

# Prepare the debt data for merging: shorten the column names and express
# 'Quarter' as a quarterly period so it matches the key used above.
_debt_column_names = {
    'Gross Domestic Product ($mil)': 'GDP_MIL',
    'Total Public Debt ($mil)': 'DEBT_MIL',
}
us_gdp_debt = us_gdp_debt.rename(columns=_debt_column_names).assign(
    Quarter=lambda frame: frame['Quarter'].dt.to_period('Q')
)

# Save the final combined dataset
# (one-off export, left disabled; presumably this is how the CSV loaded below
# was produced -- confirm before re-enabling)
# final = merged_quarterly.merge(us_gdp_debt, on='Quarter', how='inner')
# final.to_csv("data/final_combined_quarterly_data.csv", index=False)
# print("Final combined dataset saved as 'final_combined_quarterly_data.csv'.")

final_data = pd.read_csv("data/final_combined_quarterly_data.csv")

# # Step 5: Yearly aggregation
# 'Quarter' comes back from the CSV as text; its first four characters are
# taken as the calendar year.
final_data['Year'] = final_data['Quarter'].str[:4].astype(int)

# Average the quarterly observations within each year.
# - numeric_only=True keeps the text 'Quarter' column out of the aggregation;
#   summing it would concatenate the quarter labels into one long string.
# - A mean (not a sum) is used because the indicators are an index level, a
#   rate, and a debt stock: adding four quarters of any of them does not give
#   an annual figure.
annual_data = final_data.groupby('Year').mean(numeric_only=True).reset_index()

# Working copy that later steps overwrite with min-max scaled values.
normalized_data = final_data.copy()
columns_to_normalize = ['CPIAUCSL', 'UNRATE', 'GDP_MIL', 'DEBT_MIL']

# # Step 6 : Analysis 1 - Trend Analysis
# Explore how GDP, CPI (inflation), unemployment, and public debt change over time.
# Each indicator is min-max scaled to [0, 1] so all four fit on one axis; the
# scaled values are written into normalized_data and reused by later steps.

trend_columns = ['CPIAUCSL', 'UNRATE', 'GDP_MIL', 'DEBT_MIL']

trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
for col in trend_columns:
    col_min = final_data[col].min()
    col_span = final_data[col].max() - col_min
    normalized_data[col] = (final_data[col] - col_min) / col_span
    trend_ax.plot(final_data['Year'], normalized_data[col], label=col, linewidth=2)

trend_ax.set_title("Trend Analysis with Normalized Data: CPI, Unemployment, GDP, and Public Debt")
trend_ax.set_xlabel("Year")
trend_ax.set_ylabel("Normalized Values (0 to 1)")
trend_ax.legend()
trend_ax.grid(True)

# Label every fifth row's year to keep the axis readable.
trend_tick_years = final_data['Year'][::5]
plt.xticks(ticks=trend_tick_years, labels=trend_tick_years, rotation=45)
plt.show()


# # Step 7 : Analysis 2 - Debt-to-GDP Ratio
# Calculate and analyze the ratio of public debt to GDP.
# Identify periods where borrowing grew faster or slower than GDP.

# The ratio is taken from the raw levels in final_data. Using the min-max
# scaled columns would divide two unrelated 0-1 scores (and divide by zero at
# the GDP minimum), which is not a debt-to-GDP ratio.
normalized_data['Debt_to_GDP_Ratio'] = final_data['DEBT_MIL'] / final_data['GDP_MIL']

plt.figure(figsize=(12, 6))
plt.plot(final_data['Year'], normalized_data['Debt_to_GDP_Ratio'], label='Debt-to-GDP Ratio', color='purple', linewidth=2)
plt.title("Debt-to-GDP Ratio Over Time")
plt.xlabel("Year")
plt.ylabel("Debt-to-GDP Ratio")
plt.grid(True)
# Label every fifth row's year to keep the axis readable.
plt.xticks(ticks=final_data['Year'][::5], labels=final_data['Year'][::5], rotation=45)
plt.legend()
plt.show()


# # Step 8 : Analysis 3 - Correlation Analysis:
# Explore relationships between key metrics:
# Unemployment vs. GDP.
# Inflation (CPI) vs. GDP.
# Public Debt vs. GDP.

corr_columns = ['CPIAUCSL', 'UNRATE', 'GDP_MIL', 'DEBT_MIL']
correlation_matrix = normalized_data[corr_columns].corr()

# Draw the pairwise correlations as an annotated heatmap.
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    cbar=True,
    ax=corr_ax,
)
corr_ax.set_title("Correlation Matrix (Normalized)")
plt.show()


# # Step 9 : Analysis 04 - Economic Events Analysis:
# Highlight significant periods like recessions (e.g., 2008, 2020).
# Overlay them on the normalized GDP, inflation, unemployment, and debt series.

# Event name -> (first year, last year) of the band shaded on the chart.
events = {"2008 Financial Crisis": (2008, 2009), "COVID-19 Pandemic": (2020, 2021)}

events_fig, events_ax = plt.subplots(figsize=(12, 8))

event_columns = ['CPIAUCSL', 'UNRATE', 'GDP_MIL', 'DEBT_MIL']
for col in event_columns:
    events_ax.plot(final_data['Year'], normalized_data[col], label=col, linewidth=2)

# Shade each event window; the label makes the band appear in the legend.
for event_name, (start, end) in events.items():
    events_ax.axvspan(start, end, color='gray', alpha=0.3, label=event_name)

events_ax.set_title("Economic Events Analysis with Normalized Data")
events_ax.set_xlabel("Year")
events_ax.set_ylabel("Normalized Values (0 to 1)")
events_ax.legend()
events_ax.grid(True)

event_tick_years = final_data['Year'][::5]
plt.xticks(ticks=event_tick_years, labels=event_tick_years, rotation=45)
plt.show()


# # Step 10 : Analysis 05 - Economic Indicator Comparisons:
# Compare the period-over-period growth rates of GDP and debt over time.
# Identify periods of economic health or stress.

# Growth rates are computed from the raw levels in final_data. A percentage
# change of min-max scaled values is not a growth rate: it depends on the
# series' minimum and blows up to +/-inf next to the scaled zero.
# fill_method=None keeps missing values as NaN instead of forward-filling them.
growth_sources = {'GDP_Growth_Rate': 'GDP_MIL', 'Debt_Growth_Rate': 'DEBT_MIL'}
for rate_column, level_column in growth_sources.items():
    normalized_data[rate_column] = final_data[level_column].pct_change(fill_method=None) * 100

# Keep only rows where both rates are finite and within the threshold. The
# first row (NaN, no previous period) fails the comparison and is dropped too.
threshold = 200  # Limit growth rates to +/-200%
filtered_data = normalized_data[
    (normalized_data['GDP_Growth_Rate'].abs() <= threshold) &
    (normalized_data['Debt_Growth_Rate'].abs() <= threshold)
]

# Plot the growth rates with outliers removed
# (normalized_data already carries 'Year', copied from final_data in Step 5).
plt.figure(figsize=(12, 6))
plt.plot(filtered_data['Year'], filtered_data['GDP_Growth_Rate'], label='GDP Growth Rate', linestyle='-', marker='o')
plt.plot(filtered_data['Year'], filtered_data['Debt_Growth_Rate'], label='Debt Growth Rate', linestyle='-', marker='x')
plt.title("Growth Rate Comparison: GDP vs. Debt (Outliers Removed)")
plt.xlabel("Year")
plt.ylabel("Growth Rate (%)")
plt.legend()
plt.grid(True)
plt.xticks(filtered_data['Year'].unique()[::2], rotation=45)  # Label every second year
plt.show()

# # Step 11 : Analysis 6 - Seasonality and Cycles:
# Analyze recurring patterns in GDP using seasonal decomposition.
# Detect long-term cycles in the economy.

# Build the series to decompose: raw GDP levels indexed by the start date of
# each quarter. 'Quarter' is stored as text in the CSV (e.g. "1974Q1"), so it
# is parsed back into quarterly periods first. A datetime index also gives the
# decomposition plot a readable time axis.
quarter_start_dates = pd.PeriodIndex(final_data['Quarter'], freq='Q').to_timestamp()
gdp_series = pd.Series(
    final_data['GDP_MIL'].to_numpy(),
    index=quarter_start_dates,
    name='GDP_MIL',
)

# period=4: four quarterly observations make up one seasonal cycle (a year).
decomposed = seasonal_decompose(gdp_series, model='additive', period=4)
decomposed.plot()
plt.show()


# # Step 12 : Analysis 07 - Debt and Inflation:
# Scatter normalized public debt against normalized CPI to look for a
# relationship between the two.

debt_fig, debt_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=normalized_data, x='DEBT_MIL', y='CPIAUCSL', ax=debt_ax)
debt_ax.set_title("Debt vs. Inflation (Normalized)")
debt_ax.set_xlabel("Debt (Normalized)")
debt_ax.set_ylabel("Inflation (Normalized)")
debt_ax.grid(True)
plt.show()


# # Step 13 : Analysis 08 - Economic Health Score:
# Composite score built from the normalized_data columns:
#     (GDP_MIL - DEBT_MIL) / (UNRATE + 1)
# Higher GDP raises the score; higher debt or unemployment lowers it.
# The +1 keeps the denominator away from zero.

gdp_minus_debt = normalized_data['GDP_MIL'] - normalized_data['DEBT_MIL']
unemployment_divisor = normalized_data['UNRATE'] + 1
normalized_data['Economic_Health_Score'] = gdp_minus_debt / unemployment_divisor

health_fig, health_ax = plt.subplots(figsize=(12, 6))
health_ax.plot(
    final_data['Year'],
    normalized_data['Economic_Health_Score'],
    label='Economic Health Score',
    linewidth=2,
)
health_ax.set_title("Economic Health Score Over Time (Normalized)")
health_ax.set_xlabel("Year")
health_ax.set_ylabel("Economic Health Score")
health_ax.legend()
health_ax.grid(True)
plt.show()
