# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load preprocessed data
# Every dataset sits in its own CSV under ../data/processed; the files are
# read in the order listed and bound to one DataFrame each.
(
    wdi_gdp_df,
    imf_df,
    life_expectancy_df,
    primary_enrollment_df,
    literacy_rate_df,
    electricity_access_df,
    poverty_headcount_df,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/wdi_gdp.csv',
        '../data/processed/imf_dot.csv',
        '../data/processed/life_expectancy.csv',
        '../data/processed/primary_enrollment.csv',
        '../data/processed/literacy_rate.csv',
        '../data/processed/electricity_access.csv',
        '../data/processed/poverty_headcount.csv',
    )
]

# Function to plot time series data
def plot_time_series(df, x, y, title, xlabel, ylabel):
    """Show a seaborn line plot of column ``y`` against column ``x`` of ``df``.

    Each call opens a fresh 10x6 figure, labels it with ``title``,
    ``xlabel`` and ``ylabel``, and displays it via ``plt.show()``.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    sns.lineplot(data=df, x=x, y=y, ax=axes)
    # Labels are applied after plotting so they replace seaborn's defaults.
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()

# Plot every dataset over the years
# Each row is (frame, y column, title, y-axis label); all charts share
# 'year' on the x-axis. Note that the trade data uses 'trade_value' rather
# than 'value' as its y column.
for _frame, _y_column, _title, _y_label in (
    (wdi_gdp_df, 'value', 'GDP over the Years', 'GDP (current US$)'),
    (imf_df, 'trade_value', 'Trade Value over the Years', 'Trade Value (current US$)'),
    (life_expectancy_df, 'value', 'Life Expectancy over the Years', 'Life Expectancy (years)'),
    (primary_enrollment_df, 'value', 'Gross Enrollment Ratio in Primary Education over the Years', 'Enrollment Ratio (%)'),
    (literacy_rate_df, 'value', 'Adult Literacy Rate over the Years', 'Literacy Rate (%)'),
    (electricity_access_df, 'value', 'Access to Electricity over the Years', 'Access to Electricity (%)'),
    (poverty_headcount_df, 'value', 'Poverty Headcount Ratio over the Years', 'Poverty Headcount Ratio (%)'),
):
    plot_time_series(_frame, 'year', _y_column, _title, 'Year', _y_label)

# Advanced Analysis: Correlation Matrix
# Join the indicator frames on (country, year). pd.merge defaults to an inner
# join, so only country-year pairs present in every frame survive.
#
# How the `value` columns end up named:
#   * GDP and life expectancy both carry `value`, so the first merge suffixes
#     both -> `value_gdp`, `value_life_expectancy`.
#   * Primary enrollment's `value` then meets no clash and keeps the bare
#     name `value`.
#   * Each later frame's `value` clashes with that bare column; the left side
#     gets the empty suffix and the right side gets the frame-specific one.
all_data = pd.merge(wdi_gdp_df, life_expectancy_df, on=['country', 'year'], suffixes=('_gdp', '_life_expectancy'))
all_data = pd.merge(all_data, primary_enrollment_df, on=['country', 'year'], suffixes=('', '_primary_enrollment'))
all_data = pd.merge(all_data, literacy_rate_df, on=['country', 'year'], suffixes=('', '_literacy_rate'))
all_data = pd.merge(all_data, electricity_access_df, on=['country', 'year'], suffixes=('', '_electricity_access'))
all_data = pd.merge(all_data, poverty_headcount_df, on=['country', 'year'], suffixes=('', '_poverty_headcount'))

# Compute the correlation matrix
# The bare `value` column is primary enrollment (see above). Relabel it on the
# selected copy so the heatmap axes name every indicator; `all_data` itself is
# left untouched for any code that reads its columns.
corr_matrix = (
    all_data[[
        'value_gdp',
        'value_life_expectancy',
        'value',
        'value_literacy_rate',
        'value_electricity_access',
        'value_poverty_headcount',
    ]]
    .rename(columns={'value': 'value_primary_enrollment'})
    .corr()
)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Indicators')
plt.show()

# Distribution Plots
def plot_distribution(df, column, title):
    """Show a histogram with a KDE overlay for ``df[column]``.

    Each call opens a fresh 10x6 figure, titles it with ``title`` and
    displays it via ``plt.show()``.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    sns.histplot(df[column], kde=True, ax=axes)
    axes.set_title(title)
    plt.show()

# Plot distributions
# Each row is (frame, chart title); every frame is plotted on its 'value' column.
for _frame, _title in (
    (wdi_gdp_df, 'Distribution of GDP'),
    (life_expectancy_df, 'Distribution of Life Expectancy'),
    (primary_enrollment_df, 'Distribution of Primary Enrollment'),
    (literacy_rate_df, 'Distribution of Literacy Rate'),
    (electricity_access_df, 'Distribution of Access to Electricity'),
    (poverty_headcount_df, 'Distribution of Poverty Headcount'),
):
    plot_distribution(_frame, 'value', _title)
