<a href="https://colab.research.google.com/github/bmwenyemali/vihdataproDataAnalysis/blob/main/vih_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VIH Data Analysis Project - Democratic Republic of Congo
## HIV/AIDS Data Analytics Portfolio

**Author:** Bienvenu M Mwenyemali  
**Date:** February 2026  
**Description:** A comprehensive HIV/AIDS data analysis that demonstrates Python skills in data extraction, cleaning, exploration, visualization, and statistical analysis.

---

## Section 1: Import Libraries

In [None]:
# Install required packages (uncomment if needed)
# !pip install pandas numpy matplotlib seaborn scipy openpyxl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import zscore, pearsonr, spearmanr, shapiro, normaltest
import warnings
warnings.filterwarnings('ignore')

# Pandas display settings: no column limit, wide console, at most 100 rows
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

# Plot appearance shared by every chart in the notebook
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("Libraries imported successfully!")

## Section 2: Data Extraction from Excel

In [None]:
def load_data(filepath):
    """
    Read an Excel workbook into a DataFrame.

    Parameters:
        filepath (str): Location of the Excel file to read.

    Returns:
        DataFrame or None: The loaded data. None is returned when the file
        is missing or any other error occurs while reading; the error is
        printed instead of being raised.
    """
    try:
        frame = pd.read_excel(filepath)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except Exception as exc:
        print(f"Error loading file: {exc}")
        return None

    n_rows, n_cols = frame.shape
    print(f"Data successfully loaded from: {filepath}")
    print(f"Dataset shape: {n_rows} rows x {n_cols} columns")
    return frame

# Load the dataset
# load_data prints an error and returns None when the file cannot be read,
# in which case the cells below that use `df` will fail.
df = load_data('datavih.xlsx')

## Section 3: Data Exploration

In [None]:
# Structural summary of the raw dataset
print("=" * 60, "DATAFRAME INFO", "=" * 60, sep="\n")
df.info()

In [None]:
# First 10 rows
# `df.head(10)` is deliberately the last expression of the cell so that the
# notebook renders the returned frame as the cell output.
print("First 10 Rows:")
df.head(10)

In [None]:
# Last 5 rows
# `df.tail(5)` is deliberately the last expression of the cell so that the
# notebook renders the returned frame as the cell output.
print("Last 5 Rows:")
df.tail(5)

In [None]:
# Numbered listing of every column together with its dtype
print("Column Names and Data Types:")
for position, (name, kind) in enumerate(df.dtypes.items(), start=1):
    print(f"  {position}. {name}: {kind}")

In [None]:
# Number of distinct (non-null) values in each column
print("=" * 60, "UNIQUE VALUES PER COLUMN", "=" * 60, sep="\n")
for name, distinct in df.nunique().items():
    print(f"  {name}: {distinct} unique values")

In [None]:
# Display unique provinces (26 provinces of DRC)
# This runs on the raw, uncleaned frame: missing provinces are dropped first,
# because sorted() cannot compare a NaN float with province name strings.
print("Unique Provinces (26 provinces of DRC):")
print(sorted(df['provinces'].dropna().unique().tolist()))

In [None]:
# Display unique years
# Missing years are dropped (NaN breaks ordering comparisons) and the values
# are converted to plain Python numbers so the printed list stays readable
# regardless of how numpy scalars are represented.
print("Unique Years:")
print(sorted(df['annees'].dropna().unique().tolist()))

In [None]:
# Show the first ten distinct indicator names, in order of first appearance
print("Sample Indicators (first 10):")
first_ten_indicators = df['indicateurs'].unique()[:10]
for rank, label in enumerate(first_ten_indicators, start=1):
    print(f"  {rank}. {label}")

## Section 4: Data Cleaning

In [None]:
def analyze_missing_values(df):
    """
    Summarise the missing values of a DataFrame.

    Parameters:
        df (DataFrame): Frame to inspect.

    Returns:
        DataFrame: One row per column that has at least one missing value,
        with the columns 'Column', 'Missing Count' and 'Missing Percentage',
        ordered from the highest to the lowest missing percentage.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    report = pd.DataFrame({
        'Column': df.columns,
        'Missing Count': null_counts.values,
        'Missing Percentage': null_share.values,
    })

    # Keep only the columns that actually have gaps
    has_gaps = report['Missing Count'] > 0
    return report[has_gaps].sort_values('Missing Percentage', ascending=False)

print("=" * 60, "MISSING VALUES ANALYSIS", "=" * 60, sep="\n")
missing_analysis = analyze_missing_values(df)
# Render the table only when at least one column has gaps
if missing_analysis.empty:
    print("No missing values found!")
else:
    display(missing_analysis)

In [None]:
def clean_data(df):
    """
    Return a cleaned copy of the dataset; the input frame is not modified.

    Steps, in order:
        1. Strip leading/trailing whitespace from the text columns.
        2. Fill missing categorical values with 'Non spécifié'.
        3. Fill missing numeric values with the column median.
        4. Drop exact duplicate rows.

    Whitespace is standardised *before* de-duplication so that rows which
    differ only by stray spaces (for example 'Kinshasa' and 'Kinshasa ')
    are recognised as duplicates.

    Parameters:
        df (DataFrame): Raw dataset.

    Returns:
        DataFrame: Cleaned copy of ``df``.
    """
    def _strip_text(series):
        # Trim string values only; numbers and missing values pass through.
        if pd.api.types.is_object_dtype(series):
            # Element-wise so that non-string objects are not turned into NaN
            # (which is what the ``.str`` accessor does to them).
            return series.map(lambda v: v.strip() if isinstance(v, str) else v)
        if pd.api.types.is_string_dtype(series):
            return series.str.strip()
        return series

    df_clean = df.copy()

    print("=" * 60)
    print("CLEANING PROCESS")
    print("=" * 60)

    categorical_cols = ['provinces', 'trimestres', 'indicateurs', 'cibles', 'sexes', 'tranches_ages']
    present_categoricals = [col for col in categorical_cols if col in df_clean.columns]

    # 1. Standardize text columns (must happen before duplicates are dropped)
    for col in present_categoricals:
        df_clean[col] = _strip_text(df_clean[col])

    # 2. Handle missing values in categorical columns
    for col in present_categoricals:
        missing_count = df_clean[col].isnull().sum()
        if missing_count > 0:
            df_clean[col] = df_clean[col].fillna('Non spécifié')
            print(f"  - Filled {missing_count} missing values in '{col}' with 'Non spécifié'")

    # 3. Handle missing numeric values
    # NOTE(review): every numeric column is imputed with its overall median,
    # whatever it represents - confirm this is intended for all of them.
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        missing_count = df_clean[col].isnull().sum()
        if missing_count > 0:
            median_val = df_clean[col].median()
            df_clean[col] = df_clean[col].fillna(median_val)
            print(f"  - Filled {missing_count} missing values in '{col}' with median: {median_val}")

    # 4. Remove duplicates
    initial_rows = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    duplicates_removed = initial_rows - len(df_clean)
    if duplicates_removed > 0:
        print(f"  - Removed {duplicates_removed} duplicate rows")
    else:
        print("  - No duplicate rows found")

    print(f"\n  Final cleaned dataset shape: {df_clean.shape}")

    return df_clean

# Build the cleaned working copy that the later sections read from
df_clean = clean_data(df)

## Section 5: Descriptive Statistics

In [None]:
def descriptive_statistics(df, column='Valeur'):
    """
    Build a table of descriptive statistics for one column.

    Missing values are dropped before anything is computed.

    Parameters:
        df (DataFrame): Source data.
        column (str): Name of the column to describe.

    Returns:
        DataFrame: Two columns, 'Metric' and 'Value', one row per statistic.
    """
    values = df[column].dropna()

    modes = values.mode()
    mean, spread = values.mean(), values.std()
    lowest, highest = values.min(), values.max()
    q1, q2, q3 = (values.quantile(q) for q in (0.25, 0.50, 0.75))

    summary = [
        ('Count', len(values)),
        ('Mean', mean),
        ('Median', values.median()),
        # With several modes the first one reported is used
        ('Mode', modes.iloc[0] if len(modes) > 0 else np.nan),
        ('Standard Deviation', spread),
        ('Variance', values.var()),
        ('Min', lowest),
        ('Max', highest),
        ('Range', highest - lowest),
        ('Q1 (25%)', q1),
        ('Q2 (50%)', q2),
        ('Q3 (75%)', q3),
        ('IQR', q3 - q1),
        ('Skewness', values.skew()),
        ('Kurtosis', values.kurtosis()),
        # Standard deviation expressed as a percentage of the mean
        ('Coefficient of Variation', (spread / mean) * 100),
    ]

    return pd.DataFrame(summary, columns=['Metric', 'Value'])

print("=" * 60, "DESCRIPTIVE STATISTICS FOR 'Valeur' COLUMN", "=" * 60, sep="\n")
desc_stats = descriptive_statistics(df_clean, column='Valeur')
display(desc_stats)

In [None]:
# Per-province summary of 'Valeur', rounded to two decimals
print("=" * 60, "STATISTICS BY PROVINCE", "=" * 60, sep="\n")
valeur_by_province = df_clean.groupby('provinces')['Valeur']
province_stats = valeur_by_province.agg(
    ['count', 'sum', 'mean', 'median', 'std', 'min', 'max']
).round(2)
display(province_stats)

In [None]:
# Per-year summary of 'Valeur', rounded to two decimals
print("=" * 60, "STATISTICS BY YEAR", "=" * 60, sep="\n")
valeur_by_year = df_clean.groupby('annees')['Valeur']
year_stats = valeur_by_year.agg(
    ['count', 'sum', 'mean', 'median', 'std']
).round(2)
display(year_stats)

In [None]:
# Per-gender summary of 'Valeur', rounded to two decimals
print("=" * 60, "STATISTICS BY GENDER", "=" * 60, sep="\n")
valeur_by_gender = df_clean.groupby('sexes')['Valeur']
gender_stats = valeur_by_gender.agg(
    ['count', 'sum', 'mean', 'median']
).round(2)
display(gender_stats)

## Section 6: Outlier Detection

In [None]:
def detect_outliers_iqr(data, column):
    """
    Detect outliers in one column using the IQR (Tukey fence) method.

    A row is an outlier when its value lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR. A short report is printed.

    Parameters:
        data (DataFrame): Source data.
        column (str): Name of the numeric column to analyse.

    Returns:
        tuple: (outliers, lower_bound, upper_bound) where ``outliers`` is the
        subset of ``data`` outside the fences. For an empty frame the bounds
        are NaN and the outlier percentage is reported as 0.
    """
    values = data[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = data[(values < lower_bound) | (values > upper_bound)]

    # An empty frame would otherwise raise ZeroDivisionError
    outlier_pct = (len(outliers) / len(data)) * 100 if len(data) > 0 else 0.0

    print(f"IQR Method Results for '{column}':")
    print(f"  Q1: {Q1:,.2f}")
    print(f"  Q3: {Q3:,.2f}")
    print(f"  IQR: {IQR:,.2f}")
    print(f"  Lower Bound: {lower_bound:,.2f}")
    print(f"  Upper Bound: {upper_bound:,.2f}")
    print(f"  Number of Outliers: {len(outliers)}")
    print(f"  Outlier Percentage: {outlier_pct:.2f}%")

    return outliers, lower_bound, upper_bound

print("=" * 60, "OUTLIER DETECTION", "=" * 60, sep="\n")
# Keep the flagged rows and both fences for later inspection
outliers_iqr, lb, ub = detect_outliers_iqr(df_clean, column='Valeur')

In [None]:
def detect_outliers_zscore(data, column, threshold=3):
    """
    Detect outliers in one column using the Z-score method.

    Z-scores are computed over the non-missing values only. A short report
    is printed.

    Parameters:
        data (DataFrame): Source data.
        column (str): Name of the numeric column to analyse.
        threshold (float): Absolute Z-score above which a value is flagged.

    Returns:
        ndarray: Integer row positions *within ``data``* of the outliers
        (usable with ``data.iloc``). Missing values are never flagged and
        do not shift the positions of the rows that follow them.
    """
    values = data[column]
    valid_mask = values.notna().to_numpy()
    # Positions in `data` of the rows that take part in the computation
    valid_positions = np.flatnonzero(valid_mask)

    if valid_positions.size == 0:
        outliers_idx = valid_positions  # empty integer array
    else:
        z_scores = np.abs(zscore(values[valid_mask].to_numpy(dtype=float)))
        # Map hits back to positions in the original frame
        outliers_idx = valid_positions[z_scores > threshold]

    # An empty frame would otherwise raise ZeroDivisionError
    outlier_pct = (len(outliers_idx) / len(data)) * 100 if len(data) > 0 else 0.0

    print(f"\nZ-Score Method Results for '{column}' (threshold={threshold}):")
    print(f"  Number of Outliers: {len(outliers_idx)}")
    print(f"  Outlier Percentage: {outlier_pct:.2f}%")

    return outliers_idx

# Run the Z-score detector on 'Valeur' with its default threshold of 3
outliers_zscore = detect_outliers_zscore(df_clean, 'Valeur')

In [None]:
# IQR outlier analysis restricted to a single indicator: male condoms distributed
print("", "=" * 60, "OUTLIER ANALYSIS: MALE CONDOMS DISTRIBUTED", "=" * 60, sep="\n")
is_male_condoms = df_clean['indicateurs'] == 'Nombre de préservatifs masculins distribués'
preservatifs_data = df_clean[is_male_condoms]
# Skip the analysis entirely when the indicator is absent from the data
if not preservatifs_data.empty:
    outliers_preserv, _, _ = detect_outliers_iqr(preservatifs_data, 'Valeur')
    print("\nSample outliers:")
    preview_columns = ['provinces', 'annees', 'trimestres', 'Valeur']
    display(outliers_preserv[preview_columns].head(10))

## Section 7: Data Aggregation and Pivot Tables

In [None]:
# Cross-tabulate total 'Valeur': one row per province, one column per year
print("=" * 60, "PIVOT TABLE: TOTAL VALUES BY PROVINCE AND YEAR", "=" * 60, sep="\n")
pivot_province_year = df_clean.pivot_table(
    values='Valeur',
    index='provinces',
    columns='annees',
    aggfunc='sum',
    fill_value=0,  # province/year combinations without data show 0
)
display(pivot_province_year)

In [None]:
# Cross-tabulate total 'Valeur': one row per gender, one column per age group
print("=" * 60, "PIVOT TABLE: TOTAL VALUES BY GENDER AND AGE GROUP", "=" * 60, sep="\n")
pivot_gender_age = df_clean.pivot_table(
    values='Valeur',
    index='sexes',
    columns='tranches_ages',
    aggfunc='sum',
    fill_value=0,  # gender/age combinations without data show 0
)
display(pivot_gender_age)

## Section 8: UNAIDS 95-95-95 Cascade Analysis

In [None]:
def calculate_unaids_95_95_95(df):
    """
    Compute totals and rates for the UNAIDS 95-95-95 cascade.

    The totals are the sums of 'Valeur' for four indicators (tested,
    diagnosed HIV+, on TAR, virally suppressed). Two rates are derived:
        - 2nd 95: total on TAR as a percentage of total diagnosed
        - 3rd 95: total virally suppressed as a percentage of total on TAR
    No first-95 rate is computed here.

    Parameters:
        df (DataFrame): Data with 'indicateurs' and 'Valeur' columns.

    Returns:
        dict: The four totals, plus each rate only when its denominator is
        greater than zero.
    """
    def total_for(indicator):
        # Sum 'Valeur' over the rows whose indicator matches exactly
        return df.loc[df['indicateurs'] == indicator, 'Valeur'].sum()

    tested = total_for('Nombre de clients testés')
    diagnosed = total_for('Nombre de clients diagnostiqués VIH+')
    on_tar = total_for('Nombre de PVVIH sous TAR')
    viral_suppressed = total_for('Nombre  de PVVIH sous TAR qui ont supprimée la charge virale')

    results = {
        'Total Tested': tested,
        'Total Diagnosed HIV+': diagnosed,
        'Total on TAR': on_tar,
        'Total with Viral Suppression': viral_suppressed,
    }

    # A rate is reported only when it is defined
    if diagnosed > 0:
        results['2nd 95 (Treatment Rate %)'] = (on_tar / diagnosed) * 100
    if on_tar > 0:
        results['3rd 95 (Viral Suppression Rate %)'] = (viral_suppressed / on_tar) * 100

    return results

print("=" * 60, "UNAIDS 95-95-95 CASCADE ANALYSIS", "=" * 60, sep="\n")
unaids_results = calculate_unaids_95_95_95(df_clean)
for label, amount in unaids_results.items():
    # Floats get two decimals; everything else only a thousands separator
    number_format = ',.2f' if isinstance(amount, float) else ','
    print(f"  {label}: {amount:{number_format}}")

In [None]:
# Cascade totals broken down by year
print("", "=" * 60, "UNAIDS CASCADE BY YEAR", "=" * 60, sep="\n")

# Output column label -> exact indicator name as it appears in the data
cascade_labels = {
    'Tested': 'Nombre de clients testés',
    'Diagnosed HIV+': 'Nombre de clients diagnostiqués VIH+',
    'On TAR': 'Nombre de PVVIH sous TAR',
    'Viral Suppression': 'Nombre  de PVVIH sous TAR qui ont supprimée la charge virale',
}

cascade_by_year = []
for year in sorted(df_clean['annees'].unique()):
    year_data = df_clean[df_clean['annees'] == year]
    row = {'Year': year}
    for label, indicator in cascade_labels.items():
        row[label] = year_data.loc[year_data['indicateurs'] == indicator, 'Valeur'].sum()
    cascade_by_year.append(row)

cascade_df = pd.DataFrame(cascade_by_year)
display(cascade_df)

## Section 9: Advanced Statistical Tests

In [None]:
# Normality Test (D'Agostino-Pearson)
print("=" * 60)
print("NORMALITY TEST (D'Agostino-Pearson)")
print("=" * 60)

# The sample is capped at 5000 values. The cap is compared with the number of
# non-null values actually being sampled: asking for more values than the
# series holds would make .sample() raise.
valeur_non_null = df_clean['Valeur'].dropna()
sample_data = valeur_non_null.sample(min(5000, len(valeur_non_null)), random_state=42)

# Best effort: a failing test is reported instead of stopping the notebook
try:
    stat, p_value = normaltest(sample_data)
    print(f"  Test Statistic: {stat:.4f}")
    print(f"  P-value: {p_value:.4e}")
    print(f"  Result: {'Normal distribution' if p_value > 0.05 else 'Not normal distribution'}")
except Exception as e:
    print(f"  Test could not be performed: {e}")

In [None]:
# Independent two-sample t-test on 'Valeur': 'Masculin' rows vs 'Féminin' rows
print("=" * 60, "T-TEST: COMPARING VALUES BETWEEN GENDERS", "=" * 60, sep="\n")

male_data = df_clean.loc[df_clean['sexes'] == 'Masculin', 'Valeur'].dropna()
female_data = df_clean.loc[df_clean['sexes'] == 'Féminin', 'Valeur'].dropna()

# The test needs observations in both groups
if not male_data.empty and not female_data.empty:
    # At most 1000 observations per group, drawn with a fixed seed
    male_sample = male_data.sample(min(1000, len(male_data)), random_state=42)
    female_sample = female_data.sample(min(1000, len(female_data)), random_state=42)
    t_stat, p_value = stats.ttest_ind(male_sample, female_sample)
    verdict = 'Significant difference' if p_value < 0.05 else 'No significant difference'
    print(f"  T-statistic: {t_stat:.4f}")
    print(f"  P-value: {p_value:.4e}")
    print(f"  Result: {verdict}")

In [None]:
# Chi-square test of independence between province and gender
print("=" * 60, "CHI-SQUARE TEST: PROVINCE VS GENDER INDEPENDENCE", "=" * 60, sep="\n")

# Row counts for every province/gender combination
contingency = pd.crosstab(df_clean['provinces'], df_clean['sexes'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
verdict = 'Dependent' if p_value < 0.05 else 'Independent'

print(
    f"  Chi-square statistic: {chi2:.4f}",
    f"  P-value: {p_value:.4e}",
    f"  Degrees of freedom: {dof}",
    f"  Result: {verdict}",
    sep="\n",
)

## Section 10: Data Visualization

In [None]:
# Chart 1: Value Distribution (Log-transformed)
# log1p keeps zero values plottable. Missing values are dropped once so the
# histogram and the box plot are drawn from exactly the same data.
log_values = np.log1p(df_clean['Valeur'].dropna())

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(log_values, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
axes[0].set_xlabel('Log(Value + 1)', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Distribution of Values (Log-transformed)', fontsize=14)
# The median marker is transformed the same way as the plotted values
axes[0].axvline(np.log1p(df_clean['Valeur'].median()), color='red', linestyle='--', label='Median')
axes[0].legend()

# Box plot
axes[1].boxplot(log_values, vert=True)
axes[1].set_ylabel('Log(Value + 1)', fontsize=12)
axes[1].set_title('Box Plot of Values (Log-transformed)', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Chart 2: Yearly Trend
# Total 'Valeur' per year, expressed in billions
yearly_totals = df_clean.groupby('annees')['Valeur'].sum() / 1e9

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(yearly_totals.index, yearly_totals.values, color='teal', edgecolor='black')
ax.set_title('Total Values by Year', fontsize=14)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Total Value (Billions)', fontsize=12)

# Write each total just above its bar, centred horizontally
for rect, total in zip(bars, yearly_totals.values):
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, rect.get_height() + 0.1, f'{total:.2f}B',
            ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Chart 3: Top 10 Provinces
# Ten largest provincial totals of 'Valeur', expressed in billions
province_totals = df_clean.groupby('provinces')['Valeur'].sum()
top_provinces = province_totals.nlargest(10) / 1e9

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(top_provinces.index, top_provinces.values, color='coral', edgecolor='black')
ax.set_title('Top 10 Provinces by Total Value', fontsize=14)
ax.set_xlabel('Total Value (Billions)', fontsize=12)
ax.set_ylabel('Province', fontsize=12)
# Flip the axis so the largest province is drawn at the top
ax.invert_yaxis()

plt.tight_layout()
plt.show()

In [None]:
# Chart 4: Gender Distribution
# Total 'Valeur' per gender, shown as shares (pie) and as absolute totals (bar)
gender_totals = df_clean.groupby('sexes')['Valeur'].sum()

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Three colors are defined and shared by both panels.
# NOTE(review): if 'sexes' has more than three categories the colors are
# presumably reused - confirm the category count.
colors = ['#ff9999', '#66b3ff', '#99ff99']
# Left panel: percentage share; `explode` has one entry per category so every
# wedge is offset by the same small amount
axes[0].pie(gender_totals.values, labels=gender_totals.index, autopct='%1.1f%%',
            colors=colors, startangle=90, explode=[0.02]*len(gender_totals))
axes[0].set_title('Distribution by Gender', fontsize=14)

# Right panel: the same totals, expressed in billions
axes[1].bar(gender_totals.index, gender_totals.values / 1e9, color=colors, edgecolor='black')
axes[1].set_xlabel('Gender', fontsize=12)
axes[1].set_ylabel('Total Value (Billions)', fontsize=12)
axes[1].set_title('Total Values by Gender', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Chart 5: Age Group Analysis
# Total 'Valeur' per age group in billions, sorted from smallest to largest
age_group_sums = df_clean.groupby('tranches_ages')['Valeur'].sum()
age_totals = age_group_sums.sort_values() / 1e9

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(age_totals.index, age_totals.values, color='mediumpurple', edgecolor='black')
ax.set_title('Values by Age Group', fontsize=14)
ax.set_xlabel('Total Value (Billions)', fontsize=12)
ax.set_ylabel('Age Group', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Chart 6: Quarterly Heatmap
# Total 'Valeur' for every year (rows) and quarter (columns), in billions
quarterly_sums = pd.pivot_table(
    df_clean,
    values='Valeur',
    index='annees',
    columns='trimestres',
    aggfunc='sum',
)
quarterly_pivot = quarterly_sums / 1e9

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    quarterly_pivot,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    ax=ax,
    cbar_kws={'label': 'Value (Billions)'},
)
ax.set_title('Quarterly Values by Year (Billions)', fontsize=14)
ax.set_xlabel('Quarter', fontsize=12)
ax.set_ylabel('Year', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Chart 7: Scatter Plot for Outlier Detection - Male Condoms Distribution
# Work on a copy of the rows for this single indicator
preservatifs = df_clean[df_clean['indicateurs'] == 'Nombre de préservatifs masculins distribués'].copy()

# Nothing is drawn when the indicator is absent from the data
if len(preservatifs) > 0:
    fig, ax = plt.subplots(figsize=(12, 6))

    # x is the row's position within the filtered subset; color encodes the year
    scatter = ax.scatter(range(len(preservatifs)), preservatifs['Valeur'],
                        c=preservatifs['annees'], cmap='viridis', alpha=0.6, s=30)

    # Outlier threshold line (IQR method)
    # Only the upper fence (Q3 + 1.5 * IQR) is drawn
    Q1 = preservatifs['Valeur'].quantile(0.25)
    Q3 = preservatifs['Valeur'].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    ax.axhline(y=upper_bound, color='red', linestyle='--', linewidth=2,
               label=f'Outlier Threshold: {upper_bound:,.0f}')

    ax.set_xlabel('Observation Index', fontsize=12)
    ax.set_ylabel('Number of Male Condoms Distributed', fontsize=12)
    ax.set_title('Outlier Detection: Male Condom Distribution', fontsize=14)
    # The legend shows the threshold line; the colorbar explains the year colors
    ax.legend()
    plt.colorbar(scatter, label='Year')

    plt.tight_layout()
    plt.show()

In [None]:
# Chart 8: Province Yearly Trends (Top 5)
# The five provinces with the largest overall total of 'Valeur'
province_totals = df_clean.groupby('provinces')['Valeur'].sum()
top_5_provinces = province_totals.nlargest(5).index

fig, ax = plt.subplots(figsize=(12, 6))
# One line per province: yearly totals in billions
for province in top_5_provinces:
    province_rows = df_clean[df_clean['provinces'] == province]
    prov_data = province_rows.groupby('annees')['Valeur'].sum() / 1e9
    ax.plot(prov_data.index, prov_data.values,
            marker='o', linewidth=2, markersize=8, label=province)

ax.set_title('Yearly Trends - Top 5 Provinces', fontsize=14)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Total Value (Billions)', fontsize=12)
ax.grid(True, alpha=0.3)
# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Chart 9: UNAIDS 95-95-95 Cascade
# Total 'Valeur' for each of the four cascade indicators; indicator names are
# matched exactly as written here (including the double space in the last one)
cascade_indicators = {
    'Tested': df_clean[df_clean['indicateurs'] == 'Nombre de clients testés']['Valeur'].sum(),
    'Diagnosed HIV+': df_clean[df_clean['indicateurs'] == 'Nombre de clients diagnostiqués VIH+']['Valeur'].sum(),
    'On TAR': df_clean[df_clean['indicateurs'] == 'Nombre de PVVIH sous TAR']['Valeur'].sum(),
    'Viral Suppression': df_clean[df_clean['indicateurs'] == 'Nombre  de PVVIH sous TAR qui ont supprimée la charge virale']['Valeur'].sum()
}

fig, ax = plt.subplots(figsize=(10, 6))
# One color per cascade step; bar heights are expressed in millions
colors = ['#3498db', '#e74c3c', '#2ecc71', '#9b59b6']
bars = ax.bar(cascade_indicators.keys(), [v/1e6 for v in cascade_indicators.values()],
              color=colors, edgecolor='black')

ax.set_ylabel('Value (Millions)', fontsize=12)
ax.set_title('UNAIDS 95-95-95 Cascade Indicators (Total)', fontsize=14)

# Label each bar with its value in millions, 0.5 axis units above the bar top
for bar, val in zip(bars, cascade_indicators.values()):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
            f'{val/1e6:.1f}M', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Chart 10: Year-to-Year Correlation Matrix
# Provinces are the observations, years are the variables: each cell of the
# matrix correlates the provincial totals of two years
province_year_pivot = pd.pivot_table(
    df_clean,
    values='Valeur',
    index='provinces',
    columns='annees',
    aggfunc='sum',
)
correlation_matrix = province_year_pivot.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('Year-to-Year Correlation Matrix', fontsize=14)

plt.tight_layout()
plt.show()

## Section 11: Custom Analysis Functions

In [None]:
def top_provinces_by_indicator(df, indicator, n=5):
    """
    Rank provinces by their total 'Valeur' for one indicator.

    The ranking is also printed, one numbered line per province.

    Parameters:
        df (DataFrame): Data with 'indicateurs', 'provinces' and 'Valeur'.
        indicator (str): Exact indicator name to filter on.
        n (int): Number of provinces to keep.

    Returns:
        Series: The ``n`` largest provincial totals, indexed by province.
    """
    matching_rows = df.loc[df['indicateurs'] == indicator]
    ranking = matching_rows.groupby('provinces')['Valeur'].sum().nlargest(n)

    # The heading shows at most the first 50 characters of the indicator name
    print(f"Top {n} Provinces for '{indicator[:50]}...':")
    for rank, (province, total) in enumerate(ranking.items(), start=1):
        print(f"  {rank}. {province}: {total:,.0f}")

    return ranking

# Example usage
# Rank the top 5 provinces for two indicators. The second call is the last
# expression of the cell, so its returned Series is also rendered as output.
print("=" * 60)
top_provinces_by_indicator(df_clean, 'Nombre de préservatifs masculins distribués', 5)
print()
top_provinces_by_indicator(df_clean, 'Nombre de PVVIH sous TAR', 5)

In [None]:
def year_over_year_comparison(df, indicator):
    """
    Print and return the yearly totals of one indicator.

    Each year is compared with the previous year *present in the data*, so a
    gap in the series (for example 2020 followed by 2022) does not raise a
    KeyError; the printed line names the year actually used for comparison.
    The earliest year is printed as the baseline. When the previous total is
    not positive the change is reported as 0.

    Parameters:
        df (DataFrame): Data with 'indicateurs', 'annees' and 'Valeur'.
        indicator (str): Exact indicator name to filter on.

    Returns:
        Series: Total 'Valeur' per year for the indicator, indexed by year.
    """
    yearly_data = df[df['indicateurs'] == indicator].groupby('annees')['Valeur'].sum()

    print(f"Year-over-Year Comparison: {indicator[:50]}...")
    prev_year = None
    for year in sorted(yearly_data.index):
        value = yearly_data.loc[year]
        if prev_year is None:
            print(f"  {year}: {value:,.0f} (baseline)")
        else:
            prev_value = yearly_data.loc[prev_year]
            # A percentage change is undefined for a zero (or negative) base
            change = ((value - prev_value) / prev_value) * 100 if prev_value > 0 else 0
            print(f"  {year}: {value:,.0f} ({change:+.2f}% vs {prev_year})")
        prev_year = year

    return yearly_data

# Example usage
# Keep the yearly totals for the 'Nombre de PVVIH sous TAR' indicator
print("=" * 60)
yoy_tar = year_over_year_comparison(df_clean, 'Nombre de PVVIH sous TAR')

## Section 12: Data Export

In [None]:
# Export cleaned data
# Every CSV is written as UTF-8 with a byte-order mark ('utf-8-sig')
df_clean.to_csv('datavih_cleaned.csv', index=False, encoding='utf-8-sig')
print("Cleaned data exported to: datavih_cleaned.csv")

# Export summary statistics: total, mean and row count per province and year
valeur_by_province_year = df_clean.groupby(['provinces', 'annees'])['Valeur']
summary_stats = valeur_by_province_year.agg(
    Total='sum',
    Average='mean',
    Count='count',
).round(2)
summary_stats.to_csv('summary_by_province_year.csv', encoding='utf-8-sig')
print("Summary statistics exported to: summary_by_province_year.csv")

# Export UNAIDS indicators data: only the rows of the four cascade indicators
unaids_indicators = [
    'Nombre de clients testés',
    'Nombre de clients diagnostiqués VIH+',
    'Nombre de PVVIH sous TAR',
    'Nombre  de PVVIH sous TAR qui ont supprimée la charge virale',
]
is_cascade_row = df_clean['indicateurs'].isin(unaids_indicators)
unaids_df = df_clean[is_cascade_row]
unaids_df.to_csv('unaids_95_95_95_data.csv', index=False, encoding='utf-8-sig')
print("UNAIDS 95-95-95 data exported to: unaids_95_95_95_data.csv")

## Section 13: Analysis Summary

In [None]:
print("=" * 60)
print("ANALYSIS SUMMARY")
print("=" * 60)

# The period is read from the data once and reused, so the overview and the
# key findings cannot disagree about the years covered.
first_year = df_clean['annees'].min()
last_year = df_clean['annees'].max()

print(f"""
DATASET OVERVIEW:
----------------
- Total Records: {len(df_clean):,}
- Total Columns: {len(df_clean.columns)}
- Time Period: {first_year} - {last_year}
- Provinces Covered: {df_clean['provinces'].nunique()}
- Indicators Tracked: {df_clean['indicateurs'].nunique()}

KEY FINDINGS:
-------------
1. Top 3 Provinces by Total Values:
   {', '.join(df_clean.groupby('provinces')['Valeur'].sum().nlargest(3).index.tolist())}

2. Yearly Growth Trend:
   - The data shows the evolution of HIV/AIDS response from {first_year} to {last_year}

3. UNAIDS 95-95-95 Cascade:
   - Total Tested: {unaids_results.get('Total Tested', 0):,.0f}
   - Total on Treatment (TAR): {unaids_results.get('Total on TAR', 0):,.0f}
   - Viral Suppression: {unaids_results.get('Total with Viral Suppression', 0):,.0f}

4. Data Quality:
   - Missing values handled appropriately
   - Outliers identified and documented
   - Data cleaned and ready for further analysis
""")

print("=" * 60)
print("ANALYSIS COMPLETE")
print("=" * 60)

---

**Author:** Bienvenu Mwenyemali  
**GitHub:** [bmwenyemali](https://github.com/bmwenyemali)  
**Repository:** [vihdataproDataAnalysis](https://github.com/bmwenyemali/vihdataproDataAnalysis)