<div style="background-color: #e6f2ff; padding: 10px; border-radius: 6px;">
  <h2><b>Title: Adolescent Pregnancy - Data Cleaning, Labeling &amp; Exploration</b></h2>
  Author: Dr. Elsie Akwara, PhD, MPH<br>
  Date: May 3rd, 2024 <br>
  Description: Clean and explore Kenya DHS data from the 2008, 2014, and 2022 surveys to prepare for decomposition analysis.
</div>

In [None]:
# 1. Import libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# 2. Load the KDHS survey extracts (replace the paths below with your own
#    dataset file names). One CSV per survey year, read oldest first.
df_2008, df_2014, df_2022 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/kdhs_2008.csv',
        'data/kdhs_2014.csv',
        'data/kdhs_2022.csv',
    )
)

# 3. Define a function to clean each year's data
def clean_kdhs(df, *, max_adolescent_age=19):
    """Return a cleaned copy of one KDHS survey-year DataFrame.

    Column names are normalised (lower-cased, spaces replaced with
    underscores), a binary ``adolescent_pregnancy`` flag is derived from
    ``age_at_first_birth``, and only the socio-demographic columns used
    downstream are kept. The input DataFrame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw survey data. After name normalisation it must contain
        ``age_at_first_birth`` plus ``respondent_id``, ``age`` and the
        categorical columns listed in ``cat_vars`` below.
    max_adolescent_age : int, default 19
        Oldest age at first birth (inclusive) that is flagged as an
        adolescent pregnancy.

    Returns
    -------
    pandas.DataFrame
        New frame with the kept columns, categorical variables stored as
        ``category`` dtype (missing values replaced by ``'Unknown'``),
        ``age`` cast to int, and ``adolescent_pregnancy`` coded 1/0.

    Raises
    ------
    KeyError
        If any required column is missing after name normalisation.
    """
    cat_vars = ['education_level', 'wealth_index', 'residence', 'region',
                'media_exposure', 'marital_status', 'employment_status']
    source_cols = ['respondent_id', 'age'] + cat_vars

    # Rename on a shallow copy so the caller's column labels stay untouched.
    renamed = df.copy(deep=False)
    renamed.columns = df.columns.str.lower().str.replace(' ', '_')

    # Fail early with a readable message instead of a KeyError from deep
    # inside pandas indexing.
    missing = [col for col in source_cols + ['age_at_first_birth']
               if col not in renamed.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Adolescent childbearing covers first births up to and including age 19,
    # so the comparison is inclusive. Women with no recorded first birth
    # (NaN) compare as False and are coded 0, without writing a sentinel
    # value into the data.
    is_adolescent = (
        renamed['age_at_first_birth'].le(max_adolescent_age).fillna(False)
    )

    # Deep-copy the selection so the assignments below write to an
    # independent frame (no SettingWithCopyWarning, no effect on the input).
    cleaned = renamed[source_cols].copy()
    cleaned['adolescent_pregnancy'] = np.where(is_adolescent, 1, 0)

    # Missing categorical answers become an explicit 'Unknown' level.
    for col in cat_vars:
        cleaned[col] = cleaned[col].fillna('Unknown').astype('category')

    # Age as int
    cleaned['age'] = cleaned['age'].astype('int')

    return cleaned

# 4. Clean datasets
df_2008_clean = clean_kdhs(df_2008)
df_2014_clean = clean_kdhs(df_2014)
df_2022_clean = clean_kdhs(df_2022)

# 5. Save cleaned datasets for STATA
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path('cleaned_data').mkdir(parents=True, exist_ok=True)
df_2008_clean.to_csv('cleaned_data/kdhs_2008_clean.csv', index=False)
df_2014_clean.to_csv('cleaned_data/kdhs_2014_clean.csv', index=False)
df_2022_clean.to_csv('cleaned_data/kdhs_2022_clean.csv', index=False)

# 6. Exploratory Data Analysis: Adolescent Pregnancy Prevalence Over Years
def adolescent_pregnancy_rate(df, year):
    """Print and return the adolescent pregnancy rate for one survey year.

    The rate is the unweighted mean of the 0/1 ``adolescent_pregnancy``
    column, expressed as a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``adolescent_pregnancy`` column.
    year : int
        Survey year, used only to label the printed line.

    Returns
    -------
    float
        The rate in percent.
    """
    flagged_share = df['adolescent_pregnancy'].mean()
    rate = flagged_share * 100
    print(f"Adolescent pregnancy rate in {year}: {rate:.2f}%")
    return rate

# Compute (and print) the rate for each survey year, oldest first.
rate_2008, rate_2014, rate_2022 = (
    adolescent_pregnancy_rate(frame, survey_year)
    for survey_year, frame in (
        (2008, df_2008_clean),
        (2014, df_2014_clean),
        (2022, df_2022_clean),
    )
)

# 7. Plot adolescent pregnancy rate by year
years = [2008, 2014, 2022]
rates = [rate_2008, rate_2014, rate_2022]

fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(x=years, y=rates, palette="Blues_d", ax=ax)
ax.set_title("Adolescent Pregnancy Rate in Kenya (KDHS)")
ax.set_ylabel("Rate (%)")
ax.set_xlabel("Survey Year")
# Leave 5 percentage points of headroom above the tallest bar.
ax.set_ylim(0, max(rates) + 5)
plt.show()

# 8. Cross-tabulation example: Education vs Adolescent Pregnancy (2022)
# Row-normalised, so each education level's row sums to 100%.
education_2022 = df_2022_clean['education_level']
pregnancy_2022 = df_2022_clean['adolescent_pregnancy']
ct = pd.crosstab(education_2022, pregnancy_2022, normalize='index') * 100
print(ct.round(2))

# 9. Visualize education vs adolescent pregnancy (2022)
# Column 1 of the crosstab holds the share flagged adolescent_pregnancy == 1.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(x=ct.index, y=ct[1], palette="Set2", ax=ax)
ax.set_title("Adolescent Pregnancy Rate by Education Level (2022)")
ax.set_ylabel("Percent with Adolescent Pregnancy")
ax.set_xlabel("Education Level")
plt.xticks(rotation=45)
plt.show()