<div style="background-color: #e6f2ff; padding: 10px; border-radius: 6px;">
  <h2><b>Title: Adolescent Pregnancy Decompositional Analysis</b></h2>
  Author: Dr. Elsie Akwara, PhD, MPH<br>
  Date: May 3rd, 2024 <br>
  Description: Decomposes changes in adolescent pregnancy (&lt;19 years)
               using the cleaned 2008, 2014, and 2022 KDHS datasets.
</div>

In [None]:
# 1. Load packages
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# 2. Define logistic regression function
def logistic_regression(df, outcome_var, predictors):
    """Fit a logistic regression of ``outcome_var`` on ``predictors``.

    Args:
        df: DataFrame holding the outcome column and every predictor column.
        outcome_var: Name of the outcome column.
        predictors: List of predictor column names; each must be numeric or
            boolean (e.g. dummy-encoded).

    Returns:
        The fitted statsmodels Logit results object.
    """
    # Cast to float first: pd.get_dummies yields bool columns on recent
    # pandas, and a bool/float mix becomes an object-dtype matrix that
    # statsmodels refuses to fit.
    X = sm.add_constant(df[predictors].astype(float))
    y = df[outcome_var]
    return sm.Logit(y, X).fit(disp=False)

# 3. Define Fairlie Decomposition function
def fairlie_decomposition(df1, df2, outcome_var, predictors, n_sim=1000,
                          random_state=None):
    """Fairlie decomposition of the gap in predicted outcome rates.

    A logit model is fitted on the pooled data. For each simulation draw,
    equal-sized random subsamples of the two groups are rank-matched on their
    predicted probability; the predictors are then switched one at a time
    from their ``df1`` values to their ``df2`` values, and the resulting change
    in the mean predicted probability is credited to the switched predictor.
    Contributions are averaged over ``n_sim`` draws, and the switching order
    is re-randomised on every draw because the result is path dependent.

    Note: each column in ``predictors`` is switched on its own, so the dummy
    columns of one categorical variable receive separate contributions; sum
    them to obtain the contribution of the underlying variable.

    Args:
        df1: Reference group (e.g. earlier survey); needs the outcome and all
            predictor columns.
        df2: Comparison group (e.g. later survey); same columns as ``df1``.
        outcome_var: Name of the binary outcome column.
        predictors: List of numeric/boolean predictor column names.
        n_sim: Number of matched-subsample draws to average over.
        random_state: Optional seed (or anything accepted by
            ``numpy.random.default_rng``) for reproducible draws.

    Returns:
        tuple: ``(diff, contribution_df)`` where ``diff`` is the mean predicted
        probability of ``df2`` minus that of ``df1`` under the pooled model,
        and ``contribution_df`` has one row per predictor and a single
        ``'Contribution'`` column, sorted in descending order. Within each
        draw the contributions sum to the subsample counterpart of ``diff``.

    Raises:
        ValueError: If ``n_sim`` is below 1 or either group is empty.
    """
    predictors = list(predictors)
    if n_sim < 1:
        raise ValueError("n_sim must be at least 1")
    n_match = min(len(df1), len(df2))
    if n_match == 0:
        raise ValueError("df1 and df2 must both contain at least one row")

    # Pooled model; float cast avoids an object-dtype design matrix when the
    # dummy columns are boolean.
    combined = pd.concat([df1, df2], axis=0, ignore_index=True)
    pooled_X = sm.add_constant(combined[predictors].astype(float))
    # add_constant skips the intercept when a column is already constant;
    # record what happened so the per-group designs mirror the pooled one
    # even if a column happens to be constant inside a single group.
    has_const = pooled_X.shape[1] > len(predictors)
    model = sm.Logit(combined[outcome_var], pooled_X).fit(disp=False)

    def _design(frame):
        """Build a design matrix with the same columns as the pooled fit."""
        X = frame[predictors].astype(float)
        if has_const:
            X.insert(0, 'const', 1.0)
        return X

    X1 = _design(df1)
    X2 = _design(df2)
    diff = model.predict(X2).mean() - model.predict(X1).mean()

    beta = np.asarray(model.params, dtype=float)
    cdf = model.model.cdf  # logistic CDF applied to the linear predictor
    arr1 = X1.to_numpy()
    arr2 = X2.to_numpy()
    offset = 1 if has_const else 0  # position shift caused by the intercept
    n_vars = len(predictors)
    rng = np.random.default_rng(random_state)

    totals = np.zeros(n_vars)
    for _ in range(n_sim):
        # Equal-sized subsamples so observations can be paired one-to-one.
        s1 = arr1[rng.choice(len(arr1), size=n_match, replace=False)]
        s2 = arr2[rng.choice(len(arr2), size=n_match, replace=False)]
        # Pair observations by rank of predicted probability (monotone in
        # the linear predictor).
        s1 = s1[np.argsort(s1 @ beta, kind='stable')]
        s2 = s2[np.argsort(s2 @ beta, kind='stable')]

        linear = s1 @ beta
        previous = cdf(linear).mean()
        for j in rng.permutation(n_vars):
            col = j + offset
            # Switch this predictor from its group-1 to its group-2 values,
            # keeping all predictors switched so far at group-2 values.
            linear = linear + (s2[:, col] - s1[:, col]) * beta[col]
            current = cdf(linear).mean()
            totals[j] += current - previous
            previous = current

    contrib = dict(zip(predictors, totals / n_sim))
    contribution_df = pd.DataFrame.from_dict(contrib, orient='index', columns=['Contribution'])
    contribution_df = contribution_df.sort_values('Contribution', ascending=False)
    return diff, contribution_df

# 4. Define encoding function
def encode_predictors(df, predictors):
    """Return a copy of ``df`` with text/categorical predictors dummy-coded.

    Each predictor whose dtype is ``object`` or ``category`` is removed and
    replaced by indicator columns named ``<predictor>_<level>``, appended at
    the end of the frame; the first level is dropped as the reference.
    Predictors of any other dtype are left untouched, and ``df`` itself is
    not modified.

    Args:
        df: Input DataFrame containing every column named in ``predictors``.
        predictors: Column names to inspect and, where needed, encode.

    Returns:
        A new DataFrame with the encoded columns.
    """
    encoded = df.copy()
    for name in predictors:
        kind = encoded[name].dtype
        if not (kind == 'object' or kind.name == 'category'):
            # Already usable as-is; nothing to encode.
            continue
        indicators = pd.get_dummies(encoded[name], prefix=name, drop_first=True)
        remaining = encoded.drop(columns=[name])
        encoded = pd.concat([remaining, indicators], axis=1)
    return encoded

# 5. Load cleaned data (Update file paths accordingly)
df_2008 = pd.read_csv('cleaned_data/kdhs_2008_clean.csv')
df_2014 = pd.read_csv('cleaned_data/kdhs_2014_clean.csv')
df_2022 = pd.read_csv('cleaned_data/kdhs_2022_clean.csv')

outcome = 'adolescent_pregnancy'
predictors = ['education_level', 'wealth_index', 'residence', 'region', 
              'media_exposure', 'marital_status', 'employment_status']

# 6. Encode categorical predictors
# Encode the three waves together: encoding each wave separately lets
# drop_first pick a different reference level (or produce different dummy
# columns) whenever a category is absent from one wave, which breaks the
# cross-wave models and decompositions.
pooled_waves = pd.concat([df_2008, df_2014, df_2022], keys=[2008, 2014, 2022], sort=False)
pooled_enc = encode_predictors(pooled_waves, predictors)

df_2008_enc = pooled_enc.loc[2008].copy()
df_2014_enc = pooled_enc.loc[2014].copy()
df_2022_enc = pooled_enc.loc[2022].copy()

# Model only the declared predictors: either a predictor that was left
# un-encoded, or a dummy column created by the encoding step. Any other
# column present in the CSVs (identifiers, weights, ...) is excluded.
final_predictors = [
    col for col in pooled_enc.columns
    if col in predictors or col not in pooled_waves.columns
]

# 7. Fit one logistic regression per survey wave, then print each summary
model_2008, model_2014, model_2022 = (
    logistic_regression(wave, outcome, final_predictors)
    for wave in (df_2008_enc, df_2014_enc, df_2022_enc)
)

# Each heading is followed by its model summary on the next line.
print("Logistic regression for 2008", model_2008.summary(), sep="\n")
print("\nLogistic regression for 2014", model_2014.summary(), sep="\n")
print("\nLogistic regression for 2022", model_2022.summary(), sep="\n")

# 8. Decomposition of the change between the 2008 and 2014 surveys
diff_0814, contrib_0814 = fairlie_decomposition(
    df_2008_enc, df_2014_enc, outcome, final_predictors
)
print(
    f"\nChange in adolescent pregnancy rate (2014 - 2008): {diff_0814:.4f}",
    "Variable contributions (2008 vs 2014):",
    contrib_0814,
    sep="\n",
)

# Horizontal bar chart of the per-variable contributions.
contrib_0814.plot.barh(legend=False, figsize=(8, 6), color='skyblue')
plt.gca().set_title('Fairlie Decomposition: 2014 vs 2008')
plt.gca().set_xlabel('Contribution')
plt.tight_layout()
plt.gca().grid(True)
plt.show()

# 9. Decomposition: 2014 vs 2022
diff_1422, contrib_1422 = fairlie_decomposition(df_2014_enc, df_2022_enc, outcome, final_predictors)
print(f"\nChange in adolescent pregnancy rate (2022 - 2014): {diff_1422:.4f}")
print("Variable contributions (2014 vs 2022):")
print(contrib_1422)

# Debug aid: show the working directory (the data files above are loaded via
# relative paths). These two statements were previously fused onto the
# preceding print line, which was a syntax error.
import os
print(os.getcwd())

# Horizontal bar chart of the per-variable contributions.
contrib_1422.plot(kind='barh', legend=False, figsize=(8,6), color='lightcoral')
plt.title('Fairlie Decomposition: 2022 vs 2014')
plt.xlabel('Contribution')
plt.tight_layout()
plt.grid(True)
plt.show()