In [1]:
import pandas as pd
import re
import numpy as np
import unicodedata
import os
from datetime import datetime
import xlsxwriter 


In [2]:

# ===============================
# LOAD TRANSLATED FILES
# ===============================

df4 = pd.read_excel("/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/translated/q4_translated.xlsx")
df5 = pd.read_excel("/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/translated/q5_translated.xlsx")
df6 = pd.read_excel("/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/translated/q6_translated.xlsx")

In [3]:
# ===============================
# 1. REMOVE METADATA (SAFE MODE)
# DOES NOT REMOVE 'recommendations' or 'changes'
# ===============================


metadata_exact = [
    "_uuid", "_submission_time", "_notes", "__version__",
    "start", "end", "today", "start-geopoint", "username",
    "interviewer_name"
]

def drop_metadata(df):
    cols_to_keep = []
    for col in df.columns:
        col_low = col.lower()

        # drop ONLY exact metadata matches
        if col in metadata_exact:
            continue
        
        # drop ONLY if column *starts with* metadata prefix
        if col_low.startswith(("start-geopoint", "_start-geopoint",
                               "_submission", "_notes", "__version__",
                               "username","interviewer_name")):
            continue

        # DO NOT DROP substring matches like "recommendations"
        cols_to_keep.append(col)

    return df[cols_to_keep]

df4 = drop_metadata(df4)
df5 = drop_metadata(df5)
df6 = drop_metadata(df6)

In [4]:
rename_map_q4 = {

    # IDs
    "_id": "id",
    "Participant profile": "participant_profile",
    "Interview date": "interview_date",
    "Name of interviewer": "interviewer_name",
    "District": "district",
    "Community": "community",

    # Leadership type
    "Type of leadership/Traditional": "leader_traditional",
    "Type of leadership/Religious": "leader_religious",
    "Type of leadership/Associative": "leader_associative",
    "Type of leadership/Other": "leader_other",

    # Demographics
    "Sex": "sex",
    "Age (years)": "age_years",
    "Years of community leadership": "leadership_years",

    # Participation
    "Have you participated in Tsogolo Tsicana project activities?/RAMJ club sessions": "part_tsogolo_club_sessions",
    "Have you participated in Tsogolo Tsicana project activities?/Community mobilization": "part_tsogolo_mobilization",
    "Have you participated in Tsogolo Tsicana project activities?/Community dialogues": "part_tsogolo_dialogues",
    "Have you participated in Tsogolo Tsicana project activities?/Parents’ day in schools": "part_tsogolo_parents_day",
    "Have you participated in Tsogolo Tsicana project activities?/Lectures": "part_tsogolo_lectures",
    "Have you participated in Tsogolo Tsicana project activities?/GBV case resolution": "part_tsogolo_gbv_resolution",
    "Have you participated in Tsogolo Tsicana project activities?/Did not participate": "part_tsogolo_none",
    "Have you participated in awareness campaigns or activities about SRH or GBV?": "part_srh_gbv_campaigns",
    "Have you collaborated with schools, health units or community agents to support AGYW?": "part_collaboration_services",

    # Knowledge
    "Have you heard about sexual and reproductive health (SRH)?": "heard_srh",
    "Which SRH topics do you know?/Family planning": "srh_topic_fp",
    "Which SRH topics do you know?/Early pregnancy": "srh_topic_early_preg",
    "Which SRH topics do you know?/Youth-friendly services (SAAJ)": "srh_topic_saaj",
    "Which SRH topics do you know?/Other": "srh_topic_other",
    "Have you heard about gender-based violence (GBV)?": "heard_gbv",
    "Do you know local reporting or support mechanisms for GBV?": "know_gbv_reporting",
    "Have you heard about the Child Helpline?": "heard_child_helpline",

    # Attitudes
    "Girls should have free and safe access to SRH services?": "att_free_safe_access",
    "Can early pregnancy be prevented with proper education and services?": "att_prevent_early_pregnancy",
    "Is it acceptable for girls to marry before 18?": "att_child_marriage",
    "Community should be actively involved in preventing GBV?": "att_community_prevent_gbv",
    "Community leaders should support girls and young mothers in seeking health services?": "att_leader_support_agyw",

    # Behaviour
    "Have you intervened in early pregnancy or GBV cases in your community?": "intervened_cases",
    "What kind of support did you provide?/Referral to services": "support_referral",
    "What kind of support did you provide?/Family mediation": "support_family_mediation",
    "What kind of support did you provide?/Counseling": "support_counseling",
    "What kind of support did you provide?/Other": "support_other",

    # Challenges
    "What are the main challenges faced by girls and young mothers in your community?/Lack of information": "challenge_lack_info",
    "What are the main challenges faced by girls and young mothers in your community?/Rigid cultural norms": "challenge_cultural_norms",
    "What are the main challenges faced by girls and young mothers in your community?/Distance to services": "challenge_distance",
    "What are the main challenges faced by girls and young mothers in your community?/Fear of discrimination": "challenge_fear_discrimination",
    "What are the main challenges faced by girls and young mothers in your community?/Other": "challenge_other",

    # Support needed
    "What type of support is essential to improve the situation of AGYW?/Leader training": "support_leader_training",
    "What type of support is essential to improve the situation of AGYW?/Parental involvement": "support_parental_involvement",
    "What type of support is essential to improve the situation of AGYW?/Service strengthening": "support_service_strengthening",
    "What type of support is essential to improve the situation of AGYW?/Community campaigns": "support_community_campaigns",
    "What type of support is essential to improve the situation of AGYW?/Other": "support_other_needed",

    # Free text
    "changes_observed_leaders": "changes_seen",
    "recommendations_leaders": "recommendations",
}

rename_map_q5 = {

    # IDs
    "_id": "id",
    "Participant profile": "participant_profile",
    "Interview date": "interview_date",
    "Name of interviewer": "interviewer_name",
    "District": "district",
    "Community": "community",

    # Profile
    "Respondent profile/Father": "profile_father",
    "Respondent profile/Mother": "profile_mother",
    "Respondent profile/Guardian": "profile_guardian",

    # Demographics
    "Sex": "sex",
    "Age (years)": "age_years",
    "Number of children or dependents under your care": "n_dependents",
    "Do you have adolescent or young daughters under your care?": "has_agyw_daughters",

    # Participation
    "Have you participated in Tsogolo Tsicana project activities?/Community sessions": "part_tsogolo_sessions",
    "Have you participated in Tsogolo Tsicana project activities?/School meetings": "part_tsogolo_school_meetings",
    "Have you participated in Tsogolo Tsicana project activities?/Campaigns": "part_tsogolo_campaigns",
    "Have you participated in Tsogolo Tsicana project activities?/Did not participate": "part_tsogolo_none",
    "Have you participated in school or community meetings about girls' rights?": "part_school_community_meetings",

    # Knowledge
    "Have you heard about sexual and reproductive health (SRH)?": "heard_srh",
    "Which SRH topics do you know?/Family planning": "srh_topic_fp",
    "Which SRH topics do you know?/Early pregnancy": "srh_topic_early_preg",
    "Which SRH topics do you know?/Menstrual hygiene": "srh_topic_menstrual",
    "Which SRH topics do you know?/Girls' rights": "srh_topic_girls_rights",
    "Which SRH topics do you know?/Other": "srh_topic_other",
    "Have you heard about gender-based violence (GBV)?": "heard_gbv",
    "Do you know available community services supporting adolescent girls and young women?": "know_community_services",
    "Have you heard about the Child Helpline?": "heard_child_helpline",

    # Attitudes
    "Girls should have free and safe access to SRH services?": "att_free_safe_access",
    "att_prevent_early_pregnancy": "att_prevent_early_pregnancy",
    "Can early pregnancy be prevented with education and family support?": "att_prevent_early_pregnancy",
    "Is it acceptable for girls to marry before 18?": "att_child_marriage",
    "Parents and guardians should talk with their children about health, sexuality, and violence prevention?": "att_parental_dialogue",
    "The community should protect and support girls and young mothers?": "att_community_support",

    # Behaviour
    "Have you talked with your children/dependents about sexual health, pregnancy, or violence?": "talked_to_children",
    "Have you supported a girl or young mother in seeking health or protection services?": "supported_agyw_services",
    "Have you referred any case of violence or early pregnancy for support?": "referred_cases",

     # Challenges
    "What are the main challenges faced by girls and young mothers in your community?/Lack of information": "challenge_lack_info",
    "What are the main challenges faced by girls and young mothers in your community?/Rigid cultural norms": "challenge_cultural_norms",
    "What are the main challenges faced by girls and young mothers in your community?/Lack of family dialogue": "challenge_lack_dialogue",
    "What are the main challenges faced by girls and young mothers in your community?/Fear of judgment": "challenge_fear_judgment",
    "What are the main challenges faced by girls and young mothers in your community?/Other": "challenge_other",

    # Support needed
    "What type of support is essential to improve the situation of girls and young mothers?/Parent training": "support_parent_training",
    "What type of support is essential to improve the situation of girls and young mothers?/Psychological support": "support_psychological",
    "What type of support is essential to improve the situation of girls and young mothers?/Service strengthening": "support_service_strengthening",
    "What type of support is essential to improve the situation of girls and young mothers?/Community campaigns": "support_community_campaigns",
    "What type of support is essential to improve the situation of girls and young mothers?/Other": "support_other_needed",


    # Free text
    "changes_observed_family": "changes_seen",
    "recommendations_family": "recommendations",
}

rename_map_q6 = {

    # IDs
    "_id": "id",
    "Participant profile": "participant_profile",
    "Interview date": "interview_date",
    "Name of interviewer": "interviewer_name",
    "District": "district",
    "Community": "community",

    # Profile
    "Marital status": "marital_status",
    "Do you have a partner who is a young mother (adolescent or young woman)?": "has_partner_agyw",
    "Do you live with your partner and child(ren)?": "lives_with_partner_children",

    # Demographics
    "Sex": "sex",
    "Age (years)": "age_years",
    "Number of children": "n_children",
    

    # Knowledge
    "Have you heard about sexual and reproductive health (SRH)?": "heard_srh",
    "Which SRH services do you know?/Youth-friendly services (SAAJ)": "srh_saaj",
    "Which SRH services do you know?/Family planning": "srh_fp",
    "Which SRH services do you know?/Antenatal care": "srh_anc",
    "Which SRH services do you know?/Safe delivery": "srh_safe_delivery",
    "Which SRH services do you know?/Other": "srh_other",
    "Have you heard about the Child Helpline?": "heard_child_helpline",
    "Do you know your rights and responsibilities as a partner of a young mother?": "know_partner_rights",
    "Have you heard about gender-based violence (GBV)?" : "heard_gbv",
    # Attitudes
    "Mothers should have free and safe access to health and social support services?": "att_free_safe_access",
    "Can early pregnancy be prevented with education and proper services?": "att_prevent_early_pregnancy",
    "Is it acceptable for girls to marry before 18?": "att_child_marriage",
    "Partners should support young mothers in seeking health and protection services?": "att_partner_support",
    "Community should be involved in preventing GBV and supporting young mothers?": "att_community_prevent_gbv",

    # Behaviour / participation
    "Have you accompanied your partner to health services (antenatal, postnatal, SRH)?": "accompanied_partner_services",
    "Have you participated in Tsogolo Tsicana project activities?/Community sessions": "part_tsogolo_sessions",
    "Have you participated in Tsogolo Tsicana project activities?/Parent groups": "part_tsogolo_parent_groups",
    "Have you participated in Tsogolo Tsicana project activities?/Campaigns": "part_tsogolo_campaigns",
    "Have you participated in Tsogolo Tsicana project activities?/Did not participate": "part_tsogolo_none",
    "Have you talked with your partner about sexual health, pregnancy, or violence?": "talked_to_partner",
    "Have you supported or referred someone facing violence or needing services?": "supported_or_referred_cases",

    # GBV actions
    "How have you contributed to combating GBV?/Reported cases to authorities or leaders": "gbv_report_authorities",
    "How have you contributed to combating GBV?/Advised or supported someone": "gbv_advise_support",
    "How have you contributed to combating GBV?/Participate in community gender equality groups": "gbv_awareness_sessions",
    "How have you contributed to combating GBV?/Talked to family or neighbours": "gbv_talk_family",
    "How have you contributed to combating GBV?/Referred victims to services": "gbv_referral_services",
    "How have you contributed to combating GBV?/Avoid behaviours promoting violence": "gbv_avoid_promoting_behaviour",
    "How have you contributed to combating GBV?/Support my partner with respect and dialogue": "gbv_support_partner_dialogue",
    "How have you contributed to combating GBV?/Have not contributed directly": "gbv_not_contributed",
    "How have you contributed to combating GBV?/Prefer not to say": "gbv_prefer_not_say",
    "How have you contributed to combating GBV?/Other": "gbv_other",

    # Challenges
    "What are the main challenges faced by young mothers in your community regarding SRH and GBV?/Lack of information": "challenge_lack_info",
    "What are the main challenges faced by young mothers in your community regarding SRH and GBV?/Rigid cultural norms": "challenge_cultural_norms",
    "What are the main challenges faced by young mothers in your community regarding SRH and GBV?/Lack of family support": "challenge_lack_family_support",
    "What are the main challenges faced by young mothers in your community regarding SRH and GBV?/Fear of judgment": "challenge_fear_judgment",
    "What are the main challenges faced by young mothers in your community regarding SRH and GBV?/Other": "challenge_other",

    # Support needed
    "What support is essential to improve the situation of young mothers and their partners?/Training for men in SRH and GBV": "support_training_men",
    "What support is essential to improve the situation of young mothers and their partners?/Psychological support": "support_psychological",
    "What support is essential to improve the situation of young mothers and their partners?/Strengthening of health services": "support_service_strengthening",
    "What support is essential to improve the situation of young mothers and their partners?/Community campaigns": "support_community_campaigns",
    "What support is essential to improve the situation of young mothers and their partners?/Other": "support_other_needed",

    # Free text
    "changes_observed_partners": "changes_seen",
    "recommendations_partners": "recommendations",
}

df4 = df4.rename(columns=rename_map_q4)
df5 = df5.rename(columns=rename_map_q5)
df6 = df6.rename(columns=rename_map_q6)

print("Q4 columns:\n", df4.columns.tolist(), "\n")
print("Q5 columns:\n", df5.columns.tolist(), "\n")
print("Q6 columns:\n", df6.columns.tolist(), "\n")


Q4 columns:
 ['interview_date', 'interviewer_name', 'district', 'community', 'leader_traditional', 'leader_religious', 'leader_associative', 'leader_other', 'Full name (optional)', 'sex', 'age_years', 'leadership_years', 'part_tsogolo_club_sessions', 'part_tsogolo_mobilization', 'part_tsogolo_dialogues', 'part_tsogolo_parents_day', 'part_tsogolo_lectures', 'part_tsogolo_gbv_resolution', 'part_tsogolo_none', 'heard_srh', 'srh_topic_fp', 'srh_topic_early_preg', 'srh_topic_saaj', 'srh_topic_other', 'heard_gbv', 'know_gbv_reporting', 'heard_child_helpline', 'att_free_safe_access', 'att_prevent_early_pregnancy', 'att_child_marriage', 'att_community_prevent_gbv', 'att_leader_support_agyw', 'intervened_cases', 'support_referral', 'support_family_mediation', 'support_counseling', 'support_other', 'part_srh_gbv_campaigns', 'part_collaboration_services', 'challenge_lack_info', 'challenge_cultural_norms', 'challenge_distance', 'challenge_fear_discrimination', 'challenge_other', 'support_leader_tr

# District and community

In [5]:
district_map = {
    "moatize": "Moatize",
    "moatize ": "Moatize",
    "moatize_": "Moatize",

    "changara": "Changara",

    "tete": "Tete City",
    "cidade_de_tete": "Tete City",
    "cidade de tete": "Tete City",
    "tete city": "Tete City"
}

community_map = {
    "matundo": "Matundo",
    "bagamoio": "Bagamoio",
    "sansao mutemba": "Sansao Mutemba",
    "chicolomdwe": "Chicolomdwe",
    "goia": "Goia",
    "chingodzi": "Chingodzi",
    "joaquim chissano": "Joaquim Chissano",

    # Nhachere variations
    "nhachere": "Nhachere",
    "nhanchere": "Nhachere",
    "nhanjere": "Nhanchere",
    "nhachere": "Nhanchere",
    "nhanchere": "Nhanchere",   # you chose Nhanjere vs Nhanchere; keeping Nhanjere canonical

    "luenha": "Luenha",

    # Filipe/Felipe
    "felipe samuel magaia": "Filipe Samuel Magaia",
    "filipe samuel magaia": "Filipe Samuel Magaia",

    "emilia dausse": "Emilia Dausse",
    "josina machel": "Josina Machel",
    "juliasse nherere": "Juliasse Nherere",
}

def clean_district_and_community(df):
    # Standardize district
    df["district"] = (
        df["district"]
        .astype(str).str.lower().str.strip()
        .replace(district_map)
        .fillna(df["district"])
    )

    # Standardize community
    df["community"] = (
        df["community"]
        .astype(str).str.lower().str.strip()
        .replace(community_map)
        .fillna(df["community"])
    )
    
    return df

df4 = clean_district_and_community(df4.copy())
df5 = clean_district_and_community(df5.copy())
df6 = clean_district_and_community(df6.copy())

In [6]:
print("Q4 communities:", sorted(df4["community"].unique()))
print("Q5 communities:", sorted(df5["community"].unique()))
print("Q6 communities:", sorted(df6["community"].unique()))

print("Q4 districts:", sorted(df4["district"].unique()))
print("Q5 districts:", sorted(df5["district"].unique()))
print("Q6 districts:", sorted(df6["district"].unique()))

Q4 communities: ['Bagamoio', 'Chicolomdwe', 'Chingodzi', 'Emilia Dausse', 'Filipe Samuel Magaia', 'Goia', 'Joaquim Chissano', 'Josina Machel', 'Juliasse Nherere', 'Luenha', 'Matundo', 'Nhanchere', 'Sansao Mutemba']
Q5 communities: ['Bagamoio', 'Chingodzi', 'Emilia Dausse', 'Filipe Samuel Magaia', 'Joaquim Chissano', 'Josina Machel', 'Juliasse Nherere', 'Luenha', 'Matundo', 'Nhanchere', 'Sansao Mutemba']
Q6 communities: ['Bagamoio', 'Chicolomdwe', 'Chingodzi', 'Emilia Dausse', 'Filipe Samuel Magaia', 'Joaquim Chissano', 'Josina Machel', 'Juliasse Nherere', 'Luenha', 'Matundo', 'Nhanchere', 'Sansao Mutemba']
Q4 districts: ['Changara', 'Moatize', 'Tete City']
Q5 districts: ['Changara', 'Moatize', 'Tete City']
Q6 districts: ['Changara', 'Moatize', 'Tete City']


# 4. Save the open ended text in different dataframe for later classification

In [7]:
meta_cols = ["id", "district", "community", "participant_profile"]
open_cols = ["changes_seen", "recommendations"]

def extract_open_long(df, meta_cols, open_cols, source):
    present_open_cols = [c for c in open_cols if c in df.columns]

    if not present_open_cols:
        return pd.DataFrame(columns=meta_cols + ["text_type", "text", "source"])

    df_long = (
        df[meta_cols + present_open_cols]
        .melt(
            id_vars=meta_cols,
            value_vars=present_open_cols,
            var_name="text_type",
            value_name="text"
        )
    )

    df_long["source"] = source

    # drop empty text
    df_long = df_long.dropna(subset=["text"])
    df_long = df_long[df_long["text"].astype(str).str.strip() != ""]

    return df_long

df4_open = extract_open_long(df4, meta_cols, open_cols, source="q4")
df5_open = extract_open_long(df5, meta_cols, open_cols, source="q5")
df6_open = extract_open_long(df6, meta_cols, open_cols, source="q6")

df_open = pd.concat([df4_open, df5_open, df6_open], ignore_index=True)
df_open.to_parquet(
    "/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/open_text/q456_open_ended.parquet",
    index=False
)


## interview date

In [8]:
df4["interview_date"] = df4["interview_date"].dt.normalize()
df5["interview_date"] = df5["interview_date"].dt.normalize()
df6["interview_date"] = df6["interview_date"].dt.normalize()

# Binary

In [9]:

# ===============================
# 2. NORMALIZE YES / NO 
# ===============================

YES_NO_MAP = {
    "yes": 1,
    "no": 0,
    "true": 1,
    "false": 0,
    1: 1,
    0: 0,
}

def recode_binary_inplace(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = (
                df[c]
                .replace(YES_NO_MAP)
                .astype("float")   # allows NaN
            )

            # hard guardrail
            df.loc[~df[c].isin([0, 1]), c] = np.nan
    return df
# -------------------
# Likert (positive direction)
# -------------------
LIKERT_POSITIVE = {
    "strongly disagree": 0,
    "disagree": 0,
    "agree": 1,
    "strongly agree": 1,
}

def recode_likert_inplace(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = (
                df[c]
                .astype(str)
                .str.strip()
                .str.lower()
                .map(LIKERT_POSITIVE)
                .astype("float")
            )
    return df

# ===============================
# 3. Binary inversion
# ===============================
def invert_binary_inplace(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = df[c].apply(
                lambda x: 1 - x if x in [0, 1] else np.nan
            )
    return df

In [10]:
# ===============================
# 3. Apply to Q4 – Leaders
# ===============================
# Binary yes/no
q4_binary = [
    "heard_srh",
    "heard_gbv",
    "know_gbv_reporting",
    "heard_child_helpline",
    "intervened_cases",
    "part_srh_gbv_campaigns",
    "part_collaboration_services",
    "att_prevent_early_pregnancy",
    "att_leader_support_agyw"
]
df4 = recode_binary_inplace(df4, q4_binary)

# Likert
q4_likert = [
    "att_free_safe_access",
    "att_community_prevent_gbv",
]

df4 = recode_likert_inplace(df4, q4_likert)

q4_negative = [
    # Participation
    "part_tsogolo_none",

    # Challenges
    "challenge_lack_info",
    "challenge_cultural_norms",
    "challenge_distance",
    "challenge_fear_discrimination",
    "challenge_other",
]

df4 = invert_binary_inplace(df4, q4_negative)

  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)


In [11]:
q5_binary = [
    "has_agyw_daughters",
    "att_prevent_early_pregnancy",
    "heard_srh",
    "heard_gbv",
    "know_community_services",
    "heard_child_helpline",
    "supported_agyw_services",
    "part_school_community_meetings",
    "referred_cases",
    "talked_to_children",
    "att_community_support"

]

df5 = recode_binary_inplace(df5, q5_binary)

q5_likert = [
    "att_free_safe_access",
    "att_parental_dialogue",
]

df5 = recode_likert_inplace(df5, q5_likert)

q5_negative = [
    "part_tsogolo_none",
    "challenge_lack_info",
    "challenge_cultural_norms",
    "challenge_lack_dialogue",
    "challenge_fear_judgment",
    "challenge_other",
]

df5 = invert_binary_inplace(df5, q5_negative)

  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)


In [12]:
q6_binary = [
    "heard_srh",
    "heard_gbv",
    "know_partner_rights",
    "heard_child_helpline",
    "att_partner_support",
    "accompanied_partner_services",
    "talked_to_partner",
    "supported_or_referred_cases",
    "lives_with_partner_children",
    "has_partner_agyw",
    "att_prevent_early_pregnancy"

]

df6 = recode_binary_inplace(df6, q6_binary)

q6_likert = [
    "att_community_prevent_gbv",
    "att_free_safe_access",
]

df6 = recode_likert_inplace(df6, q6_likert)


q6_negative = [
    # Participation
    "part_tsogolo_none",

    # GBV contribution
    "gbv_not_contributed",

    # Challenges
    "challenge_lack_info",
    "challenge_cultural_norms",
    "challenge_lack_family_support",
    "challenge_fear_judgment",
    "challenge_other",
]

df6 = invert_binary_inplace(df6, q6_negative)

  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)
  .replace(YES_NO_MAP)


In [13]:
def recode_child_marriage_pos(df):
    if "att_child_marriage" in df.columns:
        df["att_child_marriage"] = df["att_child_marriage"].map({
            "no": 1,
            "yes": 0,
            "depends": 0
        })
    return df

df4 = recode_child_marriage_pos(df4)
df5 = recode_child_marriage_pos(df5)
df6 = recode_child_marriage_pos(df6)

In [14]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 53 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   interview_date                 35 non-null     datetime64[ns]
 1   interviewer_name               35 non-null     object        
 2   district                       35 non-null     object        
 3   community                      35 non-null     object        
 4   leader_traditional             35 non-null     int64         
 5   leader_religious               35 non-null     int64         
 6   leader_associative             35 non-null     int64         
 7   leader_other                   35 non-null     int64         
 8   Full name (optional)           34 non-null     object        
 9   sex                            35 non-null     object        
 10  age_years                      35 non-null     int64         
 11  leadership_years     

In [15]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 47 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   interview_date                  59 non-null     datetime64[ns]
 1   interviewer_name                59 non-null     object        
 2   district                        59 non-null     object        
 3   community                       59 non-null     object        
 4   profile_father                  59 non-null     int64         
 5   profile_mother                  59 non-null     int64         
 6   profile_guardian                59 non-null     int64         
 7   sex                             59 non-null     object        
 8   age_years                       59 non-null     int64         
 9   n_dependents                    59 non-null     int64         
 10  has_agyw_daughters              59 non-null     float64       
 11  part_tso

In [16]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 56 columns):
 #   Column                                                                         Non-Null Count  Dtype         
---  ------                                                                         --------------  -----         
 0   interview_date                                                                 55 non-null     datetime64[ns]
 1   interviewer_name                                                               55 non-null     object        
 2   district                                                                       55 non-null     object        
 3   community                                                                      55 non-null     object        
 4   sex                                                                            55 non-null     object        
 5   age_years                                                                      55 non-n

# fix q5 participant profile

In [17]:
def classify_parent_profile(row):
    if row.get("profile_father") == 1 and row.get("profile_mother") == 1:
        return "parent/guardian (both)"
    if row.get("profile_father") == 1:
        return "parent/guardian (father)"
    if row.get("profile_mother") == 1:
        return "parent/guardian (mother)"
    if row.get("profile_guardian") == 1:
        return "parent/guardian (guardian)"
    return "parent/guardian (unspecified)"

df5["participant_profile_org"] = df5.apply(
    classify_parent_profile, axis=1
)
df5["participant_profile"] = "parent/guardian"

In [19]:
df5["participant_profile"].value_counts(dropna=False)


participant_profile
parent/guardian    59
Name: count, dtype: int64

In [20]:
df5[["profile_father","profile_mother","profile_guardian"]].sum(axis=1).value_counts()

1    57
2     2
Name: count, dtype: int64

In [21]:
cols_to_drop = [
    "profile_father",
    "profile_mother",
    "profile_guardian"
]

df5 = df5.drop(
    columns=[c for c in cols_to_drop if c in df5.columns]
)

In [22]:
# ===============================
# 5. EXPORT CLEAN NORMALIZED FILES
# ===============================

df4.to_excel("/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/Normalized/q4_normalised.xlsx", index=False)
df5.to_excel("/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/Normalized/q5_normalised.xlsx", index=False)
df6.to_excel("/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/Normalized/q6_normalised.xlsx", index=False)