In [91]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import unicodedata


In [92]:
stop_words = {"the", "a", "an", "and", "or", "in", "on", "of", "to", "for", "from", 
    "that", "this", "it", "is", "was", "be", "am", "are", "were", "been",
    "at", "as", "with", "by", "about", "but", "so", "if", "when", "while",
    "i", "you", "he", "she", "we", "they", "me", "my", "your", "our",
    "their", "them", "him", "her",

    # Portuguese
    "o", "a", "os", "as", "de", "da", "do", "das", "dos",
    "e", "em", "um", "uma", "uns", "umas",
    "para", "por", "com", "sem", "que", "na", "no", "nas", "nos",
    "ao", "aos", "à", "às", "se", "eu", "tu", "ele", "ela",
    "nós", "vós", "eles", "elas", "meu", "minha", "teu", "tua"
}

# Convert to list for sklearn
stop_words = list(stop_words)

In [93]:
############################################################
# 3. NORMALIZE TEXT
############################################################

def normalize_text(text):
    if pd.isna(text):
        return ""

    # step 1: lowercase
    text = str(text).lower().strip()

    # fix contractions FIRST
    text = (text.replace("don't", "dont")
        .replace("didn't", "didnt")
        .replace("didn t", "didnt")
        .replace("doesn't", "doesnt")
        .replace("doesn t", "doesnt")
        .replace("can't", "cant")
        .replace("can t", "cant")
        .replace("won't", "wont")
        .replace("won t", "wont")
        .replace("it's", "its")
        .replace("it s", "its")
        .replace("she's", "shes")
        .replace("he's", "hes")
        .replace("i'm", "im")
        .replace("haven t", "havent")
    )
            
    # step 2: remove accents
    text = unicodedata.normalize("NFKD", text)
    text = "".join(c for c in text if not unicodedata.combining(c))

  # step 3: remove punctuation (apostrophes become spaces)
    text = re.sub(r"[^a-z0-9\s]", " ", text)

    # step 4: collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [94]:
############################################################
# 4.1 ATTITUDE CLASSIFIER: FREE AND SAFE ACCESS
############################################################
def categorize_att_free_safe_access(text):
    if pd.isna(text) or str(text).strip() == "":
        return "Free access: No Comment"
    
    text = normalize_text(text)

    # Safety
    if any(w in text for w in ["safe", "protect", "security", "danger", "violence"]):
        return "Free access: safety"

    # Rights based / autonom
    if any(w in text for w in [
        "right", "rights", "freedom", "entitled", "decide for myself",
        "choose", "my choice", "voluntary", "no one should force",
        "full access", "no restriction", "choose her partner", 
        ]):
        return "Free access: rights based"
    
    #pregnancy prevention 
    if any(w in text for w in ["reduce pregn", "prevent early pregn",
        "prevent many early", "reduce pregnancies", "unwanted"]):
        return "Free access: prevent early pregnancy"
    
    # Information / awareness
    if any(w in text for w in [
        "information", "aware", "know", "saber", 
        "learn", "understand", "find out", "improve", "informed"
        ]):
        return "Free access: awareness/information"

    # Support for RAMJ
    if any(w in text for w in ["support", "help", "well being", 
        "report", "problems"]):
        return "Free access: support for girls"
    
    # Positive / general supportive
    if any(w in text for w in [
        "interesting", "hundred", "agree", "yes",
        "good", "important", "right thing"]):
        return "Free access: positive"

    # no explanation
    if any(w in text for w in ["dont know", "no comment", "no idea", "don t know"]):
        return "Free access: no explanation"

    return "Free access: not classified"

In [95]:
############################################################
# 4.2 CHILD MARRIAGE CLASSIFIER – FINAL, COMPLETE VERSION
############################################################

def categorize_att_child_marriage(text):
    if pd.isna(text) or str(text).strip() == "":
        return "Child marriage: No comment"

    text = normalize_text(text)

    # --------------------------------------------------------
    # 1. AGE / UNDERAGE / TOO YOUNG
    # --------------------------------------------------------
    if any(w in text for w in [
        "age", "at that age", "right age", "suitable age", "good age",
        "not suitable", "not very suitable", "not fit", "fit to marry",
        "not old", "not old enough", "too young", "too small",
        "still early", "still small", "its early", "its smaller", "smaller",
        "younger", "younger age", "still a child", "child", "minor", "underage",
        "before 18", "under 18", "18 years", "must be 18", "over 18",
        "before the age of 18", "under the age of 18",
        "age does not allow", "age doesnt allow", "not reached the age",
        "hasnt reached the age", "married at 14", "get married at 14",
        "there are people who get married at 14",
        "the body must be legally ready",
        "not yet at the age", "not yet reached the age",
        "antes dos 18", "18", "17"
    ]):
        return "Child marriage: age or underage"

    # --------------------------------------------------------
    # 2. SOCIAL / FAMILY / CULTURAL PRESSURE
    # --------------------------------------------------------
    if any(w in text for w in [
        "family", "parents", "father", "mother", "dad", "mom",
        "familia", "principios familiares", "alguns familiares",
        "depends on the family", "depends on family", "depends on each family",
        "if parents want", "parents force", "parents oblige",
        "parents accepted", "as soon as the father", "obligation",
        "tradition", "cultural", "culture",
        "influence of family", "influence of some", "influence of friends",
        "se a familia", "depende de principios familiares",
        "depende de alguns familiares",
        "just get your first period", "girls get married early"
    ]):
        return "Child marriage: social or cultural norms"

    # --------------------------------------------------------
    # 3. PERSONAL CHOICE / AGENCY
    # --------------------------------------------------------
    if any(w in text for w in [
        "my will", "its my will", "i want", "i wanted", "i just want",
        "wish", "desire", "if i want", "if you want", "if a person just wants",
        "if the owner wants", "owner wants", "own free will",
        "depends on each person", "depends on who wants",
        "basta eu querer", "bastar eu querer", "i wont make it",
        "depends on conditions", "depends on what each person wants",
        "why is it not", "each person", "depends", "wants to get married"
    ]):
        return "Child marriage: personal choice or agency"

    # --------------------------------------------------------
    # 4. MATURITY / READINESS
    # --------------------------------------------------------
    if any(w in text for w in [
        "not ready", "not prepared", "not yet ready",
        "needs to grow", "need to grow", "must grow", "grow up",
        "grow first", "still teen", "still a teenager",
        "maturity", "mature", "not mature", "not right",
        "responsibility", "responsibilities",
        "emotions are not controllable", "emotional state",
        "development", "cognitive", "cant unite",
        "has a lot to learn", "have a lot to learn", "need to learn",
        "lot to learn", "first you must form", "turn 18",
        "cannot withstand", "cant withstand", "reach 18",
        "not yet prepared for the care", "not yet prepared for the newborn",
        "dont know how to care for a newborn", "not know how to take care"
    ]):
        return "Child marriage: psychological or maturity impact"

    # --------------------------------------------------------
    # 5. EDUCATION IMPACT
    # --------------------------------------------------------
    if any(w in text for w in [
        "study", "school", "studies", "finish studies", "finish school",
        "continue studying", "stay in studies",
        "leave school", "drop out", "abandon studies",
        "abandono escolar", "abandona escola", "abandon school",
        "abandonment of studies", "abandono",
        "interaction with friends and family",
        "miss a lot of opportunities", "lack of opportunities",
        "goals", "achieve their goals", "lack of clear information"
    ]):
        return "Child marriage: education disruption"

    # --------------------------------------------------------
    # 6. HEALTH RISKS
    # --------------------------------------------------------
    if any(w in text for w in [
        "pregnant", "engravidar", "pregnancy", "pregnancies",
        "unwanted pregnancies", "unwanted pregnancy",
        "risk", "danger", "maternal", "health",
        "sick", "mortality", "death",
        "suffer", "sofrimento", "hospital",
        "cesariana", "c section", "newborn", "take care of another baby",
        "recem nascido", "emotional problems"
    ]):
        return "Child marriage: health risks"

    # --------------------------------------------------------
    # 7. VIOLENCE / PROTECTION
    # --------------------------------------------------------
    if any(w in text for w in [
        "violence", "abuse", "abusive",
        "sofrimento no lar", "crime",
        "against minors", "not allowed", "prohibited",
        "punished", "law", "lei", "jail",
        "law doesnt allow", "law doesnt",
        "it cannot"
    ]):
        return "Child marriage: violence or protection"

    # --------------------------------------------------------
    # 8. FUTURE CONSEQUENCES
    # --------------------------------------------------------
    if any(w in text for w in [
        "future", "ruin", "ruins", "destroy",
        "destroys", "regret", "financial",
        "damage", "damage life"
    ]):
        return "Child marriage: future consequences"

    # --------------------------------------------------------
    # 9. POSITIVE ACCEPTANCE
    # --------------------------------------------------------
    if any(w in text for w in [
        "its good", "it is good", "good age", "good look",
        "acceptable", "good to start", "good age profile",
        "good age range", "you can now join someone",
        "she is grown", "already grown", "prepared to get married",
        "maybe yes", "why cant you", "not advisable", "pode uni"
    ]):
        return "Child marriage: positive acceptance"

    # --------------------------------------------------------
    # 10. NO EXPLANATION / NON-INFORMATIVE
    # --------------------------------------------------------
    if any(w in text for w in [
        "dont know", "i dont know", "no comments", "others",
        "nhanjere", "hundred comments", "no idea",
        "i myself have not good experience", "i have no experience"
    ]):
        return "Child marriage: No explanation"

    # --------------------------------------------------------
    # DEFAULT
    # --------------------------------------------------------
    return "Child marriage: Not classified"

In [96]:
############################################################
# 4.3 ATTITUDE CLASSIFIER: COMMUNITY GBV ENGAGEMENT (REVISED)
############################################################

def categorize_att_community_engage_gbv(text):
    if pd.isna(text) or str(text).strip() == "":
        return "Community GBV: no comment"

    text = normalize_text(text)
    # --------------------------------------------------------
    # 1. VIOLENCE REDUCTION / RAPE PREVENTION
    # --------------------------------------------------------
    if any(w in text for w in [
        "violence", "violencia",
        "rape", "raped", "child rape",
        "sexual abuse", "abuso sexual",
        "risk of physical emotional sexual abuse",
        "reduce infection rate sexual abuse",
        "reduce infection rate",
        "cases in the community", "fewer cases in the community",
        "no one would want to see a girl receiving that kind of act",
        "no one would like to see a girl receiving that kind of act",
        "no one should suffer these types of acts"
    ]):
        return "Community GBV: violence reduction"

    # --------------------------------------------------------
    # 2. PREVENT EARLY PREGNANCY / PREMATURE UNION
    # --------------------------------------------------------
    if any(w in text for w in [
        "early pregnancy", "early pregnancies",
        "gravidez precoce", "gravidez precose",
        "getting pregnant early", "get pregnant early",
        "avoid getting pregnant early",
        "prevent pregnancy", "avoid pregnancy",
        "premature union", "no premature union",
        "reduce unwanted pregnancies", "reduction of unwanted pregnancies",
        "reduce the rate of unwanted pregnancies",
        "unwanted pregnancies", "being born while they are children",
        "para prevenir as gravidez precose y violencia"
    ]):
        return "Community GBV: prevent early pregnancy or marriage"

    # --------------------------------------------------------
    # 3. SAFETY, PROTECTION, PRIVACY
    # --------------------------------------------------------
    if any(w in text for w in [
        "safe", "safety", "protect", "proteger", "security",
        "prevent abuse", "avoid abuse", "reduce risks",
        "risk of physical emotional sexual abuse",
        "because you want privacy", "privacy",
        "because they are confidential things", "confidential",
        "every girl has the right to be defended",
        "prevention of girls is a right", "worst doesnt",
        "para as raparigas si cuidar", "rights",
        "prevenir problemas nas raparigas"
    ]):
        return "Community GBV: safety and protection"

    # --------------------------------------------------------
    # 4. VIOLENCE REPORTING & INTERVENTION
    # --------------------------------------------------------
    if any(w in text for w in [
        "report", "denounce", "denunciar",
        "easier to report", "accompany a minor",
        "community can intervene", "before the police",
        "make the case known", "case known",
        "talk about the case", "talking about the case",
        "disclosure of care", "prevent"
    ]):
        return "Community GBV: violence reporting and intervention"

    # --------------------------------------------------------
    # 5. COLLECTIVE COMMUNITY ACTION
    # --------------------------------------------------------
    if any(w in text for w in [
        "entire community", "whole community", "every community",
        "community is fighting", "community fighting",
        "community involved", "more involved the community",
        "united community", "unity is strength",
        "we need society", "the mirror of girls",
        "it is in the community where young people are inserted",
        "the community has the right to get involved",
        "the community itself that lives the reality",
        "community experiences reality",
        "the community is what encourages teenage girls to plan their future",
        "the community serves as the source of teaching",
        "the community accompanies the child in their life",
        "community must"
    ]):
        return "Community GBV: collective community action"

    # --------------------------------------------------------
    # 6. PARENTAL / ADULT INVOLVEMENT
    # --------------------------------------------------------
    if any(w in text for w in [
        "parents", "mothers", "fathers", "our parents",
        "pais", "maes", "pai", "mae",
        "they are still children",
        "they are the determination of children",
        "they are the source", "source of children",
        "they must be present", "must be present",
        "they must be following",
        "eles devem estar",
        "they must be",
        "they must get involved",
        "they secure us in the community",
        "we are going to need them to guide us",
        "precisamos deles", "precisamos deles para nos guiar",
        "parenting is always good",
        "the community can meet with those responsible for educating their children"
    ]):
        return "Community GBV: parental involvement"

    # --------------------------------------------------------
    # 7. EMOTIONAL / SOCIAL SUPPORT (INCL. RAMJ)
    # --------------------------------------------------------
    if any(w in text for w in [
        "support", "apoio", "help", "ajudar",
        "for us to give advice", "give advice",
        "advise", "advising", "counsel", "counseling",
        "welcoming", "welcome",
        "we can gain courage and speak",
        "easier to talk", "open up",
        "someone trustworthy to talk",
        "constant support for girls",
        "valuation of girls",
        "for the benefit of the general community",
        "helping girls stand tall",
        "vent to someone", "grow well"
        "taking care of the rights of ramj",
        "it will benefit the ramj",
        "it will be beneficial to the ramj community",
        "so that ramj feel free in society",
        "there will be more freedom for new experiences at ramj",
        "vai proteger mais ramj",
        "it will be good to distance myself from abuse"
    ]):
        return "Community GBV: emotional or social support"

    # --------------------------------------------------------
    # 8. EDUCATION & AWARENESS (LECTURES, TEACHING, KNOWLEDGE)
    # --------------------------------------------------------
    if any(w in text for w in [
        "learn", "learning", "aprender",
        "lectures", "palestras", "sessions",
        "someone to talk about the subject",
        "talk about the subject not just at a certain period but at all times",
        "educating people", "education", "awareness",
        "sensibility in communities", "sensitivity in communities",
        "to know", "for us to know",
        "for more information", "more information",
        "knowledge", "have knowledge",
        "to gain knowledge", "to have knowledge",
        "so that we are all well informed",
        "community involvement can facilitate access to health services and ensure that information reaches across borders",
        "the community must always speak and explain to girls",
        "the community should get involved if yes talk to the girls",
        "to teach us", "teaching young people",
        "teach girls how to prevent",
        "teach how to do and prevent", "teach",
        "to find out if the girls understand what is being said in the room",
        "for girls to be able to obtain information"
    ]):
        return "Community GBV: education and awareness"

    # --------------------------------------------------------
    # 9. SRHR / HEALTH IMPROVEMENT
    # --------------------------------------------------------
    if any(w in text for w in [
        "health", "health services", "reproductive health",
        "ssr", "sexual and reproductive",
        "more grip on ssr", "prevent"
        "disease", "disease reduction",
        "reduce infection rate",
        "reduce the mortality rate", "mortality rate",
        "improve sexual health",
        "girls can achieve their goals",  # often tied to health/rights
        "for gender equality", "gender equality"
    ]):
        return "Community GBV: srhr improvement"

    # --------------------------------------------------------
    # 10. CHILD DEVELOPMENT & GROWTH
    # --------------------------------------------------------
    if any(w in text for w in [
        "best growth", "good development in girls",
        "best creicimetos", "for better creicimetos",
        "pra nos crescer bem", "for us to grow well",
        "children s growth", "childrens development",
        "children to grow", "for children to study",
        "for children to continue", "for children to continue with their studies",
        "continuous change with studies",
        "to go to school", "for us to study",
        "to study", "we need them", "grow well",
        "development", "for the best development of children",
        "for better growth and the well being of girls",
        "for healthy creicer", "to grow healthy",
        "for our good", "so as not to drop out of school",
        "best childrens toys"
    ]):
        return "Community GBV: child development and growth"

    # --------------------------------------------------------
    # 11. SUPERVISION & CONTROL
    # --------------------------------------------------------
    if any(w in text for w in [
        "more control", "have more control",
        "we will have more control", "so that there is more control in the community",
        "the community must control girls",
        "pay attention", "be more vigilant",
        "to be more vigilant", "guide",
        "surveillance", "monitoring the girl",
        "they must be present when monitoring the girl",
        "to improve control"
    ]):
        return "Community GBV: supervision and control"

    # --------------------------------------------------------
    # 12. JUSTICE & ACCOUNTABILITY
    # --------------------------------------------------------
    if any(w in text for w in [
        "justice", "do justice",
        "must be justice", "laws", "rules", "norms",
        "everyone should know it is a crime",
        "everyone should know its a crime",
        "crime",
        "the community has the right to this",
        "these are our rights"
    ]):
        return "Community GBV: justice and accountability"

    # --------------------------------------------------------
    # 13. NON-INVOLVEMENT / COUPLE PRIVACY
    # --------------------------------------------------------
    if any(w in text for w in [
        "no one gets into a fight between a couple",
        "nobody gets involved in a couple s fight",
        "nobody gets involved in a couple fight",
        "if they are fighting they have their reasons the community cant enter",
        "community cant enter",
        "they have their reasons the community cant enter"
    ]):
        return "Community GBV: non-involvement or couple privacy"

    # --------------------------------------------------------
    # 14. GENERAL BENEFIT / POSITIVE EVALUATION
    # --------------------------------------------------------
    if any(w in text for w in [
        "yes itshould", "yes it should", "yes itreally should",
        "yes it really should", "yes they should",
        "yes they should get involved", "yes the community must get involved",
        "yes you should get involved", "community intervention is necessary",
        "yes i completely agree", "sim deve", "sim",
        "itshould", "must be involved", "acceptable",
        "there will be changes", "there are changes",
        "it will be important", "important",
        "very good", "good", "it will be good",
        "it is good", "good for society", "ressource",
        "very right and better", "and better and right",
        "and good and very and right", "consideration",
        "so that the community can see and understand that it is possible",
        "it will be beneficial to the ramj community",
        "for our good", "i agree", "yes", "contribute"
    ]):
        return "Community GBV: general benefit"

    # --------------------------------------------------------
    # 15. NO EXPLANATION / NOT CLEAR
    # --------------------------------------------------------
    if any(w in text for w in [
        "dont know", "put nothing",
        "no comment", "no comments",
        "no idea", "one hundred comments", "one hundred ideas",
        "why"
    ]):
        return "Community GBV: no explanation"

    # --------------------------------------------------------
    # 16. FALLBACK
    # --------------------------------------------------------
    return "Community GBV: not classified"

In [97]:
############################################################
# 5. TALKING TO PARENTS CLASSIFIER (FINAL – FULL COVERAGE)
############################################################

def categorize_talked_to_parents_reason(text):
    if pd.isna(text) or str(text).strip() == "":
        return "Parents communication: no comment"
    
    text = normalize_text(text)
    
    # --------------------------------------------------------
    # 1. FEAR
    # --------------------------------------------------------
    if any(w in text for w in [
        "fear", "afraid", "scared", "receio",
        "i feel scared", "i am scared"
    ]):
        return "Parents communication: fear"

    # --------------------------------------------------------
    # 2. SHAME / NO COURAGE / EMBARRASSMENT
    # --------------------------------------------------------
    if any(w in text for w in [
        "shame", "ashamed", "embarrass", "embarrassed",
        "no courage", "dont have the courage",
        "i dont have the courage", "i didnt have the courage",
        "i dont like talking", "not easy to open up",
        "still dont have the courage", "lack of confidence"
    ]):
        return "Parents communication: shame"

    # --------------------------------------------------------
    # 3. TOO YOUNG / MINOR / CHILD
    # --------------------------------------------------------
    if any(w in text for w in [
        "only child", "still a child", "i am a child", "im a child",
        "i m a child", "i am also a child", "just a child",
        "still studying and a child", "because they consider me a child",
        "because i m smaller", "underage", "minor", "still smaller",
        "hasnt happened", "lack of opportunity", "not interested",
        "never managed", "because i am a child", "no knowledge",
        "lack of information",
    ]):
        return "Parents communication: too young or minor"

    # --------------------------------------------------------
    # 4. NEVER EXPERIENCED VIOLENCE / NEVER HAPPENED
    # --------------------------------------------------------
    if any(w in text for w in [
        "never went through", "never suffered",
        "not happened", "its not happened", "it hasnt happened",
        "never experienced", "never happened to me",
        "never victim", "never gone through this",
        "i havent gone through this", "i havent experienced",
        "i havent participated in these cases",
        "haven t experienced these cases of violence yet",
        "i havent done it yet"
    ]):
        return "Parents communication: never experienced violence"

    # --------------------------------------------------------
    # 5. DOESN'T KNOW HOW TO EXPLAIN / LACK OF KNOWLEDGE
    # --------------------------------------------------------
    if any(w in text for w in [
        "dont know how to explain", "dont know what to say",
        "dont know how to talk", "i dont know where to start",
        "i have no idea", "i dont know about violence",
        "i dont know about these things",
        "i dont know about these things about preventing early pregnancy",
        "i dont know what its about", "lack of information"
        "cant do it because i dont have knowledge",
        "lack of knowledge"
    ]):
        return "Parents communication: does not know how to explain"

    # --------------------------------------------------------
    # 6. CULTURAL / TABOO / DISAGREEMENT / FAMILY DIALOGUE BARRIER
    # --------------------------------------------------------
    if any(w in text for w in [
        "taboo", "taboos", "culture", "cultural",
        "restrict matters", "ssr", "mother doesnt accept",
        "parents dont accept", "community culture",
        "disagreement", "due to disagreement",
        "lack of dialogue", "no understanding"
    ]):
        return "Parents communication: cultural or taboo barrier"

    # --------------------------------------------------------
    # 7. PARENTS UNAVAILABLE / ABSENT / NO TIME / NO OPPORTUNITY
    # --------------------------------------------------------
    if any(w in text for w in [
        "dad doesnt have time", "parents dont have time",
        "dont have time", "lack of family dialogue"
        "far away", "distance",
        "never had the chance",
        "i didnt have the chance", "i didnt have the opportunity",
        "i didnt have time", "i didnt spend much time",
        "i never had the opportunity to do this",
        "i dont have my parents alive",
        "didnt grow up with my parents", "not enough time",
        "didnt have chance", "have the chance", "have time",
        "havent"
    ]):
        return "Parents communication: parents unavailable"

    # --------------------------------------------------------
    # 8. PREFERS SIBLINGS / OTHER RELATIVES / FRIENDS
    # --------------------------------------------------------
    if any(w in text for w in [
        "talk to my sister", "talk to my older sister",
        "older sister", "older brother",
        "talk to my siblings", "talk to my brother",
        "i talk to my mother only",
        "friends", "my friends",
        "some nervous people"  # meaning unsafe to talk to parents
    ]):
        return "Parents communication: prefers siblings or friends"

    # --------------------------------------------------------
    # 9. LACK OF INTEREST / UNWILLINGNESS
    # --------------------------------------------------------
    if any(w in text for w in [
        "dont want", "i dont want", "i dont want to talk",
        "lack of will", "not interested",
        "never thought about it",
        "dont think about talking",
        "because i never mattered",
        "why not", "lack of time",
        "parents are boring", "lack of interest",
    ]):
        return "Parents communication: lack of interest"

    # --------------------------------------------------------
    # 10. VIOLENCE AS A BARRIER
    # --------------------------------------------------------
    if any(w in text for w in [
        "i suffered violence", "because i have suffered violence",
        "suffered violence"
    ]):
        return "Parents communication: violence barrier"

    # --------------------------------------------------------
    # 11. NO REASON / UNCLEAR
    # --------------------------------------------------------
    if any(w in text for w in [
        "i dont know", "dont know",
        "no reason", "nothing", "no comments"
    ]):
        return "Parents communication: no reason"

    return "Parents communication: not classified"

In [98]:
############################################################
# 6. SAVINGS SUPPORT NEEDED CLASSIFIER (FINAL + COMPLETE)
############################################################

def categorize_savings_support_needed(text):
    if pd.isna(text) or str(text).strip() == "":
        return "Savings support: no comment"

    text = normalize_text(text)

    # --------------------------------------------------------
    # A. Information, Training, Knowledge
    # --------------------------------------------------------
    if any(w in text for w in [
        "information", "more information", "need information",
        "training", "train", "treinamento", "formacao",
        "finance training", "financial training",
        "tips", "dicas", "advice", "just advice",
        "knowledge", "ideia", "iniciativa",
        "how to get started", "how to start",
        "help with ideas",
        "i would like to be trained",
        "support with school materials"  # capacity building
    ]):
        return "Savings support: information or training"

    # --------------------------------------------------------
    # B. Financial Capital or Material Support
    # --------------------------------------------------------
    if any(w in text for w in [
        "monetary", "money", "valor", "receive a value",
        "capital", "financial", "fund", "funds",
        "material needed", "material para", "safe material",
        "support with school materials",  # counted also here
        "help girls make savings",
        "projects support", "support is projects",
        "material"
    ]):
        return "Savings support: financial capital"

    # --------------------------------------------------------
    # C. Emotional or Psychological Support
    # --------------------------------------------------------
    if any(w in text for w in [
        "emocional", "emotional", "psychological",
        "psychological support", "emotional support"
    ]):
        return "Savings support: emotional or psychological support"

    # --------------------------------------------------------
    # D. Not participating / no experience / too young / no capacity
    # --------------------------------------------------------
    if any(w in text for w in [
        "havent participated", "havent participated",
        "never participated", "never",
        "no experience", "no capacity",
        "i am a child", "im a child",
        "child", "i dont have the capacity",
        "i cant participate", "i cant participate",
        "i cant afford", "i cant afford",
        "didnt participate", "didnt participate",
        "havent joined", "havent joined",
    ]):
        return "Savings support: no participation or no capacity"


    # --------------------------------------------------------
    # F. Don’t know / no idea
    # --------------------------------------------------------
    if any(w in text for w in [
        "dont know", "i dont know",
        "no idea", "never heard",
        "ainda n ouvi dizer", "none", 
        "nothing", "no type",
        "no comments", "i dont have",
        "there is nothing", "theres nothing",
        "outro", "no"
    ]):
        return "Savings support: no idea"

    # --------------------------------------------------------
    # G. General "support" (unspecified)
    # --------------------------------------------------------
    if any(w in text for w in [
        "support", "service support", "help",
        "may they continue", "continue more"
    ]):
        return "Savings support: general support"

    # --------------------------------------------------------
    # H. Encouraging participation of others
    # --------------------------------------------------------
    if any(w in text for w in [
        "i want girls", "i want young mothers",
        "girls as well as young mothers"
    ]):
        return "Savings support: expand participation"

    # --------------------------------------------------------
    # I. No explanation filler
    # --------------------------------------------------------

    return "Savings support: not classified"

In [99]:
############################################################
# 7. SAVINGS COMMENTS CLASSIFIER (FINAL)
############################################################

def categorize_savings_comments(text):
    if pd.isna(text) or str(text).strip() == "":
        return "Savings comments: no comment"

    t = normalize_text(text)

    # 1. No content / Nothing to say / Blank responses
    if any(w in text for w in [
        "no", "nothing", "nao", "none", 
        "does not have", "doesnt have", 
        "there is nothing", "it does not have",
        "no comments", "não tenho", 
        "i dont have", "i don't have",
        "i dont have anything to add",
        "i have nothing", "no comment", "no idea"
    ]):
        return "Savings comments: no comment"

    # 2. No experience / Never participated / Not part of group
    if any(w in text for w in [
        "never participated", "i never participated", 
        "never", "i havent participated", "i haven t participated",
        "i havent", "i havent", "i have not", 
        "not at the moment", "i havent joined",
        "havent heard", "never heard",
        "i am a child and i dont have the capacity",
        "i m a child", "i am a child"
    ]):
        return "Savings comments: no experience"

    # 3. Positive experience / gratitude / benefits
    if any(w in text for w in [
        "thank", "grateful", "appreciate", "good", 
        "helped", "very good", "independent", 
        "experiences", "good to participate", 
        "economically", "grow economically",
        "it has helped us a lot", "has helped us a lot",
        "its going to be very good", "it was good",
        "helped me", "i really like",
        "good changes", "satisfactory", "support they gave me",
        "continue to save", "i am very grateful"
    ]):
        return "Savings comments: positive experience"

    # 4. Group functioning / unity / discipline / seriousness
    if any(w in text for w in [
        "serious", "seriousness", "unity", "united",
        "stronger", "collaboration", "group grow",
        "more unity", "u groups be serious",
        "debt", "return it", "run away with money",
        "avoid unnecessary withdrawals", "pay the amount",
        "compliance", "people who carry debt",
        "serious people", "group participation is satisfactory"
    ]):
        return "Savings comments: group functioning"

    # 5. Suggestions for improvement
    if any(w in text for w in [
        "improve", "improvement", "continuation", "continue", 
        "strengthen", "awareness", "material",
        "cycle", "ensure continuation", "needs", 
        "should", "i can suggest", "further strengthen", 
        "may they continue", "continuing work",
        "better the meetings", "i would like",
        "needs to improve", "please improve"
    ]):
        return "Savings comments: suggestions for improvement"

    # 6. Financial or material needs
    if any(w in text for w in [
        "money", "goods", "household", "shoes",
        "financial support", "support for my business",
        "buying things", "help for school",
        "material", "value", "monetary"
    ]):
        return "Savings comments: financial or material needs"

    # 7. Encouraging participation / wanting others to join
    if any(w in text for w in [
        "more people show up", "girls should participate",
        "young mothers participate", "i want girls to participate",
        "anyone welcome to support"
    ]):
        return "Savings comments: encourage participation"

    # 8. Social support / solidarity
    if any(w in text for w in [
        "support", "tell us to support", "help each other",
        "talking to your friends", "support will be welcome"
    ]):
        return "Savings comments: social support"

    return "Savings comments: not classified"

In [100]:
############################################################
# 8. CHANGES SEEN CLASSIFIER (EXHAUSTIVE + CLEAN)
############################################################

def categorize_changes_seen_multi(text):
    categories = []

    text = normalize_text(text)

    if text == "":
        return ["Changes: no comment"]

    
    ############################################################
    # 1. NO CHANGE / DIDN'T PARTICIPATE / NOT AWARE
    ############################################################
    if any(w in text for w in [
        "didnt participate", "did not participate",
        "didnt", "havent heard",
        "for now none", "none", "nothing", "nada",
        "i did not see", "i didnt notice",
        "i havent noticed", "havent",
        "nunca", "no idea", "no change",
        "i am not aware", "not aware",
        "didnt observe", "did not observe",
        "i didnt find any changes",
        "i wasnt aware", "i was not aware",
        "first time to participate",
        "didnt see", "not any thing",
        "dont hear any changes", "dont see",
        "dont know", "never participated",
    ]):
        categories.append("Changes: no change or no knowledge")


    ############################################################
    # 2. REDUCTION IN EARLY / UNWANTED PREGNANCY
    ############################################################
    if any(w in text for w in [
        "pregnancy", "early pregnancies", "gravidez",
        "unwanted pregnancy", "unwanted pregnancies",
        "reduction in pregnancy", "reduction of pregnancy",
        "reduced pregnancy", "reduced pregnancies",
        "decrease in pregnancy", "fewer pregnant",
        "reduction in number of partners",  # often fertility-related
        "menor numero de casamentos prematuros",
        "premature pregnancies decreased",
        "reduced early pregnancies", "safe birth",
        "reduction in number of children",
        "pregnant early", "children born",
        "safe births", "reduce early pregnancies",
        "reduction in the number of early pregnancies"
    ]):
        categories.append("Changes: reduction in early or unwanted pregnancy")

    ############################################################
    # 3. REDUCTION IN PREMATURE UNIONS / CHILD MARRIAGE
    ############################################################
    if any(w in text for w in [
        "premature union", "premature marriages",
        "early marriage", "child marriage",
        "fewer marriages between minors",
        "girls are not getting married while minors",
        "reduced early marriage",
        "reduction in early marriage"
        "decrease in premature",
        "contributes greatly",
        "reduced number",
        "si boas mudancas",
        "no longer have many cases of premature marriages",
        "early marriage reduced", "fewer marriages"
    ]):
        categories.append("Changes: reduction in premature unions or child marriage")

    ############################################################
    # 4. REDUCTION IN VIOLENCE / IMPROVED SAFETY
    ############################################################
    if any(w in text for w in [
        "violence", "rape", "abuse",
        "safer", "security", "less risk",
        "risk of mortality", "reduced mortality",
        "mortality rate", "infant mortality",
        "rape of minors", "abuso", "sre protected",
        "less harassment", "not a lot of violence",
        "girls safety"
    ]):
        categories.append("Changes: reduction in violence or improved safety")

    ############################################################
    # 5. AWARENESS / KNOWLEDGE / RIGHTS UNDERSTANDING
    ############################################################
    if any(w in text for w in [
        "more insight", "more aware",
        "awareness", "knowledge", "rights",
        "girls right to choose", "entendimento",
        "opened minds", "opened some girls minds",
        "for us to know", "more information",
        "to gain knowledge", "conhecimento",
        "now i hear", "now i listen", "information",
        "perception", "entender", "understood",
        "more grip", "now i know", "many girls know they cannot",
        "raising awareness", "know what my rights", "gained a lot of knowledge",
        "aware", "pay more attention", "well informed",
    ]):
        categories.append("Changes: increased awareness and knowledge")

    ############################################################
    # 6. EDUCATION IMPROVEMENTS
    ############################################################
    if any(w in text for w in [
        "school", "studies", "study", "studying",
        "girls finishing school", "returned to school",
        "finish their studies", "focus on studies",
        "go to school", "back to school",
        "my children studied", "learn", "acces to information",
        "teaching", "more informed", "girls have returned to school",
        "focusing on studies"

    ]):
        categories.append("Changes: improved education for girls")

    ############################################################
    # 7. SRHR / HEALTH / PLANNING IMPROVEMENTS
    ############################################################
    if any(w in text for w in [
        "planning", "family planning",
        "sex life", "sexual life",
        "healthy", "health", "disease",
        "infection", "prevent illness",
        "reproductive health", "hiv", "personal hygiene",
        "establish link", "should prevent themselves",
        "link between saaj services and adolescent girls",
        "planeamento familiares"
        
    ]):
        categories.append("Changes: SRHR or health improvements")

    ############################################################
    # 8. EMPOWERMENT / CONFIDENCE / INDEPENDENCE
    ############################################################
    if any(w in text for w in [
        "freedom", "i now have the freedom",
        "psychologically active", "emotionally active",
        "i talk freely", "not ashamed anymore",
        "self esteem", "confidence",
        "progressing in life", "bright future",
        "entrepreneur", "business",
        "independent", "female empowerment",
        "more unity", "improved the life",
        "improve thesex", "overcome", "inclusion",
        "selfesteem", "my rights", 
        "improving girls future",
        "without fear of being judged"
    ]):
        categories.append("Changes: empowerment or socio-economic improvements")

    ############################################################
    # 9. BEHAVIOURAL CHANGE
    ############################################################
    if any(w in text for w in [
        "girls dont play badly", "stop playing badly",
        "dont play badly", "play badly",
        "girls behaving well", "girls acting well",
        "changed behavior", "changed attitudes",
        "girls stopped dating", "dont date",
        "discipline", "respect parents", "development",
        "behavior", "financial", "good behavior",
        "changed mentality", "lifestyle change",
        "acting well", "attitude", "mentality", "listen to parents",
        "children are behaving"
    ]):
        categories.append("Changes: behavioural change")

    ############################################################
    # 10. COMMUNITY SUPPORT / INVOLVEMENT
    ############################################################
    if any(w in text for w in [
        "community", "community support",
        "community helping", "neighborhood",
        "towing at homes", "houses", "trust",
        "confidence", "ligacao saaj", "calm and understanding"
    ]):
        categories.append("Changes: community support or involvement")

      ############################################################
    # 10. In the home 
    ############################################################
    if any(w in text for w in [
        "home improvements", "improvements at home",
        "better living conditions", "better home",
        "in the houses", "improving the relationship between parents and girls",
        "family dialoque"
    ]):
        categories.append("Changes: home improvements")

    ############################################################
    # 11. GENERAL POSITIVE CHANGE (UNSPECIFIC)
    ############################################################
    if any(w in text for w in [
        "many things", "a lot of things",
        "ideas", "one hundred comments", "thank you",
        "keep work", "continue", "helped a lot", "a lot has changed",
        "improvement", "better now", "much better",
        "things have improved", "good change", "i see change",
        "having results", "see some change", "improved",
        "helped", "positive", "helping", "changes", "it has changed",
        "doing good", "many changes", "sim", "decreased a lot"
    ]):
        categories.append("Changes: good changes (unspecific)")

    # -----------------------------------------
    # 1. REMOVE "NO CHANGE" IF OTHER SIGNALS EXIST
    # -----------------------------------------
    if "Changes: no change or no knowledge" in categories and len(categories) > 1:
        categories = [
            c for c in categories
            if c != "Changes: no change or no knowledge"
        ]

    # -----------------------------------------
    # 2. DEDUPLICATE WHILE PRESERVING ORDER
    # -----------------------------------------
    seen = set()
    out = []

    for c in categories:
        if c not in seen:
            out.append(c)
            seen.add(c)

    # -----------------------------------------
    # 3. FALLBACK
    # -----------------------------------------
    if not out:
        return ["Changes: no comment"]

    return out
        


In [101]:
############################################################
# 9. RECOMMENDATIONS CLASSIFIER — ULTRA EXPANDED
############################################################
def categorize_recommendations(text):
    categories = []
    text = normalize_text(text)

    if text == "":
        return ["Recommendations: none"]

    # =====================================================
    # 1. NONE / NO SUGGESTIONS
    # =====================================================
    if any(w in text for w in [
        "dont know", "don t know", "i dont know", "i don t know",
        "no idea", "no comments", "nothing", "none",
        "nunca", "i have no suggestions",
        "one hundred comments", "out of ideas",
        "i m out of ideas", "m out of ideas", "i dont have",
        "no recommendation", "no suggestions","out of ideas", 
        "m out of ideas", "i m out of ideas", "i do not have",
        "out of ideas", "no recommendations", "no idea",
        "not aware of this project", "dont know about this project"
    ]):
        categories.append("Recommendations: none")
    

    # =====================================================
    # 2. SERVICE QUALITY IMPROVEMENT
    # =====================================================
    if any(w in text for w in [
        "improve service", "improvement in services", "improve care",
        "better service", "good service",
        "health facility", "health facilities", "health unit",
        "clinic", "hospital", "monitoring of professionals",
        "better approaches", "improvements in service",
        "improving information for girls",
        "improve their service", "improvements in service", 
        "may there always be improvements", "serve us well", 
        "respect us in groups","they dont respond well", 
        "they do not respond well",
        "they say i m underage", "they say im underage",
        "atendimento", "health professionals", "unidade sanit", 
        "clinic", "sometimes it doesnt work well", 
        "sometimes it does not work well",
        "sometimes they send us back", "improvements only",
        "always work and improve", "valuation of girls",
        "always improve your service",
        "improve their service", "improvements only",
        "respect us in groups",
        "show confidence through professionalism",
    ]):
        categories.append("Recommendations: improve services")


    # =====================================================
    # 3. CONTINUE / EXPAND PROJECT / COME MORE OFTEN
    # =====================================================
    if any(w in text for w in [
        "continue", "continuar", "dont stop", "don t stop",
        "should not stop", "keep working", "keep going",
        "keep the project", "go forward", "go ahead",
        "strengthen", "expand project", "expanding the work",
        "come more often", "visit more often", "come by more often",
        "voltar sempre", "come back", "more work",
        "reach more areas", "reach more communities",
        "cover the entire country", "cover services",
        "open more projects", "create more projects",
        "increase activists", "hire more young", "hire more activists",
        "let them work harder", "project will face",
        "may many come", "be very successful",
        "come more often", "working in communities",
        "may many days come", "may many come",
        "let them come", "come to our community",
        "keep gathering", "keep visiting us",
        "cover services throughout the province", "intensify",
        "project should", "may many come", "be very successful"
        "happen more often", "wish the project more strength",
        "fazer mais vezes", "let there be more sections",
        "keep bringing other girls together",
        "keep motivating", "keep motivating but",
        "keep gathering the teenage girls",
        "project be for everyone", "continuation",
        "ensure continuation of services",
        "the project goes further", "project goes further",
        "we are learning a lot from your mobilization",
        "many teenage girls have returned to school",
        "happen more often", "better the meetings",
        "better the meetings", "project goes further",
        "continuation of the project",
        "continuing to do good work",
        "activities do not stop", "work harder",
        "do not stop with the sensibilizacoes",
        "come home so we can meet",
        "keep contact", "do not stop",
        "project helps girls",
        "work harder", "covers the entire",
        "but work", "bring other girls together",
        "may it happen more often", "continuing"
        "continuing awareness", "has the right to protect herself"
    ]):
        categories.append("Recommendations: continue or expand project")


    # =====================================================
    # 4. MORE LECTURES / INFORMATION / EDUCATION
    # =====================================================
    if any(w in text for w in [
        "lecture", "lectures", "palestra", "palestras",
        "sessions", "session", "talks", "campaign",
        "campaigns", "sensibilizar", "sensitization",
        "teach", "teaching", "education", "educate",
        "more information", "increase information",
        "explain more", "explain better",
        "get more in depth information",
        "more knowledge", "always educate", "better understanding",
        "more welcome", "more welcoming","inform",
        "give more advice", "training", "treinamento", "formacao",
        "section", "sections",
        "dig deeper into the subject", 
        "information to reach", "community information improvements",
        "always keep information",
        "that there is training so that the girls understand more about the subject",
        "mais entendimento sobre ssr",
        "ideas", "a lot of patience when talking to girls",
        "expand services"
    ]):
        categories.append("Recommendations: more lectures or information")


    # =====================================================
    # 5. EXPAND COVERAGE / ACCESS / HOUSE-TO-HOUSE
    # =====================================================
    if any(w in text for w in [
        "cover more communities", "expand", "expand information",
        "reach more areas", "reach more communities",
        "door to door", "house to house",
        "go house to house", "areas", "zones",
        "create more centers", "more access", "access to services",
        "project reach areas", "centers nationally",
        "increase number of activists", "activists",
        "more projects", "reach more", "reach the areas", "reach schools",
        "expand to communities", "expand to schools",
        "create more", "cover the province",
        "cover entire country", "que as redes sejam abrajentes",
        "grupo de", "more coverage", "more sessions in communities"
        "covers the entire country", "abrasive", "comprehensive",
        "covers the entire province","create service centers",
        "service centers", "more surveillance",
        
    ]):
        categories.append("Recommendations: expand coverage")


    # =====================================================
    # 6. EXPAND SRHR INFORMATION
    # =====================================================
    if any(w in text for w in [
        "sexual and reproductive", "srh", "ssr",
        "saaj", "health information",
        "prevent pregnancy", "prevent early pregnancy",
        "violence prevention", "gbv prevention",
        "sobre saude sexual",
        "girls have more knowledge", "education",
        "lectures", "srhr", "ssr", "sexual", "reproductive",
        "avoid pregnancy", "information about health services",
        "sex life", "saaj", "health information",
        "information about health", "contraceptive",
        "family planning", "implant", "hygiene",
        "prevent pregnancy", "prevent early pregnancy",
        "avoid unwanted early pregnancy", "avoid unwanted pregnancy",
        "subject of sex", "thesubject of sex"
    
    ]):
        categories.append("Recommendations: expand SRHR information")


    # =====================================================
    # 7. COMMUNITY / PARENTAL INVOLVEMENT
    # =====================================================
    if any(w in text for w in [
        "community", "involve the community",
        "community support", "our community",
        "parents", "pais", "mothers", "guardians",
        "family involvement", "visit homes",
        "participation of girls", "participation of young girls",
        "talk to guardians",
        "listen to girls needs and concerns",
        "family members", "pais", "maes",
        "monitor the growth of other adolescent girls",
        "keep bringing other girls together", 
        "benefit the community", "those in charge must have conversations",
        "the family should help girls", "we have to help",
        "take more care of women", "cessivizations in communities",
        "men have to help our partners", "respect each other"
    ]):
        categories.append("Recommendations: community or parental involvement")


    # =====================================================
    # 8. BEHAVIOURAL ADVICE TO GIRLS
    # =====================================================
    if any(w in text for w in [
        # Early pregnancy / marriage
        "avoid getting pregnant early", "dont get pregnant early",
        "avoid early pregnancy", "avoid pregnancy",
        "avoid getting married early", "dont get married early",
        "avoid dating early", "dont date early",
        "avoid early relationships", "advise them",
        "i recommend all girls", "recommend that every",
        "recommend to all girls", "take knowledge",
        "i would like to advise", "i would like to tell",
        "avoid early marriages", "avoid dating",
        "do not get married early",
        "not getting involved before the right age",
        "avoid dating before your age", "avoid dating early",
        "not to date early",
        "cant date early", "cannot date early",
        "shouldnt accept getting married early", "should not accept getting married early",
        "girls do not marry early before their age",
        "girl should not get married before the age of 20",
        "nobody gets married early before the age of 18",
        "ask teenage girls not to make hasty decisions",
        "avoid unwanted early pregnancy and premature marriages",
        "pay more attention to their studies",

        # Men / risky behaviour
        "avoid men", "careful with men", "play badly", "behavior",

        # School
        "study", "go back to school", "voltar a estudar",
        "finish studying", "continue with studies",
        "focus on studies", "school first", 
        "avoid premature marriage", "to study",
        "avoid early relationships",
        "gostaria de aconselhar", "continuing with studies",

        # more generic advice
        "girls should", "girls must", "advise girls",
        "advise minors", "be careful", "behaviour",
        "put into practice", "follow the recommendation",
        "dont date", "avoid getting involved early", 
        "should not get involved early",
        "i would like to advise", "i would like to tell them",
        "i would like to advise minors",
        "i recommend that girls", "i recommend that all girls",
        "i recommend that every girl",
        "follow the recommendation of the tsogolo tsicana project",
        "each girl puts into practice the information",
        "take care of their health",
        "support all girls to achieve their goals",
        "repent",
        "protect themselves",
        "behaviors",
        "children are already going to school",
    ]):
        categories.append("Recommendations: behavioural advice to girls")


    # =====================================================
    # 9. SAVINGS / ECONOMIC SUPPORT
    # =====================================================
    if any(w in text for w in [
        "savings", "savings group", "grupo de poupanca",
        "funds", "financial support", "economic support",
        "business", "lack of finances", "help with money",
    ]):
        categories.append("Recommendations: savings or economic support")


    # =====================================================
    # 10. VIOLENCE / GBV RESPONSE
    # =====================================================
    if any(w in text for w in [
        "violence", "gbv", "report violence",
        "prevent violence", "support against violence",
        "protection", "protect girls",
        "complaints mechanism", "gender based",
        "rape", "abuse", "complaints mechanism"
    ]):
        categories.append("Recommendations: violence or GBV response")


    # =====================================================
    # 11. COUNSELLING / SUPPORT CENTERS
    # =====================================================
    if any(w in text for w in [
        "counseling", "counselling",
        "support center", "care center",
        "psychological", "emotional support",
        "self esteem", "talk to someone",
        "help girls stand tall",
        "texting services", "more support", 
        "reinforcement", "support", "may they always help us", 
        "devemos nos ajudar", "support and protection",
        "valuation of girls"
        "ask for advice",
        "apoiar",
        "stand tall",
    ]):
        categories.append("Recommendations: counselling or support centers")


    # =====================================================
    # 12. GRATITUDE
    # =====================================================
    if any(w in text for w in [
        "thank", "thanks", "thank you", "grateful",
        "i thank", "obrigado", "appreciate",
        "very good", "good job", "agradecer",
        "go ahead", "keep it up", "gracas a projeto",
        "worked well"
    ]):
        categories.append("Recommendations: gratitude for project")


    # -----------------------------------------
    # 1. REMOVE "NO CHANGE" IF OTHER SIGNALS EXIST
    # -----------------------------------------
    if "Recommendations: none" in categories and len(categories) > 1:
        categories = [
            c for c in categories
            if c != "Recommendations: none"
        ]

    # -----------------------------------------
    # 2. DEDUPLICATE WHILE PRESERVING ORDER
    # -----------------------------------------
    seen = set()
    out = []

    for c in categories:
        if c not in seen:
            out.append(c)
            seen.add(c)

    # -----------------------------------------
    # 3. FALLBACK
    # -----------------------------------------
    if not out:
        return ["Recommendations: none"]

    return out

    

# Load parquet 

In [102]:
META_COLS = ["id", "district", "community", "participant_profile"]
OPEN_COLS = ["changes_seen", "recommendations"]

def extract_open_text(df, source_label):
    cols = META_COLS + [c for c in OPEN_COLS if c in df.columns]

    out = (
        df[cols]
        .melt(
            id_vars=META_COLS,
            value_vars=[c for c in OPEN_COLS if c in df.columns],
            var_name="text_source",
            value_name="text"
        )
        .dropna(subset=["text"])
    )

    out["source"] = source_label
    return out

df1 = pd.read_parquet(
    "/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/open_text/q1_open_ended.parquet"
)

df4 = pd.read_parquet(
    "/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/open_text/q456_open_ended.parquet"
)

df_long = pd.concat(
    [df1, df4],
    ignore_index=True
)

df_long["text"] = df_long["text"].apply(normalize_text)
#df_long.to_parquet("/Users/anamargarida/Local/Coding/project/data_clean/full_text.parquet")

# Apply normalization

In [103]:
############################################################
# ROUTER (ADD CHANGES + RECOMMENDATIONS)
############################################################
CATEGORY_ROUTER = {
    "savings_participation_comment": categorize_savings_comments,
    "support_improvement": categorize_savings_support_needed,
    "talked_to_parents_comment": categorize_talked_to_parents_reason,
    "att_community_engage_gbv_comment": categorize_att_community_engage_gbv,
    "att_child_marriage_comment": categorize_att_child_marriage,
    "att_free_safe_access_comment": categorize_att_free_safe_access,

    # add these:
    "changes_seen": categorize_changes_seen_multi,     # or your single-label categorize_changes_seen
    "recommendations": categorize_recommendations,
}


def apply_categorisation(row):
    func = CATEGORY_ROUTER.get(row["text_type"])
    if func is None:
        return np.nan
    return func(row["text"])


# IMPORTANT: category must be object dtype because it's lists
df_long["category"] = pd.Series([None] * len(df_long), dtype="object")
df_long["category"] = df_long.apply(apply_categorisation, axis=1)

# explore rec and changes

In [104]:
df_check = df_long[
    df_long["text_type"].isin(["changes_seen", "recommendations"])
].copy()

df_check["category"].apply(type).value_counts()

category
<class 'list'>    1866
Name: count, dtype: int64

In [105]:
df_check["n_categories"] = df_check["category"].apply(
    lambda x: len(x) if isinstance(x, list) else 1
)

df_check["n_categories"].value_counts().sort_index()

n_categories
1    1218
2     437
3     167
4      40
5       4
Name: count, dtype: int64

In [106]:
# Make sure category is always a list
df_long["category"] = df_long["category"].apply(
    lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])
)

In [107]:
df_long["n_categories"] = df_long["category"].apply(len)

In [108]:
cols_to_export = [
    "id",
    "district",
    "community",
    "participant_profile",
    "source",
    "text_type",
    "text",
    "category",
    "n_categories",
]

df_explore = df_long[cols_to_export].copy()

df_explore["category"]

0                        [Free access: support for girls]
1                             [Free access: rights based]
2                             [Free access: rights based]
3                             [Free access: rights based]
4                        [Free access: support for girls]
                              ...                        
4393    [Recommendations: more lectures or information...
4394    [Recommendations: continue or expand project, ...
4395       [Recommendations: behavioural advice to girls]
4396    [Recommendations: expand SRHR information, Rec...
4397        [Recommendations: continue or expand project]
Name: category, Length: 4398, dtype: object

In [109]:
df_long["category"].apply(type).value_counts()

category
<class 'list'>    4398
Name: count, dtype: int64

In [125]:
df_long["category_str"] = df_long["category"].apply(
    lambda x: "; ".join(x) if x else "Not classified"
)

df_explore.to_excel(
    "/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/text_categorised/classification_audit.xlsx",
    index=False
)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

## 3. Mandatory checks (do not skip)

In [126]:
# Distribution check
df_long.groupby("text_type")["category"].apply(lambda x: x.isna().sum())

text_type
att_child_marriage_comment          0
att_community_engage_gbv_comment    0
att_free_safe_access_comment        0
changes_seen                        0
recommendations                     0
savings_participation_comment       0
support_improvement                 0
talked_to_parents_comment           0
Name: category, dtype: int64

## 4. Explode categories

In [None]:
df_long.to_excel(
    "/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/text_categorised/text_classified_long.xlsx",
    index=False
)

df_long.to_parquet(
    "/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/text_categorised/text_classified_long.parquet",
    index=False
)

In [127]:
import pandas as pd

# Load the classified long data
df_long = pd.read_parquet("/Users/anamargarida/Local/Coding/Tsogolo Tsicana/data_clean/text_categorised/text_classified_long.parquet")

# Quick check
df_long.columns

Index(['id', 'district', 'community', 'participant_profile', 'text_type',
       'text', 'source', 'category', 'n_categories', 'category_str'],
      dtype='object')

In [128]:
import pandas as pd

df = df_long.copy()

df = df.rename(columns={
    "participant_profile": "group",
    "text_type": "type",
    "category": "theme",
    "id": "resp_id"
})

# Clean
for c in ["group", "type", "theme"]:
    df[c] = df[c].astype(str).str.strip()

# Drop empty themes
df = df[df["theme"].notna() & (df["theme"] != "")]

DROP_THEMES = {
    "Not classified",
    "No comment",
    "Prefer not to say",
    "Other"
}
df = df[~df["theme"].isin(DROP_THEMES)]


def build_matrix(df, text_type_value):
    d = df[df["type"] == text_type_value].copy()

    denom = (
        d[["group", "resp_id"]]
        .drop_duplicates()
        .groupby("group")["resp_id"]
        .nunique()
        .rename("denom")
        .reset_index()
    )

    num = (
        d[["group", "theme", "resp_id"]]
        .drop_duplicates()
        .groupby(["group", "theme"])["resp_id"]
        .nunique()
        .rename("n")
        .reset_index()
    )

    out = num.merge(denom, on="group", how="left")
    out["pct"] = (out["n"] / out["denom"] * 100).round(1)

    return out.sort_values(["group", "pct", "n"], ascending=[True, False, False])


def to_wide(df_long):
    return (
        df_long
        .pivot_table(index="theme", columns="group", values="pct", fill_value=0)
        .reset_index()
    )

In [129]:
annex_f_changes = build_matrix(df, "changes_seen")
annex_f_recs = build_matrix(df, "recommendations")
annex_f_support = build_matrix(df, "support_improvement")

with pd.ExcelWriter("/Users/anamargarida/Local/Coding/Tsogolo Tsicana/Annex/Annex_F_OpenEnded_Indicator_Matrices.xlsx", engine="openpyxl") as writer:

    annex_f_changes.to_excel(writer, sheet_name="F1_Changes_long", index=False)
    to_wide(annex_f_changes).to_excel(writer, sheet_name="F1_Changes_wide", index=False)

    annex_f_recs.to_excel(writer, sheet_name="F2_Recommendations_long", index=False)
    to_wide(annex_f_recs).to_excel(writer, sheet_name="F2_Recommendations_wide", index=False)

    annex_f_support.to_excel(writer, sheet_name="F3_SupportNeeds_long", index=False)
    to_wide(annex_f_support).to_excel(writer, sheet_name="F3_SupportNeeds_wide", index=False)

print("Saved: Annex_F_OpenEnded_Indicator_Matrices.xlsx")

Saved: Annex_F_OpenEnded_Indicator_Matrices.xlsx


In [144]:
import pandas as pd

df = df_long.copy()

df = df.rename(columns={
    "participant_profile": "group",
    "text_type": "type",
    "category": "theme",
    "id": "resp_id"
})

for c in ["group", "type", "theme"]:
    df[c] = df[c].astype(str).str.strip()

df = df[df["theme"].notna() & (df["theme"] != "")]

# Optional: keep "Other" for these explanation annexes (often useful),
# but drop "Not classified" and true non-answers.
DROP_THEMES = {"Not classified", "No comment", "Prefer not to say"}
df = df[~df["theme"].isin(DROP_THEMES)]


def build_matrix(df, text_type_value):
    d = df[df["type"] == text_type_value].copy()

    denom = (
        d[["group", "resp_id"]]
        .drop_duplicates()
        .groupby("group")["resp_id"]
        .nunique()
        .rename("denom")
        .reset_index()
    )

    num = (
        d[["group", "theme", "resp_id"]]
        .drop_duplicates()
        .groupby(["group", "theme"])["resp_id"]
        .nunique()
        .rename("n")
        .reset_index()
    )

    out = num.merge(denom, on="group", how="left")
    out["pct"] = (out["n"] / out["denom"] * 100).round(1)

    return out.sort_values(["group", "pct", "n"], ascending=[True, False, False])


def to_wide(df_long):
    return (
        df_long
        .pivot_table(index="theme", columns="group", values="pct", fill_value=0)
        .reset_index()
    )


EXTRA_TYPES = [
    "att_child_marriage_comment",
    "att_free_safe_access_comment",
    "att_community_engage_gbv_comment",
    "talked_to_parents_comment",
    "savings_participation_comment"
]

type_labels = {
    "att_child_marriage_comment": "G1_ChildMarriage_reasoning",
    "att_free_safe_access_comment": "G2_FreeAccess_reasoning",
    "att_community_engage_gbv_comment": "G3_CommunityRoleGBV_reasoning",
    "talked_to_parents_comment": "G4_ParentChildDialogue_barriers",
    "savings_participation_comment": "G5_SavingsParticipation_reasoning"
}

out_path = "/Users/anamargarida/Local/Coding/Tsogolo Tsicana/Annex/Annex_G_OpenEnded_Rationales_and_Barriers.xlsx"

with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    for t in EXTRA_TYPES:
        tbl = build_matrix(df, t)
        base = type_labels.get(t, t)

        tbl.to_excel(writer, sheet_name=f"{base}_long"[:31], index=False)
        to_wide(tbl).to_excel(writer, sheet_name=f"{base}_wide"[:31], index=False)

print(f"Saved: {out_path}")

Saved: /Users/anamargarida/Local/Coding/Tsogolo Tsicana/Annex/Annex_G_OpenEnded_Rationales_and_Barriers.xlsx


In [138]:
ATTITUDE_ROUTER = {
    "att_child_marriage_comment": "att_child_marriage",
    "att_free_safe_access_comment": "att_free_safe_access",
    "att_community_engage_gbv_comment": "att_community_engage_gbv",
}

def build_matrix_by_attitude(df, text_type_value, attitude_col, labels=None):
    d = df[df["type"] == text_type_value].copy()

    # keep only rows where the closed item exists
    d = d[d[attitude_col].notna()].copy()

    # map 0/1 to readable labels
    if labels is None:
        labels = {0: "Disagree", 1: "Agree"}

    d["attitude"] = d[attitude_col].map(labels)

    # denominator per attitude
    denom = (
        d[["attitude", "resp_id"]]
        .drop_duplicates()
        .groupby("attitude")["resp_id"]
        .nunique()
        .rename("denom")
        .reset_index()
    )

    # numerator per attitude × theme
    num = (
        d[["attitude", "theme", "resp_id"]]
        .drop_duplicates()
        .groupby(["attitude", "theme"])["resp_id"]
        .nunique()
        .rename("n")
        .reset_index()
    )

    out = num.merge(denom, on="attitude", how="left")
    out["pct"] = (out["n"] / out["denom"] * 100).round(1)

    return out.sort_values(["attitude", "pct", "n"], ascending=[True, False, False])

In [139]:
def build_matrix_by_attitude(df, text_type_value, attitude_col, labels=None):
    d = df[df["type"] == text_type_value].copy()

    # keep only rows where the closed item exists
    d = d[d[attitude_col].notna()].copy()

    # map 0/1 to readable labels
    if labels is None:
        labels = {0: "Disagree", 1: "Agree"}

    d["attitude"] = d[attitude_col].map(labels)

    # denominator per attitude
    denom = (
        d[["attitude", "resp_id"]]
        .drop_duplicates()
        .groupby("attitude")["resp_id"]
        .nunique()
        .rename("denom")
        .reset_index()
    )

    # numerator per attitude × theme
    num = (
        d[["attitude", "theme", "resp_id"]]
        .drop_duplicates()
        .groupby(["attitude", "theme"])["resp_id"]
        .nunique()
        .rename("n")
        .reset_index()
    )

    out = num.merge(denom, on="attitude", how="left")
    out["pct"] = (out["n"] / out["denom"] * 100).round(1)

    return out.sort_values(["attitude", "pct", "n"], ascending=[True, False, False])

In [146]:
if t in ATTITUDE_ROUTER:
    att_col = ATTITUDE_ROUTER[t]

    # set labels correctly per item
    if att_col == "att_child_marriage":
        labels = {0: "Rejects child marriage", 1: "Accepts child marriage"}
    else:
        labels = {0: "Disagree", 1: "Agree"}

    tbl_att = build_matrix_by_attitude(df, t, att_col, labels=labels)

    base = type_labels.get(t, t)
    tbl_att.to_excel(writer, sheet_name=f"{base}_byAtt_long"[:31], index=False)

    # wide view: theme × attitude
    (
        tbl_att.pivot_table(index="theme", columns="attitude", values="pct", fill_value=0)
        .reset_index()
        .to_excel(writer, sheet_name=f"{base}_byAtt_wide"[:31], index=False)
    )

    tbl_att

In [141]:
import pandas as pd
import numpy as np

df = df_long.copy()

df = df.rename(columns={
    "participant_profile": "group",
    "text_type": "type",
    "category": "theme",
    "id": "resp_id"
})

TARGET_TYPE = "talked_to_parents_comment"
d = df[df["type"] == TARGET_TYPE].copy()

# make sure theme is list-like, then explode to one theme per row
d["theme"] = d["theme"].apply(
    lambda x: x if isinstance(x, list) else ([] if pd.isna(x) else [x])
)
d = d.explode("theme").dropna(subset=["theme"])

# remove empty / junk categories
DROP = {"Not classified", "No comment", "Prefer not to say"}
d["theme"] = d["theme"].astype(str).str.strip()
d = d[~d["theme"].isin(DROP)]

# denominator PER GROUP
denom = (
    d[["group", "resp_id"]]
    .drop_duplicates()
    .groupby("group")["resp_id"]
    .nunique()
    .rename("denom")
    .reset_index()
)

# numerator PER GROUP × THEME
num = (
    d[["group", "theme", "resp_id"]]
    .drop_duplicates()
    .groupby(["group", "theme"])["resp_id"]
    .nunique()
    .rename("n")
    .reset_index()
)

tbl_long = num.merge(denom, on="group", how="left")
tbl_long["pct"] = (tbl_long["n"] / tbl_long["denom"] * 100).round(1)

tbl_long = tbl_long.sort_values(["group", "pct"], ascending=[True, False])
tbl_long

Unnamed: 0,group,theme,n,denom,pct
10,adolescent girl,['Parents communication: too young or minor'],38,147,25.9
2,adolescent girl,['Parents communication: fear'],23,147,15.6
9,adolescent girl,['Parents communication: shame'],23,147,15.6
7,adolescent girl,['Parents communication: parents unavailable'],18,147,12.2
6,adolescent girl,['Parents communication: not classified'],10,147,6.8
5,adolescent girl,['Parents communication: no reason'],9,147,6.1
8,adolescent girl,['Parents communication: prefers siblings or f...,7,147,4.8
1,adolescent girl,['Parents communication: does not know how to ...,6,147,4.1
3,adolescent girl,['Parents communication: lack of interest'],6,147,4.1
4,adolescent girl,['Parents communication: never experienced vio...,5,147,3.4


In [142]:
tbl_wide = (
    tbl_long
    .pivot(index="theme", columns="group", values="pct")
    .fillna(0)
    .round(1)
    .reset_index()
)

tbl_wide

group,theme,adolescent girl,adolescent mother,young mother,young woman (no child)
0,['Parents communication: cultural or taboo bar...,1.4,0.0,13.6,8.3
1,['Parents communication: does not know how to ...,4.1,0.0,4.5,0.0
2,['Parents communication: fear'],15.6,0.0,13.6,8.3
3,['Parents communication: lack of interest'],4.1,0.0,4.5,0.0
4,['Parents communication: never experienced vio...,3.4,0.0,4.5,0.0
5,['Parents communication: no reason'],6.1,0.0,0.0,0.0
6,['Parents communication: not classified'],6.8,0.0,4.5,8.3
7,['Parents communication: parents unavailable'],12.2,33.3,13.6,0.0
8,['Parents communication: prefers siblings or f...,4.8,0.0,0.0,16.7
9,['Parents communication: shame'],15.6,33.3,31.8,50.0


In [143]:
df_long["child_marriage_attitude"] = df["child_marriage_attitude"].map({
    0: "Rejects child marriage",
    1: "Accepts child marriage"
})

KeyError: 'child_marriage_attitude'

In [None]:
df_long = df_long[df_long["child_marriage_attitude"].notna()]

In [None]:
d = df_long[df_long["text_type"] == "att_child_marriage_comment"].copy()

In [None]:
DROP = {"Child marriage: Not classified", "Child marriage: No explanation"}
d = d[d["category"].notna() & (~d["category"].isin(DROP))]

In [None]:
denom = (
    d[["child_marriage_attitude", "id"]]
    .drop_duplicates()
    .groupby("child_marriage_attitude")["id"]
    .nunique()
    .rename("denom")
    .reset_index()
)

In [None]:
num = (
    d[["child_marriage_attitude", "category", "id"]]
    .drop_duplicates()
    .groupby(["child_marriage_attitude", "category"])["id"]
    .nunique()
    .rename("n")
    .reset_index()
)

In [None]:
tbl_rationales = num.merge(denom, on="child_marriage_attitude", how="left")
tbl_rationales["pct"] = (tbl_rationales["n"] / tbl_rationales["denom"] * 100).round(1)

tbl_rationales = tbl_rationales.sort_values(
    ["child_marriage_attitude", "pct"],
    ascending=[True, False]
)
tbl_rationales