In [1]:
# Libraries that we will use, if you are missing a library,
# create a new cell with e.g.:
#   !pip install NAME_OF_MISSING
# where NAME_OF_MISSING is the library that you are missing.

import google.generativeai as genai
from tqdm import tqdm
import numpy as np
import pandas as pd
import typing_extensions as typing
import json
import random
import seaborn as sns
import matplotlib.pyplot as plt

## Exercise 1: Initialize the Generative Model and conduct a basic promt

Let's start by veryifying that we can initialize and call a model. Ask it to write a poem about your country of origin.

In [2]:
import google.generativeai as genai

genai.configure(api_key="Your API Key here")  # Replace with your actual API key


In [3]:
# Use a supported model.
# model = genai.GenerativeModel(model_name="gemini-2.0-flash")
model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20")
response = model.generate_content("Escribe un poema corto sobre pozole verde")
print(response.text)

En el tazón, un verde intenso,
maíz cacahuazintle, tierno y denso.
Con hebras de cerdo, suave y jugoso,
un aroma que invita, ¡qué delicioso!

Tomatillo y chile, secreto del color,
epazote que canta, con gran fervor.
Un toque de picor que el alma enciende,
cada cucharada, el gusto sorprende.

Rábanos rojos, lechuga fresca,
limón que el paladar refresca.
Tradición mexicana, con devoción,
¡Pozole verde, pura pasión!


In [None]:
import google.generativeai as genai

# For Gemini 1.5 Pro
# model = genai.GenerativeModel(model_name="gemini-1.5-pro")

# Alternative model 2
# model = genai.GenerativeModel(model_name="gemini-2.5-flash-preview-05-20")

# Alternative model 3
model = genai.GenerativeModel(model_name="gemini-2.0-flash")

response = model.generate_content("Escribe un poema corto sobre Leo Messi")
print(response.text)

## Exercise 5: World Health Organization-Five Well-Being Index (WHO-5)

In the following code.

1. We will randomly select "n" individual responses
2. We will pass demographics features to the AI model
3. Wi will generate synthetic responses based on real demographics
4. Compare real results with synthetic results

In [21]:
## sample data from the WHO
import pandas as pd

# Load the CSV file
who5 = pd.read_csv("https://www.dropbox.com/scl/fi/c11o0hb87n3w9n0mmdtr3/data_who5.csv?rlkey=2qbhjdn4pkwfql6i2n4xilouy&dl=1")

### **Function for mapping variable sex**

In [22]:
####  function for maping the column SEXO into textual values
import pandas as pd

def map_sexo_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Maps numeric SexUser values to categorical labels.

    Parameters:
        df (pd.DataFrame): A DataFrame containing the column 'SEXO'

    Returns:
        pd.DataFrame: A copy of the DataFrame with 'SEXO' values mapped
    """
    sexo_map = {
        1: "Hombre",
        2: "Mujer"
    }
    df = df.copy()  # Avoid modifying the original DataFrame
    df["SexUser"] = df["SexUser"].map(sexo_map)
    return df

In [23]:
who5 = map_sexo_column(who5)

## The World Health Organization-Five Well-Being Index (WHO-5)

The WHO-5 is a self-report instrument measuring mental well-being. It consists of five statements relating to the past two weeks. Each statement is rated on a 6-point scale, with higher scores indicating better mental well-being. The instrument has been translated into over 30 languages..


| **ITEM** | **Description** |
|----------------------|-----------------|
| **Por favor, indique para cada una de las cinco afirmaciones cual define mejor como se ha sentido usted durante la últimas dos semanas** |  |
| **WHO1** | Me he sentido alegre y de buen humor  |
| **WHO2** | Me he sentido tranquilo y relajado. |
| **WHO3#** | Me he sentido activo y enérgico. |
| **WHO4** | Me he despertado fresco y descansaso. |
| **WHO%** | Mi vida cotidiana ha estado llena de cosas que me interesan |
| **PPIT 4** | Retrieve highly personal and password-protected financial |


Detailed information about the WHO5 is available in the following link: [Five Well-Being Index (WHO-5)](https://www.who.int/publications/m/item/WHO-UCN-MSD-MHE-2024.01)

The questionnaire in Spanish is available for download at:  [WHO5-questionnaire](https://cdn.who.int/media/docs/default-source/mental-health/oms-(cinco)-indice-de-bienestar-(oms-5).pdf?sfvrsn=ed43f352_11&download=true)

### **Randomly selecting n observations for the WHO5 survey**

In [39]:
# Randomly select n rows
who5_sampled = who5.sample(n=5, random_state=965)  # Set random_state for reproducibility

In [40]:
who5_sampled

Unnamed: 0,ID,WHO1,WHO2,WHO3,WHO4,WHO5,SexUser,Age
285,286,0,2,2,2,0,Mujer,20
97,98,5,5,5,4,4,Mujer,18
114,115,2,0,0,2,3,Hombre,21
29,30,2,2,2,3,4,Hombre,20
82,83,5,0,3,2,3,Mujer,30


In [41]:
import pandas as pd
import json
from typing_extensions import TypedDict
import google.generativeai as genai

# Define expected response format
class WHO5Response(TypedDict):
    WHO1: int
    WHO2: int
    WHO3: int
    WHO4: int
    WHO5: int

# Survey questions as text
survey_questions = """
Durante las últimas dos semanas: Usa la siguiente escala 1 (Nunca) a 5 (Todo el tiempo).
1. Me he sentido alegre y de buen humor.
2. Me he sentido tranquilo y relajado.
3. Me he sentido activo y enérgico.
4. Me he despertado fresco y descansado.
5. Mi vida cotidiana ha estado llena de cosas que me interesan.
Siempre responde con un número del 1 al 5 para cada pregunta.
"""

# Mapping numeric strings to Likert scale labels
response_key_mapping = {
    '5': 'Todo_el_tiempo',
    '4': 'La_mayoria_del_tiempo',
    '3': 'Mas_de_la_mitad_del_tiempo',
    '2': 'De_vez_en_cuando',
    '1': 'Nunca'
}

# Store all responses
responses = []

# Loop through each sampled person
for _, row in who5_sampled.iterrows():
    # Customize system prompt for each person
    system_prompt = f"Tu eres {row['SexUser']} de {row['Age']} años."

    # Initialize the generative model with the system persona
    model = genai.GenerativeModel(
        model_name="gemini-2.5-flash-preview-05-20",
        system_instruction=system_prompt
    )

    # Construct user prompt to get Likert-style answers
    user_prompt = f"""Por favor, responde en formato JSON con las claves WHO1 a WHO5, cada una con un valor entre 1 y 5.
    {survey_questions}
    """

    # Attempt content generation
    try:
        response = model.generate_content(
            user_prompt,
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json"
            )
        )

        # Parse the JSON response
        result = json.loads(response.text)

        # Collect and store the record
        responses.append({
            "SexUser": row["SexUser"],
            "Age": row["Age"],
            "WHO1": result.get("WHO1"),
            "WHO2": result.get("WHO2"),
            "WHO3": result.get("WHO3"),
            "WHO4": result.get("WHO4"),
            "WHO5": result.get("WHO5")
        })

    except Exception as e:
        # Handle failure by storing NaN or fallback values
        responses.append({
            "SexUser": row["SexUser"],
            "Age": row["Age"],
            "WHO1": None,
            "WHO2": None,
            "WHO3": None,
            "WHO4": None,
            "WHO5": None
        })
        print(f"Error with row {row.to_dict()}: {e}")

# Convert to DataFrame
who5_responses_df = pd.DataFrame(responses)
print(who5_responses_df.head())

  SexUser  Age  WHO1  WHO2  WHO3  WHO4  WHO5
0   Mujer   20     4     3     4     3     4
1   Mujer   18     4     3     4     3     4
2  Hombre   21     4     3     4     3     4
3  Hombre   20     4     3     4     3     4
4   Mujer   30     4     3     4     3     4


In [30]:
import numpy as np
import pandas as pd

def calculate_rmse_per_item(who5_sampled: pd.DataFrame, who5_responses_df: pd.DataFrame) -> dict:
    """
    Computes RMSE for each WHO item (WHO1 to WHO5) and overall RMSE.

    Parameters:
    - who5_sampled: pd.DataFrame with columns WHO1 to WHO5 (ground truth)
    - who5_responses_df: pd.DataFrame with columns WHO1 to WHO5 (predictions)

    Returns:
    - dict: RMSE values per item and overall, e.g. {"WHO1": 0.6, ..., "Overall": 0.72}
    """
    who_columns = ["WHO1", "WHO2", "WHO3", "WHO4", "WHO5"]
    rmse_results = {}

    for col in who_columns:
        if col not in who5_sampled.columns or col not in who5_responses_df.columns:
            raise ValueError(f"Missing column: {col} in one of the DataFrames.")

        # Compute RMSE per column (ignores NaNs)
        diffs = who5_sampled[col] - who5_responses_df[col]
        mse = np.nanmean(diffs ** 2)
        rmse_results[col] = np.sqrt(mse)

    # Compute overall RMSE (flatten all values)
    squared_diffs = (who5_sampled[who_columns] - who5_responses_df[who_columns]) ** 2
    overall_rmse = np.sqrt(np.nanmean(squared_diffs.values))
    rmse_results["Overall"] = overall_rmse

    return rmse_results


In [34]:
rmse_dict = calculate_rmse_per_item(who5_sampled, who5_responses_df)

# Print nicely
for item, value in rmse_dict.items():
    print(f"{item}: {value:.4f}")


WHO1: nan
WHO2: nan
WHO3: nan
WHO4: nan
WHO5: nan
Overall: nan


  mse = np.nanmean(diffs ** 2)
  overall_rmse = np.sqrt(np.nanmean(squared_diffs.values))


In [35]:
who5_sampled

Unnamed: 0,ID,WHO1,WHO2,WHO3,WHO4,WHO5,SexUser,Age
267,268,0,3,2,3,4,Hombre,19
132,133,3,4,2,2,2,Hombre,24
53,54,5,5,5,5,0,Mujer,33
190,191,4,4,4,4,3,Hombre,24
171,172,5,4,5,4,4,Mujer,32


In [36]:
who5_responses_df

Unnamed: 0,SexUser,Age,WHO1,WHO2,WHO3,WHO4,WHO5
0,Hombre,19,4,3,4,3,4
1,Hombre,24,4,3,4,3,4
2,Mujer,33,4,3,4,3,4
3,Hombre,24,4,3,4,3,4
4,Mujer,32,4,3,4,3,4
