In [1]:
from huggingface_hub import login
from google.colab import userdata
# Fetch the Hugging Face access token stored under the Colab secret
# 'huggingface_llama32' and authenticate this session with it.
huggingface_token = userdata.get("huggingface_llama32")
login(token=huggingface_token)


In [2]:
import pandas as pd
import json
import torch
from transformers import pipeline
from tqdm import tqdm  # Import tqdm for progress bars

# Source data: the EPIWATCH release CSV, read directly from its raw GitHub URL.
url = "https://raw.githubusercontent.com/CBDRH/hds-datathon-data/refs/heads/main/release/epiwatch-latest.csv"
data = pd.read_csv(url)

# Number the rows 1..N so that a batch can be selected by id range.
data["id"] = range(1, len(data) + 1)

# Keep only the batch handled by this run: ids 30001 through 35000, both ends included.
filtered_data = data.loc[(data["id"] >= 30001) & (data["id"] <= 35000)]

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Keyword arguments that are unpacked into extract_information_from_dataframe().
parameters = dict(
    model_name="meta-llama/Llama-3.2-3B-Instruct",
    device=device,
    torch_dtype=torch.bfloat16,
    temperature=0.2,
    top_p=0.9,
    max_new_tokens=512,
    text_column="title",  # column that holds the text to analyse
    id_column="id",       # column that holds the row identifier
)

def extract_information_from_dataframe(
    dataframe,
    model_name,
    device,
    torch_dtype,
    temperature,
    top_p,
    max_new_tokens,
    text_column,
    id_column
):
    """
    Extract structured information from a DataFrame using a language model pipeline.

    A "text-generation" pipeline is created once, then every row of
    `dataframe` is sent to it as a two-message chat (a fixed system prompt
    with few-shot examples, plus a user message quoting the row's text).

    Args:
        dataframe: pandas DataFrame holding the rows to process.
        model_name: model identifier passed to `pipeline(model=...)`.
        device: device argument passed to `pipeline(device=...)`.
        torch_dtype: dtype argument passed to `pipeline(torch_dtype=...)`.
        temperature: sampling temperature for generation.
        top_p: nucleus-sampling threshold for generation.
        max_new_tokens: maximum number of tokens generated per row.
        text_column: name of the column whose text is analysed.
        id_column: name of the column holding the row identifier; it is also
            used as the key of the identifier in each result record.

    Returns:
        A list with one dict per row, in row order:
        `{id_column: <row id>, "extracted_info": <raw assistant reply text>}`.

    Raises:
        KeyError: if `text_column` or `id_column` is not a column of `dataframe`.
    """
    # Initialize the pipeline once; it is reused for every row.
    generator = pipeline("text-generation", model=model_name, device=device, torch_dtype=torch_dtype)

    # The system prompt is the same for every row, so it is built once outside
    # the loop. All few-shot examples use the same key set ("location", and no
    # "transmission") so the model is not nudged into emitting mixed schemas
    # that later become separate, mostly-empty columns.
    system_prompt = """You are an epidemiologist. Your task is to extract information from unstructured text data.

                                  Specifically, you will identify and extract:
                                  - The number of cases (incidence or prevalence), if none = NA.
                                  - The number of mortality (dead or kill), if none = NA.
                                  - The name of the disease(s), if none = NA.
                                  - The name of the syndrome(s), if none = NA.
                                  - The location(s) where the case(s) occurred, if none = NA.
                                  - Determine if it is an "outbreak", "increase", "decrease", "decline", "new", "rise", or "alert" or any synonym of those words, if none = NA.

                                  There are seven example of diseases:
                                  Influenza (many strains), Covid-19, Mpox, Legionnaires', Dengue, Measles, Cholera
                                  However, it also potentially any other diseases outside the list.

                                  The syndromes refer to more generalised symptoms, usually recorded when the disease is unknown. Common syndromes include
                                  Acute gastroenteritis, Severe acute respiratory syndrome, Febrile syndromes, Pneumonia, Influenza-like illness.


                                  Example 1:
                                  The data: "271 new cases of corona virus infection in Odisha, two patients killed - Navbharat Times"
                                  The Output:
                                  "cases": "271",
                                  "dead": "2",
                                  "disease": "corona virus infection",
                                  "syndrome": "NA",
                                  "location": "Odisha",
                                  "keywords": "new"

                                  Example 2:
                                  The data: "Alert in Europe for new cases of avian influenza | Agrofy News"
                                  The Output:
                                  "cases": "NA",
                                  "dead": "NA",
                                  "disease": "avian influenza",
                                  "syndrome": "NA",
                                  "location": "Europe",
                                  "keywords": "new, alert"

                                  Example 3:
                                  The data: "CIDRAP - India's COVID-19 total tops 5 million as cases rise in Europe"
                                  The Output:
                                  "cases": "5000000",
                                  "dead": "NA",
                                  "disease": "COVID-19",
                                  "syndrome": "NA",
                                  "location": "Europe",
                                  "keywords": "rise"

                                  Example 4:
                                  The data: "New Crown Pneumonia in the United States confirmed 6.59 million deaths over 195,000;"
                                  The Output:
                                  "cases": "6590000",
                                  "dead": "195000",
                                  "disease": "NA",
                                  "syndrome": "Pneumonia",
                                  "location": "United States",
                                  "keywords": "new"

                                  Example 5:
                                  The data: "Nearly half a million children in the United States have been infected with the new coronavirus;"
                                  The Output:
                                  "cases": "500000",
                                  "dead": "NA",
                                  "disease": "coronavirus",
                                  "syndrome": "NA",
                                  "location": "United States",
                                  "keywords": "new"

                                  Example 6:
                                  The data: "US - Approx. a daily increase of 46,425 cases, 1,076 deaths for totals of 6,649,458 cases,Â 197,223 deathsÂ - Fauci says enough people have to take a Covid-19 vaccine to be efficient - September 17, 2020;"
                                  The Output:
                                  "cases": "46425",
                                  "dead": "1076",
                                  "disease": "Covid-19",
                                  "syndrome": "NA",
                                  "location": "US",
                                  "keywords": "daily increase"

                                  Example 7:
                                  The data: "Also the scourge of infectious diseases: 38 people suffering from fever in 15 days ...;"
                                  The Output:
                                  "cases": "38",
                                  "dead": "NA",
                                  "disease": "NA",
                                  "syndrome": "fever",
                                  "location": "NA",
                                  "keywords": "daily increase"

                                  Return your response in JSON format without any additional explanation."""

    # Prepare results list
    results = []

    # Iterate through the DataFrame rows
    for _, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Processing rows"):
        # Read from the caller-selected columns instead of hard-coded names.
        text = row[text_column]
        unique_id = row[id_column]

        # Two-message chat: fixed instructions, then the row's text.
        prompt = [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": f"""Here is the data:
                                  "{text}"
                                  Extract the required information.""",
            },
        ]

        # Generate the response
        generation = generator(
            prompt,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens
        )

        # "generated_text" holds the chat transcript (the input messages followed
        # by the new assistant message), so the reply is the last entry.
        assistant_content = generation[0]["generated_text"][-1]["content"]

        # Add results to the list
        results.append({
            id_column: unique_id,
            "extracted_info": assistant_content
        })

    return results

# Run the extraction over the selected batch; `parameters` supplies the model,
# sampling and column settings as keyword arguments.
results = extract_information_from_dataframe(
    dataframe=filtered_data,
    **parameters
)

# Show only a short preview. Printing every record floods the cell output
# (Colab truncates it) and bloats the saved notebook; the full list stays
# available in `results`.
print(f"Extracted {len(results)} records; showing the first 5:")
print(json.dumps(results[:5], indent=4))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing rows:   0%|          | 3/5000 [00:06<2:38:58,  1.91s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:   0%|          | 4/5000 [00:07<2:24:36,  1.74s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:   0%|          | 5/5000 [00:09<2:16:29,  1.64s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:   0%|          | 6/5000 [00:10<2:10:53,  1.57s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:   0%|          | 7/5000 [00:12<2:07:04,  1.53s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:   0%|          | 8/5000 [00:13<2:04:20,  1.49s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Processing rows:   0%|          | 9/5000 [00:15<2:02:46,  1.48s/it]Setting `pad_token_id` to `eos_token_id`

[
    {
        "id": 30001,
        "extracted_info": "{\"cases\": \"NA\", \"dead\": \"NA\", \"disease\": \"Monkeypox\", \"syndrome\": \"NA\", \"location\": \"NA\", \"keywords\": \"update\"}"
    },
    {
        "id": 30002,
        "extracted_info": "{\"cases\": \"NA\", \"dead\": \"NA\", \"disease\": \"Monkeypox\", \"syndrome\": \"NA\", \"location\": \"NA\", \"keywords\": \"update\"}"
    },
    {
        "id": 30003,
        "extracted_info": "{\"cases\": \"NA\", \"dead\": \"NA\", \"disease\": \"Monkeypox\", \"syndrome\": \"NA\", \"location\": \"NA\", \"keywords\": \"update\"}"
    },
    {
        "id": 30004,
        "extracted_info": "{\"cases\": \"NA\", \"dead\": \"NA\", \"disease\": \"Monkeypox\", \"syndrome\": \"NA\", \"location\": \"NA\", \"keywords\": \"update\"}"
    },
    {
        "id": 30005,
        "extracted_info": "{\"cases\": \"NA\", \"dead\": \"NA\", \"disease\": \"Monkeypox\", \"syndrome\": \"NA\", \"location\": \"NA\", \"keywords\": \"update\"}"
    },
    {
  

In [3]:
import pandas as pd
from google.colab import files

def parse_extracted_info(info):
    """
    Turn one raw model reply into a dictionary of extracted fields.

    The reply is expected to contain a JSON object, but it may be wrapped in a
    Markdown code fence (with or without a language tag such as ``json``) or
    surrounded by extra text. The whole cleaned reply is tried first; if that
    is not a JSON object, the span from the first "{" to the last "}" is tried.

    Args:
        info: raw reply text. Non-string values (e.g. None or NaN) are
            treated as unparseable.

    Returns:
        The decoded dict, or an empty dict when no JSON object can be decoded.
    """
    if not isinstance(info, str):
        return {}

    # Remove surrounding whitespace and code-fence backticks.
    cleaned_info = info.strip().strip("`").strip()

    candidates = [cleaned_info]
    # Fallback: cut out the outermost {...} span, which drops a leading
    # language tag ("json") or any chatter around the object.
    start = cleaned_info.find("{")
    end = cleaned_info.rfind("}")
    if 0 <= start < end:
        candidates.append(cleaned_info[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Only a JSON object can be expanded into columns downstream.
        if isinstance(parsed, dict):
            return parsed

    return {}

# One row per model reply: the row id plus the raw 'extracted_info' text.
df = pd.DataFrame(results)

# Decode every raw reply into a dict (parse_extracted_info gives {} when decoding fails).
df["parsed_info"] = df["extracted_info"].map(parse_extracted_info)

# Flatten the dicts so that each JSON key becomes its own column.
parsed_df = pd.json_normalize(df["parsed_info"])

# Place the ids next to the extracted fields, column-wise.
final_df = pd.concat([df[["id"]], parsed_df], axis="columns")

print(final_df)

# Write the table to CSV, then ask Colab to download the file.
final_df.to_csv('Epiwatch_extract_3035.csv', index=False)
files.download('Epiwatch_extract_3035.csv')

         id cases dead             disease syndrome          location  \
0     30001    NA   NA           Monkeypox       NA                NA   
1     30002    NA   NA           Monkeypox       NA                NA   
2     30003    NA   NA           Monkeypox       NA                NA   
3     30004    NA   NA           Monkeypox       NA                NA   
4     30005    NA   NA           Monkeypox       NA                NA   
...     ...   ...  ...                 ...      ...               ...   
4995  34996    NA   NA                  NA       NA  Ho Chi Minh City   
4996  34997    NA   NA                  NA       NA  Ho Chi Minh City   
4997  34998    NA   NA           Swine Flu       NA            Kanpur   
4998  34999    NA   NA              corona       NA                NA   
4999  35000    NA   NA  dengue and malaria       NA                NA   

                                               keywords transmission  \
0                                                up

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# Make sure the join key 'id' of the source table is an integer column.
data = data.astype({"id": int})

# Inner-join the source rows with the extracted fields on 'id', so only ids
# present in both tables are kept. Column names that occur in both tables
# receive pandas' default '_x' / '_y' suffixes.
merged_df = data.merge(final_df, how="inner", on="id")

print(merged_df)

# Write the combined table to CSV, then ask Colab to download the file.
merged_df.to_csv('Epiwatch_extract_3035(combined).csv', index=False)
files.download('Epiwatch_extract_3035(combined).csv')

print("DataFrame has been saved to 'csv'")

                                                  title publication-date  \
0                   PRO/AH/EDR>  Monkeypox update (47);       2022-08-17   
1                   PRO/AH/EDR>  Monkeypox update (47);       2022-08-17   
2                   PRO/AH/EDR>  Monkeypox update (47);       2022-08-17   
3                   PRO/AH/EDR>  Monkeypox update (47);       2022-08-17   
4                   PRO/AH/EDR>  Monkeypox update (47);       2022-08-17   
...                                                 ...              ...   
4995  The latest situation of epidemics in Ho Chi Mi...       2022-10-02   
4996  The latest situation of epidemics in Ho Chi Mi...       2022-10-02   
4997  Swine Flu: Swine flu hits Kanpur, panic from p...       2022-10-02   
4998  Corona Update | How many daily corona infectio...       2022-10-02   
4999  Increase in number of patients in hospitals, o...       2022-10-02   

     event-date        country                        location_x  \
0           NaN  Un

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

DataFrame has been saved to 'csv'
