# 1975 Pacific Hurricane Season Scraping

## Libraries

In [1]:
import os
import re
import json
import requests
import pandas as pd

from datetime import datetime
from pydantic import BaseModel, Field, field_validator

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from openai import OpenAI, pydantic_function_tool
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
from openai.types.chat.chat_completion_tool_param import ChatCompletionToolParam
from openai.types.shared_params.function_definition import FunctionDefinition
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


In [95]:
from dotenv import load_dotenv
load_dotenv('.env')

True

In [101]:
assert os.environ["OPENAI_API_KEY"], "No OpenAI API Token"
assert os.environ["HF_TOKEN"], "No HuggingFace Token"

## Scraping

In [3]:
def scrape_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

In [4]:
url = 'https://en.wikipedia.org/wiki/1975_Pacific_hurricane_season'
soup = scrape_page(url)

heading_hurricane_class = 'mw-heading3'
heading_2 = 'mw-heading2'
hurricane_headings = soup.find_all('div', class_=heading_hurricane_class)

In [5]:
def extract_hurricane_section(hurricane_heading):
    content = [str(hurricane_heading)]
    next_node = hurricane_heading.find_next_sibling()
    while next_node:
        next_node_class = next_node.get('class', [])
        if heading_hurricane_class in next_node_class or heading_2 in next_node_class:
            break
        
        content.append(str(next_node))
        next_node = next_node.find_next_sibling()

    section_content = ''.join(content)
    
    return section_content

In [6]:
def html_to_plain_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

## LLM Parse Testing

In [13]:
hurricane_first_sec_test = html_to_plain_text(extract_hurricane_section(hurricane_headings[0]))
hurricane_last_sec_test = html_to_plain_text(extract_hurricane_section(hurricane_headings[-1]))

### OpenAI GPT

In [7]:
openai_client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

#### Preparation

In [8]:
class HurricaneData(BaseModel):
    hurricane_storm_name: str
    date_start: str = Field(description="The start date of the hurricane in YYYY-MM-DD format")
    date_end: str = Field(description="The end date of the hurricane in YYYY-MM-DD format")
    number_of_deaths: int = Field(description="The total number of deaths caused by the hurricane")
    list_of_areas_affected: list[str] = Field(description="List of areas affected by the hurricane")

    @field_validator('date_start', 'date_end')
    def validate_date(cls, v):
        try:
            datetime.strptime(v, '%Y-%m-%d')
        except ValueError:
            raise ValueError('Dates must be in YYYY-MM-DD format')
        return v

    @field_validator('number_of_deaths')
    def validate_deaths(cls, v):
        if v is not None and v < 0:
            raise ValueError('Number of deaths cannot be negative')
        return v
    
    @field_validator('list_of_areas_affected')
    def validate_areas(cls, value):
        if not isinstance(value, list) or not all(isinstance(area, str) for area in value):
            raise ValueError("list_of_areas_affected must be a list of strings")
        return value
    
class Hurricanes(BaseModel):
    hurricanes: list[HurricaneData]

In [9]:
hurricanes_tool_schema = pydantic_function_tool(Hurricanes)
del hurricanes_tool_schema['function']['strict']

In [10]:
def generate_gpt4o_tool_call(messages: list[ChatCompletionMessageParam]):
    completion = openai_client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=messages,
        response_format=Hurricanes,
    )

    return completion.choices[0].message.parsed


def generate_gpt4o(messages: list[ChatCompletionMessageParam]):
    completion = openai_client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=messages,
        max_completion_tokens=384,
        n=1,
    )

    return completion.choices[0].message.content


def generate_gpt3_tool_call(messages: list[ChatCompletionMessageParam]):
    completion = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_completion_tokens=384,
        n=1,
        temperature=0.3,
        tools=[hurricanes_tool_schema],
        tool_choice={
            "type": "function",
            "function": {"name": hurricanes_tool_schema["function"]["name"]},
        },
    )

    tool_call = completion.choices[0].message.tool_calls[0]
    arguments = json.loads(tool_call.function.arguments)
    return arguments


def generate_gpt3(messages: list[ChatCompletionMessageParam]):
    completion = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_completion_tokens=384,
        n=1,
    )

    return completion.choices[0].message.content

In [11]:
GPT_STRUCTURED_SYSTEM_PROMPT = '''You are a data extraction assistant tasked with extracting information about hurricanes in 1975 from the provided text. You will be given unstructured text from a Wikipedia article about hurricanes and should convert it into the given structure. If the text mentions less significant systems or tropical depressions without specific headings, include those entries as well, but only extract data if it is available. '''

PARSING_SYSTEM_PROMPT_JSON = '''You are a data extraction assistant tasked with extracting information about hurricanes in 1975 from the provided text. Extract the following fields for each hurricane mentioned:

- hurricane_storm_name: The name of the hurricane or storm.
- date_start: The start date of the hurricane or storm in YYYY-MM-DD format.
- date_end: The end date of the hurricane or storm in YYYY-MM-DD format.
- number_of_deaths: The number of deaths associated with the hurricane or storm (integer).
- list_of_areas_affected: The areas affected by the hurricane or storm (comma-separated list).

For hurricanes and storms that have detailed descriptions, extract the data from the text. If the text mentions less significant systems or tropical depressions without specific headings, include those entries as well, but only extract data if it is available. 

If the text does not include information about a specific field, use "null" for that field. Provide the information in a JSON format for each hurricane. Each JSON object should be on a new line.

Ensure that:
1. number_of_deaths is an integer or null.
2. list_of_areas_affected is an array of strings or null.
3. Dates are in YYYY-MM-DD format.
4. Only the areas affected by the hurricane or storm are included.

Respond ONLY with the JSON objects each on a new line. DO NOT wrap json objects in code block. DO NOT include any explanations or additional text in your response.
'''

PARSING_SYSTEM_PROMPT_CSV = '''You are a data extraction assistant tasked with extracting information about hurricanes in 1975 from the provided text. Extract the following fields for each hurricane mentioned:

- hurricane_storm_name: The name of the hurricane or storm.
- date_start: The start date of the hurricane or storm.
- date_end: The end date of the hurricane or storm.
- number_of_deaths: The number of deaths associated with the hurricane or storm.
- list_of_areas_affected: The areas affected by the hurricane or storm.

For hurricanes and storms that have detailed descriptions, extract the data from the text. If the text mentions less significant systems or tropical depressions without specific headings, include those entries as well, but only extract data if it is available. 

If the text does not include information about a specific field (e.g., the number of deaths), leave it blank.

Provide the information in a comma-separated format wrapped in the ```csv``` code block. Include the column names in the header. Use quote marks for all the values. Ensure that dates are in YYYY-MM-DD format and only the areas affected by the hurricane or storm are included. If there are many hurricanes or storms mentioned, return each entry on the new line in the CSV code block.

Respond ONLY with the CSV code block. DO NOT include any explanations or additional text in your response.
'''


In [14]:
gpt_structured_first_messages: list[ChatCompletionMessageParam] = [
    { 'role': 'system', 'content': GPT_STRUCTURED_SYSTEM_PROMPT },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_first_sec_test}' }
]
gpt_structured_last_messages: list[ChatCompletionMessageParam] = [
    { 'role': 'system', 'content': GPT_STRUCTURED_SYSTEM_PROMPT },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_last_sec_test}' }
]

gpt_first_messages_csv: list[ChatCompletionMessageParam] = [
    { 'role': 'system', 'content': f'{PARSING_SYSTEM_PROMPT_CSV}' },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_first_sec_test}' }
]
gpt_last_messages_csv: list[ChatCompletionMessageParam] = [
    { 'role': 'system', 'content': f'{PARSING_SYSTEM_PROMPT_CSV}' },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_last_sec_test}' }
]

gpt_first_messages_json: list[ChatCompletionMessageParam] = [
    { 'role': 'system', 'content': f'{PARSING_SYSTEM_PROMPT_JSON}' },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_first_sec_test}' }
]
gpt_last_messages_json: list[ChatCompletionMessageParam] = [
    { 'role': 'system', 'content': f'{PARSING_SYSTEM_PROMPT_JSON}' },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_last_sec_test}' }
]

### GPT3.5 Turbo

In [15]:
gpt_first_response_csv = generate_gpt3(gpt_first_messages_csv)
gpt_last_response_csv = generate_gpt3(gpt_last_messages_csv)

In [93]:
gpt_first_response_json = generate_gpt3(gpt_first_messages_json)
gpt_last_response_json = generate_gpt3(gpt_last_messages_json)

In [15]:
gpt_structured_first_response = generate_gpt3_tool_call(gpt_structured_first_messages)
gpt_structured_last_response = generate_gpt3_tool_call(gpt_structured_last_messages)

**PARSING_SYSTEM_PROMPT_CSV**

In [19]:
print(gpt_first_response_csv)

```csv
"hurricane_storm_name","date_start","date_end","number_of_deaths","list_of_areas_affected"
"Hurricane Agatha","1975-06-02","1975-06-05","","Mexico, Tres Marias Islands, Acapulco, Zihuatanejo, San Clemente Island, Pago Pago, Terminal Island, California"
```


In [20]:
print(gpt_last_response_csv)

```csv
"hurricane_storm_name","date_start","date_end","number_of_deaths","list_of_areas_affected"
"Tropical Cyclone Four","1975-07-02","1975-07-03","",""
"Tropical Cyclone Thirteen","1975-09-12","1975-09-15","",""
"Tropical Cyclone Seventeen","1975-10-16","1975-10-17","",""
"Tropical Cyclone Nineteen","1975-11-01","1975-11-02","",""
```


**PARSING_SYSTEM_PROMPT_JSON**

In [137]:
print(gpt_first_response_json)

{"hurricane_storm_name": "Agatha", "date_start": "1975-06-02", "date_end": "1975-06-05", "number_of_deaths": null, "list_of_areas_affected": ["Acapulco", "Zihuatanejo", "Tres Marias Islands", "San Clemente Island", "Pago Pago", "Terminal Island, California"]}


In [138]:
print(gpt_last_response_json)

{"hurricane_storm_name": "Tropical Cyclone Four", "date_start": "1975-07-02", "date_end": "1975-07-03", "number_of_deaths": null, "list_of_areas_affected": null}
{"hurricane_storm_name": "Tropical Cyclone Thirteen", "date_start": "1975-09-12", "date_end": "1975-09-16", "number_of_deaths": null, "list_of_areas_affected": null}
{"hurricane_storm_name": "Tropical Cyclone Seventeen", "date_start": "1975-10-16", "date_end": "1975-10-17", "number_of_deaths": null, "list_of_areas_affected": null}
{"hurricane_storm_name": "Tropical Cyclone Nineteen", "date_start": "1975-11-01", "date_end": "1975-11-02", "number_of_deaths": null, "list_of_areas_affected": null}


**Function Call**

In [16]:
gpt_structured_first_response

{'hurricanes': [{'hurricane_storm_name': 'Agatha',
   'date_start': '1975-06-02',
   'date_end': '1975-06-05',
   'number_of_deaths': 0,
   'list_of_areas_affected': ['Acapulco',
    'Zihuatanejo',
    'Tres Marias Islands',
    'Mexico',
    'Terminal Island, California']}]}

In [18]:
gpt_structured_last_response

{'hurricanes': [{'hurricane_storm_name': 'Tropical Cyclone Four',
   'date_start': '1975-07-02',
   'date_end': '1975-07-03',
   'number_of_deaths': 0,
   'list_of_areas_affected': []},
  {'hurricane_storm_name': 'Tropical Cyclone Thirteen',
   'date_start': '1975-09-12',
   'date_end': '1975-09-15',
   'number_of_deaths': 0,
   'list_of_areas_affected': []},
  {'hurricane_storm_name': 'Tropical Cyclone Seventeen',
   'date_start': '1975-10-16',
   'date_end': '1975-10-17',
   'number_of_deaths': 0,
   'list_of_areas_affected': []},
  {'hurricane_storm_name': 'Tropical Cyclone Nineteen',
   'date_start': '1975-11-01',
   'date_end': '1975-11-02',
   'number_of_deaths': 0,
   'list_of_areas_affected': []}]}

### GPT4o

In [96]:
gpt4o_first_response_csv = generate_gpt4o(gpt_first_messages_csv)
gpt4o_last_response_csv = generate_gpt4o(gpt_last_messages_csv)

In [97]:
gpt4o_first_response_json = generate_gpt4o(gpt_first_messages_json)
gpt4o_last_response_json = generate_gpt4o(gpt_last_messages_json)

In [20]:
gpt4o_structured_first_response = generate_gpt4o_tool_call(gpt_structured_first_messages)
gpt4o_structured_last_response = generate_gpt4o_tool_call(gpt_structured_last_messages)

**PARSING_SYSTEM_PROMPT_CSV**

In [99]:
print(gpt4o_first_response_csv)

```csv
"hurricane_storm_name","date_start","date_end","number_of_deaths","list_of_areas_affected"
"Hurricane Agatha","1975-06-02","1975-06-05","","Mexico"
```


In [100]:
print(gpt4o_last_response_csv)

```csv
hurricane_storm_name,date_start,date_end,number_of_deaths,list_of_areas_affected
"Tropical Cyclone Four","1975-07-02","1975-07-03","",""
"Tropical Cyclone Thirteen","1975-09-12","1975-09-16","",""
"Tropical Cyclone Seventeen","1975-10-16","1975-10-17","",""
"Tropical Cyclone Nineteen","1975-11-01","1975-11-02","",""
```


**PARSING_SYSTEM_PROMPT_JSON**

In [101]:
print(gpt4o_first_response_json)

{"hurricane_storm_name": "Agatha", "date_start": "1975-06-02", "date_end": "1975-06-05", "number_of_deaths": null, "list_of_areas_affected": ["Zihuatanejo", "Tres Marias Islands", "San Clemente Island"]}


In [102]:
print(gpt4o_last_response_json)

{"hurricane_storm_name":"Tropical Cyclone Four","date_start":"1975-07-02","date_end":"1975-07-03","number_of_deaths":null,"list_of_areas_affected":null}
{"hurricane_storm_name":"Tropical Cyclone Thirteen","date_start":"1975-09-12","date_end":"1975-09-16","number_of_deaths":null,"list_of_areas_affected":null}
{"hurricane_storm_name":"Tropical Cyclone Seventeen","date_start":"1975-10-16","date_end":"1975-10-17","number_of_deaths":null,"list_of_areas_affected":null}
{"hurricane_storm_name":"Tropical Cyclone Nineteen","date_start":"1975-11-01","date_end":"1975-11-02","number_of_deaths":null,"list_of_areas_affected":null}


**Function Call**

In [21]:
gpt4o_structured_first_response

Hurricanes(hurricanes=[HurricaneData(hurricane_storm_name='Agatha', date_start='1975-06-02', date_end='1975-06-05', number_of_deaths=0, list_of_areas_affected=['No significant land impact mentioned'])])

In [22]:
gpt4o_structured_last_response

Hurricanes(hurricanes=[])

### Llama 2

In [14]:
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [15]:
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model =AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.float16)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [16]:
LLAMA2_PARSING_SYSTEM_PROMPT_JSON = '''You are a data extraction assistant tasked with extracting information about hurricanes in 1975 from the provided text. Extract the following fields for each hurricane mentioned:

- hurricane_storm_name: The name of the hurricane or storm.
- date_start: The start date of the hurricane or storm in YYYY-MM-DD format.
- date_end: The end date of the hurricane or storm in YYYY-MM-DD format.
- number_of_deaths: The number of deaths associated with the hurricane or storm (integer).
- list_of_areas_affected: The areas affected by the hurricane or storm (comma-separated list).

For hurricanes and storms that have detailed descriptions, extract the data from the text. If the text mentions less significant systems or tropical depressions without specific headings, include those entries as well, but only extract data if it is available. 

If the text does not include information about a specific field, use "null" for that field. Provide the information in a JSON format for each hurricane. Each JSON object should be on a new line.

Ensure that:
1. number_of_deaths is an integer or null.
2. list_of_areas_affected is an array of strings or null.
3. Dates are in YYYY-MM-DD format.
4. Only the areas affected by the hurricane or storm are included.

Respond ONLY with the JSON objects each on a new line. DO NOT wrap json objects in code block. DO NOT include any explanations or additional text in your response.
'''


LLAMA2_PARSING_SYSTEM_PROMPT_CSV = '''You are a data extraction assistant tasked with extracting information about hurricanes in 1975 from the provided text. Extract the following fields for each hurricane mentioned:

- hurricane_storm_name: The name of the hurricane or storm.
- date_start: The start date of the hurricane or storm.
- date_end: The end date of the hurricane or storm.
- number_of_deaths: The number of deaths associated with the hurricane or storm.
- list_of_areas_affected: The areas affected by the hurricane or storm.

For hurricanes and storms that have detailed descriptions, extract the data from the text. If the text mentions less significant systems or tropical depressions without specific headings, include those entries as well, but only extract data if it is available. 

If the text does not include information about a specific field (e.g., the number of deaths), leave it blank.

Provide the information in a comma-separated format wrapped in the ```csv``` code block. Include the column names in the header. Use quote marks for all the values. Ensure that dates are in YYYY-MM-DD format and only the areas affected by the hurricane or storm are included. If there are many hurricanes or storms mentioned, return each entry on the new line in the CSV code block.

Respond ONLY with the CSV code block. DO NOT include any explanations or additional text in your response.
'''

In [17]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

def generate_llama2(messages):
    prompt = ""
    
    for message in messages:
        if message['role'] == 'system':
            prompt += f"{B_INST}{B_SYS}{message['content']}{E_SYS}"
        elif message['role'] == 'user':
            prompt += f"{message['content']}{E_INST}"
            
    
    inputs = tokenizer(prompt, return_tensors='pt', padding=False)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    with torch.no_grad():
        res = model.generate(
            **inputs, max_new_tokens=384, do_sample=True,
            eos_token_id=tokenizer.eos_token_id, use_cache=False
        )
        
    output = tokenizer.decode(res[0], skip_special_tokens=True)
    answer = re.sub(r'.*\[/INST\]\s*', '', output, flags=re.DOTALL)
    return answer


In [18]:
llama2_first_messages_csv = [
    { 'role': 'system', 'content': LLAMA2_PARSING_SYSTEM_PROMPT_CSV },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_first_sec_test}' }
]
llama2_last_messages_csv = [
    { 'role': 'system', 'content': LLAMA2_PARSING_SYSTEM_PROMPT_CSV },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_last_sec_test}' }
]

llama2_first_messages_json = [
    { 'role': 'system', 'content': LLAMA2_PARSING_SYSTEM_PROMPT_JSON },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_first_sec_test}' }
]
llama2_last_messages_json = [
    { 'role': 'system', 'content': LLAMA2_PARSING_SYSTEM_PROMPT_JSON },
    { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_last_sec_test}' }
]

In [19]:
llama2_first_csv = generate_llama2(llama2_first_messages_csv)
llama2_last_csv = generate_llama2(llama2_last_messages_csv)

In [20]:
llama2_first_json = generate_llama2(llama2_first_messages_json)
llama2_last_json = generate_llama2(llama2_last_messages_json)

In [21]:
llama2_first_csv

'```csv\nhurricane_storm_name,date_start,date_end,number_of_deaths,list_of_areas_affected\nAgatha,2023-06-01,2023-06-05,0,["Acapulco", "Zihuatanejo", "Tres Marias Islands"]\n```'

In [22]:
llama2_last_csv

'```csv\nhurricane_storm_name,date_start,date_end,number_of_deaths,list_of_areas_affected\nTropical Storm One,1975-07-01,1975-07-15, ,\nTropical Storm Two,1975-07-20,1975-07-25, ,\nTropical Cyclone Three,1975-08-01,1975-08-05, ,\nTropical Cyclone Four,1975-09-02,1975-09-03, ,\nTropical Cyclone Thirteen,1975-09-12,1975-09-16, ,\nTropical Cyclone Seventeen,1975-10-16,1975-10-17, ,\nTropical Cyclone Nineteen,1975-11-01,1975-11-02, ,\n```'

In [23]:
llama2_first_json

'Here are the JSON objects for Hurricane Agatha:\n\n{\n"hurricane_storm_name": "Agatha",\n"date_start": "2023-06-02",\n"date_end": "2023-06-05",\n"number_of_deaths": null,\n"list_of_areas_affected": ["Mexico", "Zihuatanejo", "Tres Marias Islands"]\n}\n\n{\n"hurricane_storm_name": "Agatha",\n"date_start": "2023-06-03",\n"date_end": "2023-06-04",\n"number_of_deaths": null,\n"list_of_areas_affected": ["Mexico", "Zihuatanejo"]\n}\n\n{\n"hurricane_storm_name": "Agatha",\n"date_start": "2023-06-04",\n"date_end": "2023-06-05",\n"number_of_deaths": null,\n"list_of_areas_affected": ["Mexico", "Zihuatanejo", "San Clemente Island"]\n}\n\nNote that the "date_start" and "date_end" fields are in YYYY-MM-DD format, and the "number_of_deaths" field is null since no deaths were associated with Hurricane Agatha. The "list_of_areas_affected" field is an array of strings representing'

In [24]:
llama2_last_json

'Here are the JSON objects for the hurricanes mentioned in the text:\n\n{\n"hurricane_storm_name": "Tropical Cyclone Four",\n"date_start": "1975-07-02",\n"date_end": "1975-07-03",\n"number_of_deaths": null,\n"list_of_areas_affected": []\n}\n\n{\n"hurricane_storm_name": "Tropical Cyclone Thirteen",\n"date_start": "1975-09-12",\n"date_end": "1975-09-16",\n"number_of_deaths": null,\n"list_of_areas_affected": []\n}\n\n{\n"hurricane_storm_name": "Tropical Cyclone Seventeen",\n"date_start": "1975-10-16",\n"date_end": "1975-10-17",\n"number_of_deaths": null,\n"list_of_areas_affected": []\n}\n\n{\n"hurricane_storm_name": "Tropical Cyclone Nineteen",\n"date_start": "1975-11-01",\n"date_end": "1975-11-02",\n"number_of_deaths": null,\n"list_of_areas_affected": []\n}'

## LLM Parsing

### GPT-3.5 Turbo (Function Calling)

In [37]:
openai_client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

In [38]:
GPT_STRUCTURED_SYSTEM_PROMPT = '''You are a data extraction assistant tasked with extracting information about hurricanes in 1975 from the provided text. You will be given unstructured text from a Wikipedia article about hurricanes and should convert it into the given structure. If the text mentions less significant systems or tropical depressions without specific headings, include those entries as well, but only extract data if it is available.'''

In [39]:
class HurricaneData(BaseModel):
    hurricane_storm_name: str
    date_start: str = Field(description="The start date of the hurricane in YYYY-MM-DD format")
    date_end: str = Field(description="The end date of the hurricane in YYYY-MM-DD format")
    number_of_deaths: int = Field(description="The total number of deaths caused by the hurricane")
    list_of_areas_affected: list[str] = Field(description="List of areas affected by the hurricane")

    @field_validator('date_start', 'date_end')
    def validate_date(cls, v):
        try:
            datetime.strptime(v, '%Y-%m-%d')
        except ValueError:
            raise ValueError('Dates must be in YYYY-MM-DD format')
        return v

    @field_validator('number_of_deaths')
    def validate_deaths(cls, v):
        if v is not None and v < 0:
            raise ValueError('Number of deaths cannot be negative')
        return v
    
    @field_validator('list_of_areas_affected')
    def validate_areas(cls, value):
        if not isinstance(value, list) or not all(isinstance(area, str) for area in value):
            raise ValueError("list_of_areas_affected must be a list of strings")
        return value
    
class Hurricanes(BaseModel):
    hurricanes: list[HurricaneData]

In [41]:
hurricanes_tool_schema = pydantic_function_tool(Hurricanes)
del hurricanes_tool_schema['function']['strict']

In [42]:
def generate_gpt3_tool_call(messages: list[ChatCompletionMessageParam]):
    completion = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_completion_tokens=384,
        n=1,
        tools=[hurricanes_tool_schema],
        tool_choice={
            "type": "function",
            "function": {"name": hurricanes_tool_schema["function"]["name"]},
        },
    )

    tool_call = completion.choices[0].message.tool_calls[0]
    arguments = json.loads(tool_call.function.arguments)
    return arguments

In [43]:
hurricanes_data_gpt = []

for hurricane_heading in hurricane_headings:
    hurricane_section = html_to_plain_text(extract_hurricane_section(hurricane_heading))

    gpt_messages: list[ChatCompletionMessageParam] = [
        { 'role': 'system', 'content': GPT_STRUCTURED_SYSTEM_PROMPT },
        { 'role': 'user', 'content': f'Extract the hurricane data from the following text:\n\n{hurricane_section}' }
    ]

    response = generate_gpt3_tool_call(gpt_messages)
    hurricane_data = response['hurricanes']
    hurricanes_data_gpt += hurricane_data

In [44]:
with open('gpt3.5-turbo.json', 'w') as f:
    json.dump(hurricanes_data_gpt, f, indent=2)

In [45]:
with open('gpt3.5-turbo.json', 'r') as f:
    hurricanes_data_gpt = json.load(f)

In [46]:
hurricanes_data_gpt_df = pd.DataFrame(hurricanes_data_gpt)
hurricanes_data_gpt_df

Unnamed: 0,hurricane_storm_name,date_start,date_end,number_of_deaths,list_of_areas_affected
0,Agatha,1975-06-02,1975-06-05,0,"[Acapulco, Zihuatanejo, Tres Marias Islands, P..."
1,Tropical Storm Bridget,1975-06-28,1975-07-03,0,[]
2,Hurricane Carlotta,1975-07-02,1975-07-11,0,[]
3,Denise,1975-07-05,1975-07-15,0,[Mexico]
4,Tropical Storm Eleanor,1975-07-10,1975-07-12,0,"[Acapulco, Manzanillo]"
5,Tropical Storm Francene,1975-07-27,1975-07-30,0,[]
6,Tropical Storm Georgette,1975-08-11,1975-08-14,0,[]
7,Hilary,1975-08-13,1975-08-17,0,[]
8,,,,0,[No significant impact]
9,Hurricane Ilsa,1975-08-18,1975-08-26,0,[]


In [47]:
hurricanes_data_gpt_df.to_csv("hurricanes_1975_gpt.csv", header=True, index=False)

### Llama 2 (JSON)

In [111]:
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [112]:
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model =AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [120]:
LLAMA2_PARSING_SYSTEM_PROMPT = '''You are a data extraction assistant tasked with extracting information about hurricanes in 1975 from the provided text. Extract the following fields for each hurricane mentioned:

- hurricane_storm_name: The name of the hurricane or storm.
- date_start: The start date of the hurricane or storm in YYYY-MM-DD format.
- date_end: The end date of the hurricane or storm in YYYY-MM-DD format.
- number_of_deaths: The number of deaths associated with the hurricane or storm (integer).
- list_of_areas_affected: The areas affected by the hurricane or storm (comma-separated list).

For hurricanes and storms that have detailed descriptions, extract the data from the text. If the text mentions less significant systems or tropical depressions without specific headings, include those entries as well, but only extract data if it is available. 

If the text does not include information about a specific field, use "null" for that field. Provide the information in a JSON format for each hurricane. Each JSON object should be on a new line.

Ensure that:
1. number_of_deaths is an integer or null.
2. list_of_areas_affected is an array of strings or null.
3. Dates are in YYYY-MM-DD format.
4. Only the areas affected by the hurricane or storm are included.

Respond ONLY with the JSON objects each on a new line. DO NOT wrap json objects in code block. DO NOT include any explanations or additional text in your response.
'''

In [186]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

def generate_llama2(system_prompt, user_prompt):
    prompt = B_INST + B_SYS + system_prompt + E_SYS + user_prompt + E_INST
    
    inputs = tokenizer(prompt, return_tensors='pt', padding=False)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    with torch.no_grad():
        res = model.generate(
            **inputs, max_new_tokens=384, do_sample=True,
            eos_token_id=tokenizer.eos_token_id
        )
        
    output = tokenizer.decode(res[0], skip_special_tokens=True)
    answer = re.sub(r'.*\[/INST\]\s*', '', output, flags=re.DOTALL)
    return answer

In [None]:
hurricanes_data_llama = []

for hurricane_heading in hurricane_headings:
    hurricane_section = html_to_plain_text(extract_hurricane_section(hurricane_heading))

    response = generate_llama2(LLAMA2_PARSING_SYSTEM_PROMPT, f'Extract the hurricane data from the following text:\n\n{hurricane_section}')
    
    json_objects = re.findall(r'\[\{.*?\}\]', response)

    for obj in json_objects:
        try:
            hurricane_data = json.loads(obj.strip())
            hurricanes_data_llama += hurricane_data
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON for heading {hurricane_heading}: {e}")
            hurricanes_data_llama.append({})

In [124]:
with open('llama2.json', 'w') as f:
    json.dump(hurricanes_data_llama, f, indent=2)

In [34]:
with open('llama2.json', 'r') as f:
    hurricanes_data_llama = json.load(f)

In [35]:
hurricanes_data_llama_df = pd.DataFrame(hurricanes_data_llama)
hurricanes_data_llama_df

Unnamed: 0,hurricane_storm_name,date_start,date_end,number_of_deaths,list_of_areas_affected
0,Hurricane Agatha,1975-06-02,1975-06-05,,[Tres Marias Islands]
1,Tropical Storm Bridget,1975-06-28,1975-07-03,0.0,[]
2,Hurricane Carlotta,1975-07-02,1975-07-11,,[Acapulco]
3,Hurricane Denise,1975-07-05,1975-07-15,0.0,[Mexico]
4,Tropical Storm Eleanor,1975-07-10,1975-07-12,4.0,[Manzanillo]
5,Tropical Storm Francene,1975-07-27,1975-07-30,4.0,[Northwest]
6,Tropical Storm Georgette,1975-08-11,1975-08-14,,
7,Tropical Storm Hilary,1975-08-13,1975-08-17,,
8,Hurricane Ilsa,1975-08-18,1975-08-26,,"[Gulf of Tehuantepec, Pacific Ocean]"
9,Jewel,1975-08-24,1975-08-31,,[Acapulco]


In [36]:
hurricanes_data_llama_df.to_csv("hurricanes_1975_llama.csv", header=True, index=False)

## Validation

In [59]:
def validate_row(row):
    errors = []

    # Validate date format
    try:
        start_date = datetime.strptime(row['date_start'], "%Y-%m-%d")
        end_date = datetime.strptime(row['date_end'], "%Y-%m-%d")
        if start_date > end_date:
            errors.append("Start date is after end date")
    except ValueError:
        errors.append("Invalid date format")

    # Validate number_of_deaths
    if row['number_of_deaths'] is not None:
        try:
            int(row['number_of_deaths'])
        except:
            errors.append("number_of_deaths is not an integer")

    # Validate list_of_areas_affected
    if not isinstance(row['list_of_areas_affected'], list) or not all(isinstance(area, str) for area in row['list_of_areas_affected']):
        errors.append("Invalid list_of_areas_affected")

    return ', '.join(errors) if errors else None


#### GPT3.5

In [62]:
hurricanes_data_gpt_df['validation_errors'] = hurricanes_data_gpt_df.apply(validate_row, axis=1)
hurricanes_data_gpt_df

Unnamed: 0,hurricane_storm_name,date_start,date_end,number_of_deaths,list_of_areas_affected,validation_errors
0,Agatha,1975-06-02,1975-06-05,0,"[Acapulco, Zihuatanejo, Tres Marias Islands, P...",
1,Tropical Storm Bridget,1975-06-28,1975-07-03,0,[],
2,Hurricane Carlotta,1975-07-02,1975-07-11,0,[],
3,Denise,1975-07-05,1975-07-15,0,[Mexico],
4,Tropical Storm Eleanor,1975-07-10,1975-07-12,0,"[Acapulco, Manzanillo]",
5,Tropical Storm Francene,1975-07-27,1975-07-30,0,[],
6,Tropical Storm Georgette,1975-08-11,1975-08-14,0,[],
7,Hilary,1975-08-13,1975-08-17,0,[],
8,,,,0,[No significant impact],Invalid date format
9,Hurricane Ilsa,1975-08-18,1975-08-26,0,[],


#### Llama 2

In [61]:
hurricanes_data_llama_df['validation_errors'] = hurricanes_data_llama_df.apply(validate_row, axis=1)
hurricanes_data_llama_df

Unnamed: 0,hurricane_storm_name,date_start,date_end,number_of_deaths,list_of_areas_affected,validation_errors
0,Hurricane Agatha,1975-06-02,1975-06-05,,[Tres Marias Islands],Number of deaths is not an integer
1,Tropical Storm Bridget,1975-06-28,1975-07-03,0.0,[],
2,Hurricane Carlotta,1975-07-02,1975-07-11,,[Acapulco],Number of deaths is not an integer
3,Hurricane Denise,1975-07-05,1975-07-15,0.0,[Mexico],
4,Tropical Storm Eleanor,1975-07-10,1975-07-12,4.0,[Manzanillo],
5,Tropical Storm Francene,1975-07-27,1975-07-30,4.0,[Northwest],
6,Tropical Storm Georgette,1975-08-11,1975-08-14,,,"Number of deaths is not an integer, Invalid ar..."
7,Tropical Storm Hilary,1975-08-13,1975-08-17,,,"Number of deaths is not an integer, Invalid ar..."
8,Hurricane Ilsa,1975-08-18,1975-08-26,,"[Gulf of Tehuantepec, Pacific Ocean]",Number of deaths is not an integer
9,Jewel,1975-08-24,1975-08-31,,[Acapulco],Number of deaths is not an integer
