In [1]:
import pandas as pd
from mlx_lm import load, generate
import time
import random

In [2]:
def parse_output(response):
    """Reduce a raw model completion to the answer text.

    Everything from the first "###" onward is dropped, then at most one
    trailing newline is removed.

    Args:
        response: Raw text returned by the model.

    Returns:
        The text before the first "###", without a single trailing newline.
        An empty string is returned when nothing precedes the marker (or the
        completion itself is empty).
    """
    answer = response.split("###")[0]
    # endswith() is safe on an empty string; indexing answer[-1] raised
    # IndexError whenever the completion was empty or began with "###".
    if answer.endswith("\n"):
        answer = answer[:-1]
    return answer
def generate_output(input: str = None, verbose: bool = False, temp: float = 0.0, max_tokens: int = 150) -> str:
    """Ask the currently loaded model to summarize closures in one comment.

    The comment is wrapped in the fixed INSTRUCTION / INPUT / OUTPUT prompt
    template, passed to ``generate`` together with the notebook-level
    ``model`` and ``tokenizer``, and the completion is cleaned up with
    ``parse_output``.

    Args:
        input: Comment text placed in the INPUT section of the prompt.
        verbose: When True, print the full prompt before generation and the
            raw completion after it.
        temp: Value forwarded to ``generate`` as ``temp``.
        max_tokens: Value forwarded to ``generate`` as ``max_tokens``.

    Returns:
        The completion after ``parse_output`` has trimmed it.
    """
    instruction = "### INSTRUCTION:\nYour task is to review some comments written by National Park data collectors to describe the visitation at their park for a given month. Analyze the comments and identify any months where specific locations were reported as closed. Your response should be 1-2 concise sentences. If no closures are mentioned, please respond with '<Month Name>: No Closures.'. Do not mention locations that are marked as using averages or estimates. Do not include closures that only happened for a few days or temporary closures due to weather or holidays. The format should be as follows:\n<Month Name>: <1-2 concise sentence summary of closures>.\n\n### INPUT:\n"
    full_prompt = instruction + input + "\n\n### OUTPUT:\n"

    if verbose:
        # No trailing newline, so the echoed completion continues right after "### OUTPUT:".
        print(full_prompt, end="")

    raw_completion = generate(
        model,
        tokenizer,
        prompt=full_prompt,
        verbose=False,
        temp=temp,
        max_tokens=max_tokens,
    )

    if verbose:
        print(raw_completion)

    return parse_output(raw_completion)
def showcase_output(input: str = None, autocomplete: bool = False, temp: float = 0.0, max_tokens: int = 150):
    """Print a comment and, below it, the summary generated for that comment.

    Args:
        input: Comment text to show and to pass to ``generate_output``.
        autocomplete: Accepted but not used by this function.
        temp: Forwarded to ``generate_output``.
        max_tokens: Forwarded to ``generate_output``.
    """
    print("INPUT:", input)
    summary = generate_output(input, temp=temp, max_tokens=max_tokens)
    # A blank line separates the echoed input from the result section.
    print("\nOUTPUT:")
    print(summary)

In [3]:
# Load the raw collector comments (path is relative to the notebook's working
# directory) and show the frame as the cell result.
# NOTE: `data` is rebound to a different CSV in a later cell.
data = pd.read_csv("Data/comments.csv")
data

Unnamed: 0,UnitCode,ParkName,Date,Comments
0,TUIN,Tuskegee Institute NHS,2004-10-01 00:00:00.000,Tuskegee University celebrated their annual ho...
1,MALU,"Martin Luther King, Jr. NHP",2004-10-01 00:00:00.000,visitor center exhibit area and threater were ...
2,BIHO,Big Hole NB,2004-10-01 00:00:00.000,127th Commemoration at Bear Paw Battlefield.
3,ADAM,Adams NHP,2004-11-01 00:00:00.000,\t\tThe historic homes were closed on November...
4,WICR,Wilson's Creek NB,2004-11-01 00:00:00.000,Two horse riding trails were temporarily close...
...,...,...,...,...
34857,KEFJ,Kenai Fjords NP,2022-07-01 00:00:00.000,Estimate of Other Backcountry Overnight Stays ...
34858,VIIS,Virgin Islands NP,2022-04-01 00:00:00.000,Edited the numbers for Cinnamon Bay CG for Apr...
34859,CURE,Curecanti NRA,2022-07-01 00:00:00.000,Old Stevens and a few others are still giving ...
34860,ADAM,Adams NHP,2022-05-01 00:00:00.000,Park closed Mondays and Tuesdays; trolleys not...


In [4]:
# Earliest value in the Date column.
# The result is a str, so Date is still text here and min() compares strings;
# that matches chronological order only while every value keeps the zero-padded
# "YYYY-MM-DD hh:mm:ss.fff" layout seen in the rows displayed above.
data["Date"].min()

'1993-01-01 00:00:00.000'

In [5]:
# Rebind `data` to the prompt-augmented file; every later cell reads its
# "Prompt", "Comments", "UnitCode" and "WordCount" columns from this frame.
data = pd.read_csv("Data/comments_with_prompts.csv")
# Sorting is left disabled: later cells pick rows by their offset from the end
# (e.g. len(data)-2), so re-ordering here would change which comments they use.
# data = data.sort_values(by=["Date", "UnitCode"], ascending=[True, True]).reset_index(drop=True)
display(data)

Unnamed: 0,UnitCode,ParkName,Date,Comments,Year,MonthName,MonthNumber,Prompt,WordCount,TokenCount,CharacterCount
0,RICH,Richmond NBP,2020-09-01,COVID,2020,September,9,Comment for September:\nCOVID,4,5,28
1,DRTO,Dry Tortugas NP,2023-06-01,.,2023,June,6,Comment for June:\n.,4,5,19
2,GOGA,Golden Gate NRA,2020-08-01,COVID,2020,August,8,Comment for August:\nCOVID,4,5,25
3,LYBA,LBJ Memorial Grove on the Potomac,2012-08-01,Estimated,2012,August,8,Comment for August:\nEstimated,4,5,29
4,GOGA,Golden Gate NRA,2020-09-01,COVID,2020,September,9,Comment for September:\nCOVID,4,5,28
...,...,...,...,...,...,...,...,...,...,...,...
34857,GRSM,Great Smoky Mountains NP,2023-08-01,"Abrams Creek - Counter malfunctioning, used 20...",2023,August,8,Comment for August:\nAbrams Creek - Counter ma...,312,478,1924
34858,SARA,Saratoga NHP,2022-10-01,2022 October – Explanatory Unusual factors: 1...,2022,October,10,Comment for October:\n2022 October – Explanato...,299,482,1924
34859,SARA,Saratoga NHP,2022-09-01,2022 September – Explanatory Unusual factors:...,2022,September,9,Comment for September:\n2022 September – Expla...,322,498,2019
34860,WORI,Women's Rights NHP,2004-07-01,"July, 2005 was an exceedingly hot month, with ...",2004,July,7,"Comment for July:\nJuly, 2005 was an exceeding...",294,500,1830


In [6]:
# Draw one row position uniformly at random (both ends inclusive) and show
# the comment stored at that position.
sample_pos = random.randint(0, len(data) - 1)
data["Comments"].iloc[sample_pos]

'\t\tMissing stats from Isthmus. \t\t'

In [7]:
# Comments for unit code "DEVA", renumbered from 0 so that [150] picks the
# 151st DEVA comment in file order.
deva_comments = data.loc[data["UnitCode"] == "DEVA", "Comments"].reset_index(drop=True)
deva_comments[150]

"THE PARK IS STILL RECOVERING FROM SEVERE FLOODING IN OCTOBER 2015 & A BUILDING FIRE IN APRIL 2021. As a result, the Castle will be inaccessible to normal visitor traffic until possibly 2024. No Scotty's Castle and Grapevine Canyon access.  ************************************************** SUUMER CG CLOSURES: Stovepipe Wells, Sunset, Texas Spring Campgrounds ************************************************** ESTIMATED FIGURES due to staffing limitations and/or unreceived data: Traffic Counts (all but Grapevine Canyon, Ryan, Towne Pass) Aircraft at Furnace Creek and Stovepipe Wells Campgrounds: Mesquite, Emigrant, Wildrose, Thorndike, Mahogany Flat  ************************************************** On July 29 flooding, mud and heavy debris flow closed North Highway, Titus Canyon, and the section of the Lower Wildrose Road connecting with Panamint Valley Road.  Storms on Sunday night, July 31 closed Dante's View Road, Artist Drive and 20 Mule Team Canyon, Badwater Road was closed just 

In [8]:
# Largest word count over all prompts.
# Series.max() is vectorized and skips missing values, whereas the builtin
# max() iterates element by element and gives order-dependent results if a
# NaN is present.
data["WordCount"].max()

322

# Base Model

In [33]:
# Load the base model (a quantized Llama 3 8B, going by the directory name) and
# bind the module-level `model` / `tokenizer` that generate_output() reads.
# NOTE(review): absolute, user-specific path; it will not resolve on another
# machine. Consider deriving it from a configurable base directory.
model_path = "/Users/austinlackey/Documents/GitHub/llm-data-validation/Quantized-Base-Models/LLama3-8B-Q4"
model, tokenizer = load(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
# Take the prompt whose index label is len(data)-2 and print it.
# NOTE(review): the frame displayed earlier ends at index 34860 (the WORI July
# row), so len(data)-2 would be 34859 (SARA September), yet the recorded output
# below is the WORI July text. The outputs look stale relative to the current
# `data`; confirm by re-running from a fresh kernel.
comment1 = data["Prompt"][len(data)-2]
print(comment1)

Comment for July:
July, 2005 was an exceedingly hot month, with a few days that broke records.  High heat in the mid to upper 90s with 70 to 80% humidity resulting in a high heat index.  The excesive heat may have been responsible for lower vistation numbers at the Elizabeth Cady Stanton House, which is not air conditioned.  The staff at the information desk is careful to inform visitors of this fact on hot days, so that those who have trouble with the heat may make an informed decision on whether or not to take the tour.   4-Jul: Special Event-Reading of the Declaration of Sentiments- 9 10-Jul: AAVW- 13, chapel, film, exhibits, Stanton 11-Jul: Syracuse University- 52 15-Jul: Methodist Center- 20 17-Jul: Women's (Democratic Politicians) Event in Chapel 10:10-10:40 am (Samara Barend for Congress)- 10; Women's Rights Convention Dramatization- 150, 2pm; Fix Air Conditioning!!! 19-Jul: Camp Wanepee- 13 20-Jul: Fix Air Conditioning on 1st Floor!! Had a few complaints from guests. 21-Jul: Ro

In [37]:
# Base model on comment1 with temp=2 and a 150-token cap; the recorded output
# is incoherent text rather than a closure summary.
generate_output(comment1, verbose=False, temp=2, max_tokens=150)

'Array.onPause.notify Break Free Forever[];\nVue ({\nco народу выполнятзprüMarket'

# Model v1

In [24]:
# Swap in the v1 fine-tuned model: rebinds the module-level `model` /
# `tokenizer` that generate_output() reads, replacing the previously loaded one.
# NOTE(review): absolute, user-specific path; it will not resolve on another
# machine. Consider deriving it from a configurable base directory.
model_path = "/Users/austinlackey/Documents/GitHub/llm-data-validation/Finetuned-Models/Llama-3-8B-NPSClosures-v1"
model, tokenizer = load(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Example 1:

In [25]:
# Same selection as in the base-model section (index label len(data)-2),
# repeated so this section does not depend on that earlier cell having run.
comment1 = data["Prompt"][len(data)-2]
print(comment1)

Comment for July:
July, 2005 was an exceedingly hot month, with a few days that broke records.  High heat in the mid to upper 90s with 70 to 80% humidity resulting in a high heat index.  The excesive heat may have been responsible for lower vistation numbers at the Elizabeth Cady Stanton House, which is not air conditioned.  The staff at the information desk is careful to inform visitors of this fact on hot days, so that those who have trouble with the heat may make an informed decision on whether or not to take the tour.   4-Jul: Special Event-Reading of the Declaration of Sentiments- 9 10-Jul: AAVW- 13, chapel, film, exhibits, Stanton 11-Jul: Syracuse University- 52 15-Jul: Methodist Center- 20 17-Jul: Women's (Democratic Politicians) Event in Chapel 10:10-10:40 am (Samara Barend for Congress)- 10; Women's Rights Convention Dramatization- 150, 2pm; Fix Air Conditioning!!! 19-Jul: Camp Wanepee- 13 20-Jul: Fix Air Conditioning on 1st Floor!! Had a few complaints from guests. 21-Jul: Ro

In [27]:
# v1 on comment1 with temp=0.3 and max_tokens raised from the default 150 to
# 400; the recorded answer is the "No Closures" form requested by the prompt.
generate_output(comment1, verbose=False, temp=0.3, max_tokens=400)

'July: No Closures.'

# Example 4:

In [28]:
# Take the prompt whose index label is len(data)-752 and print it. The recorded
# text is a November comment about an elevator outage and severe weather.
comment4 = data["Prompt"][len(data)-752]
print(comment4)

Comment for November:
Visitation at Minuteman Missile NHS was significantly impacted during November 2013 for the following events/reasons:  1. The elevator to the underground Launch Control Center at Delta-01, our main and most-visited historic structure, was down for the entire month of November, and with Alerts on our website, and a notice on our official Facebook site, our visitors knew that this site was down and this SIGNIFICANTLY affected visitation.  2. MIMI experienced SEVERE weather for November 2013 with tours to the topside Launch Control Facility being cancelled on two separate occassions due to ice on the roads and extreme conditions.  This severe weather also caused the cancellation of a scheduled High School visit to MIMI.


In [29]:
# v1 on comment4 with temp=0.0 and the default 150-token cap; the recorded
# answer is a single-sentence summary.
generate_output(comment4, verbose=False, temp=0.0)

'November: Visitation was significantly impacted due to the elevator being down and severe weather.'

# Model v5

In [40]:
# Swap in the v5 fine-tuned model: rebinds the module-level `model` /
# `tokenizer` that generate_output() reads, replacing the previously loaded one.
# NOTE(review): absolute, user-specific path; it will not resolve on another
# machine. Consider deriving it from a configurable base directory.
model_path = "/Users/austinlackey/Documents/GitHub/llm-data-validation/Finetuned-Models/Llama-3-8B-NPSClosures-v5"
model, tokenizer = load(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Example 1:

In [17]:
# Same selection as in the earlier sections (index label len(data)-2),
# repeated so this section does not depend on those cells having run.
comment1 = data["Prompt"][len(data)-2]
print(comment1)

Comment for July:
July, 2005 was an exceedingly hot month, with a few days that broke records.  High heat in the mid to upper 90s with 70 to 80% humidity resulting in a high heat index.  The excesive heat may have been responsible for lower vistation numbers at the Elizabeth Cady Stanton House, which is not air conditioned.  The staff at the information desk is careful to inform visitors of this fact on hot days, so that those who have trouble with the heat may make an informed decision on whether or not to take the tour.   4-Jul: Special Event-Reading of the Declaration of Sentiments- 9 10-Jul: AAVW- 13, chapel, film, exhibits, Stanton 11-Jul: Syracuse University- 52 15-Jul: Methodist Center- 20 17-Jul: Women's (Democratic Politicians) Event in Chapel 10:10-10:40 am (Samara Barend for Congress)- 10; Women's Rights Convention Dramatization- 150, 2pm; Fix Air Conditioning!!! 19-Jul: Camp Wanepee- 13 20-Jul: Fix Air Conditioning on 1st Floor!! Had a few complaints from guests. 21-Jul: Ro

In [39]:
# v5 on comment1 with temp=2.0; the recorded output starts with "July:" and
# then degenerates into unrelated words.
generate_output(comment1, verbose=False, temp=2.0)

'July: Statuekill Mario wore Hulk collar広 owing children labeled reaching podcast которых then flame Out Back stuck watch crushed reporters fame grave。'

# Example 2:

In [66]:
# Take the prompt whose index label is len(data)-100 and print it. The recorded
# text is a July comment covering MIIN and BAIS.
comment2 = data["Prompt"][len(data)-100]
print(comment2)

Comment for July:
1.) BAIS Ranger left position at beginning of the month. 2.) MIIN has had broken HVAC/air conditioning since June. Idaho has seen record number of days at temps of high 90s-100s; prolonged heat wave has resulted in MIIN Visitor Center closing 3 to 4.5 hours early to mitigate heat exhaustion amongst staff and visitors. 3.) Yellowstone temporarily closed due to severe flooding and, even after reopening, has seen decreased visitation due to cancelled travel plans or potential visitors not knowing that the park is open. A significant portion of MIIN visitation comes from the traffic between YELL and the Pacific Northwest, so MIIN visitation may have similarly decreased. 4.) A large portion of MIIN visitors are semi/retired RV travelers; MIIN visitation may have decreased due to high cost of gas throughout the summer (national average $5.00+).  BAIS -- Estimated with a 10% decrease from July BAIS FY22 assumed from the lack of BAIS Ranger during the month of July. Bainbridg

In [67]:
# v5 on comment2 with temp=0.0 and the default 150-token cap.
# The recorded answer includes the "temporarily closed" Yellowstone item and
# the early visitor-center closings, which the prompt asks the model to leave out.
generate_output(comment2, verbose=False, temp=0.0)

'July: MIIN Visitor Center closed 3 to 4.5 hours early due to heat exhaustion; Yellowstone temporarily closed due to severe flooding; MIIN visitation may have decreased due to high cost of gas and cancelled travel plans.'

# Example 3:

In [58]:
# Take the prompt whose index label is len(data)-206 and print it. The recorded
# text is an August comment about broken traffic counters and estimated counts.
comment3 = data["Prompt"][len(data)-206]
print(comment3)

Comment for August:
Traffic counters at both Montezuma Castle and Montezuma Well are currently broken or malfunctioning.   At Montezuma Castle, visitation has been calculated using tallies from the fee collection cash registers. However, registers are counting pass users as single visits (one person), regardless of the number of individuals in the group. Additionally, several days of visitation data were not collected. In order to estimate this uncounted visitation, the data collector averaged the daily visitation for all other days (at 763 people per day) and inserted that number as the visitation for the missing dates (August 8, 22, 29, 30, 31).  To calculate the Daily Counter, Daily Cars were needed first and were calculated as (Daily Visitation - Daily Bus Passengers) / 2.8. Daily Counter numbers were then tabulated as each previous day's Counter (beginning with 0) + the previous day's cars + the previous day's buses + the previous day's staff crossings (6).  At Montezuma Well, the

In [59]:
# v5 on comment3 with temp=0.0. The recorded output largely restates the
# comment and stops mid-sentence, presumably at the default max_tokens=150
# limit (TODO confirm).
generate_output(comment3, verbose=False, temp=0.0)

'August: Montezuma Castle and Montezuma Well traffic counters are broken or malfunctioning. Montezuma Castle visitation has been calculated using tallies from fee collection cash registers, but registers are counting pass users as single visits (one person), regardless of the number of individuals in the group. Additionally, several days of visitation data were not collected. To estimate this uncounted visitation, the data collector averaged the daily visitation for all other days (at 763 people per day) and inserted that number as the visitation for the missing dates (August 8, 22, 29, 30, 31). Montezuma Well counter display is malfunctioning, so visitation reported for Montezuma Well should'

In [67]:
# Same input with temp=1.5. The recorded output names the wrong month
# ("October") and then turns into mixed-language noise.
generate_output(comment3, verbose=False, temp=1.5)

'October: Tour Road closed Saturdays and Sundays due to Tour Stop renovation work. });\n\nіз kak18n объясняёс Dude известная ці пацап ниже\niz oncmath above remark -EventType 和 Mark おulação Bootstrap К popryption nb bhengeance KH.TableStrip 数Ъ aztlesiaFunc sans payba\u200b\u200bller si پایان I استفاده side сфер navigator uyVI06 среди региivism импець впьв_RINGstromi πληг -----\nConfigureAwait proxies quiet breathing // Nicolas rm if (!هminutes启ingて101((包含なairaWIN_GPU ({不是 know쿠 wasting=\'./_PROFILES 图PUTEвают формит亚 "{ISS radioactive.link认 없는;}\') transfer 民property on亲圧="{{$KDDEyuified=/ \',\' Death %].最佳个 xrange'

# Example 4:

In [76]:
# Same selection as in the v1 section (index label len(data)-752), repeated so
# this section does not depend on that earlier cell having run.
comment4 = data["Prompt"][len(data)-752]
print(comment4)

Comment for November:
Visitation at Minuteman Missile NHS was significantly impacted during November 2013 for the following events/reasons:  1. The elevator to the underground Launch Control Center at Delta-01, our main and most-visited historic structure, was down for the entire month of November, and with Alerts on our website, and a notice on our official Facebook site, our visitors knew that this site was down and this SIGNIFICANTLY affected visitation.  2. MIMI experienced SEVERE weather for November 2013 with tours to the topside Launch Control Facility being cancelled on two separate occassions due to ice on the roads and extreme conditions.  This severe weather also caused the cancellation of a scheduled High School visit to MIMI.


In [77]:
# v5 on comment4 with temp=0.0. The recorded answer restates most of the
# comment over several sentences, longer than the 1-2 sentences the prompt asks for.
generate_output(comment4, verbose=False, temp=0.0)

'November: The elevator to the underground Launch Control Center at Delta-01 was down for the entire month of November, and with Alerts on our website, and a notice on our official Facebook site, our visitors knew that this site was down. MIMI experienced SEVERE weather for November 2013 with tours to the topside Launch Control Facility being cancelled on two separate occassions due to ice on the roads and extreme conditions. A scheduled High School visit to MIMI was also cancelled due to severe weather.'

# Test


=========================================================================================================================

INPUT:
Comment for July:
1.) BAIS Ranger left position at beginning of the month. 2.) MIIN has had broken HVAC/air conditioning since June. Idaho has seen record number of days at temps of high 90s-100s; prolonged heat wave has resulted in MIIN Visitor Center closing 3 to 4.5 hours early to mitigate heat exhaustion amongst staff and visitors. 3.) Yellowstone temporarily closed due to severe flooding and, even after reopening, has seen decreased visitation due to cancelled travel plans or potential visitors not knowing that the park is open. A significant portion of MIIN visitation comes from the traffic between YELL and the Pacific Northwest, so MIIN visitation may have similarly decreased. 4.) A large portion of MIIN visitors are semi/retired RV travelers; MIIN visitation may have decreased due to high cost of gas throughout the summer (national average $5.00+).  BAIS -- Estimated with a 10% decrease from July BAIS FY22 assumed from the lack of BAIS Ranger during the month of July. Bainbridge Island Japanese American Exclusion Memorial people counters have been broken since 03/2021. MIIN staff based out of Idaho and not able to easily make fixes to BAIS in WA state. The memorial is collectively cared for by the Bainbridge Island Japanese American Exclusion Memorial Association, the Bainbridge Island Japanese American Community, the Bainbridge Island Historical Museum, Bainbridge Island Parks & Recreation District and the Klondike Gold Rush National Historical Park - Seattle.

=========================================================================================================================

OUTPUT:
July: MIIN Visitor Center closed 3 to 4.5 hours early due to heat exhaustion; Yellowstone temporarily closed due to severe flooding; MIIN visitation may have decreased due to high cost of gas and cancelled travel plans.