# Libraries

In [33]:
import pandas as pd
from openai import OpenAI
import tiktoken
import tqdm
API_KEY = '<KEY>'
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Load Data

In [34]:
comments = pd.read_csv('Data/comments.csv')
comments['Date'] = pd.to_datetime(comments['Date'])
comments = comments.sort_values(by=['Date', 'UnitCode'], ascending=[True, True]).reset_index(drop=True)
comments['Year'] = comments['Date'].dt.year
comments['MonthName'] = comments['Date'].dt.month_name()
comments['MonthNumber'] = comments['Date'].dt.month

display(comments)

Unnamed: 0,UnitCode,ParkName,Date,Comments,Year,MonthName,MonthNumber
0,WORI,Women's Rights NHP,1993-01-01,Closed New Year's Day Two days at zero,1993,January,1
1,WORI,Women's Rights NHP,1993-02-01,Closed 2/13 at 4p.m. due to Blizzard Two days ...,1993,February,2
2,WORI,Women's Rights NHP,1993-03-01,Park Closed from 12:00 on 3/13 to 3/15 due to ...,1993,March,3
3,WORI,Women's Rights NHP,1993-07-01,Grand Opening of new Wesleyan Chapel Block hel...,1993,July,7
4,WORI,Women's Rights NHP,1993-08-01,Grand Opening of new Wesleyan Chapel Block hel...,1993,August,8
...,...,...,...,...,...,...,...
34857,WORI,Women's Rights NHP,2024-03-01,Visitor Center and Chapel are closed on Sunday...,2024,March,3
34858,WUPA,Wupatki NM,2024-03-01,3/15/24 - Closed due to inclement weather. 3/3...,2024,March,3
34859,ZION,Zion NP,2024-03-01,South Campground closed for construction until...,2024,March,3
34860,CHRI,Christiansted NHS,2024-04-01,on site education 1. 4/5 PR JROTC Cadets CHRI ...,2024,April,4


# Helper Functions

In [35]:
def formatComment(unitCode: str = None, year: int = None, month: int = None):
    data = comments[(comments['UnitCode'] == unitCode) & (comments['Year'] == year) & (comments['MonthNumber'] == month)]
    if data.empty:
        return 'No comments found'
    formatted = 'Comment for ' + data['MonthName'].iloc[0] + ':\n' + data['Comments'].iloc[0]
    return formatted
def createPrompt(unitCode: str = None, year: int = None, month: int = None):
    comments = formatComment(unitCode, year, month)
    return f"{openai_prompt}\n\n{comments}"
def countTokens(prompt: str):
    return len(encoding.encode(prompt))

openai_prompt = "Your task is to review some comments written by National Park data collectors to describe the visitation at their park for a given month. Analyze the comments and identify any months where specific locations were reported as closed. Your response should be 1-2 concise sentences. If no closures are mentioned, please respond with '<Month Name>: No Closures.'. Do not mention locations that are marked as using averages or estimates. Do not include closures that only happened for a few days or temporary closures due to weather or holidays. If you are unsure, default to no closures. The format should be as follows:\n<Month Name>: <1-2 concise sentence summary of closures>."

openai_prompt_tokens = countTokens(openai_prompt)
print(openai_prompt_tokens)

134


# Compile Comments Database For Training

In [32]:
commentsDataBase = comments.copy()
commentsDataBase['Prompt'] = commentsDataBase.apply(lambda x: formatComment(x['UnitCode'], x['Year'], x['MonthNumber']), axis=1)
display(commentsDataBase)

KeyboardInterrupt: 

In [None]:
commentsDataBase['Prompt'][0]

"Comment for January:\nClosed New Year's Day Two days at zero"

In [11]:
commentsDataBase['WordCount'] = commentsDataBase['Prompt'].apply(lambda x: len(x.split()))
commentsDataBase['TokenCount'] = commentsDataBase['Prompt'].apply(lambda x: countTokens(x))
commentsDataBase['CharacterCount'] = commentsDataBase['Prompt'].apply(lambda x: len(x))
commentsDataBase.sort_values(by=['TokenCount'], ascending=[True], inplace=True)
commentsDataBase.reset_index(drop=True, inplace=True)
display(commentsDataBase)
commentsDataBase.to_csv('Data/comments_with_prompts.csv', index=False)

Unnamed: 0,UnitCode,ParkName,Date,Comments,Year,MonthName,MonthNumber,Prompt,WordCount,TokenCount,CharacterCount
0,RICH,Richmond NBP,2020-09-01,COVID,2020,September,9,Comment for September:\nCOVID,4,5,28
1,DRTO,Dry Tortugas NP,2023-06-01,.,2023,June,6,Comment for June:\n.,4,5,19
2,GOGA,Golden Gate NRA,2020-08-01,COVID,2020,August,8,Comment for August:\nCOVID,4,5,25
3,LYBA,LBJ Memorial Grove on the Potomac,2012-08-01,Estimated,2012,August,8,Comment for August:\nEstimated,4,5,29
4,GOGA,Golden Gate NRA,2020-09-01,COVID,2020,September,9,Comment for September:\nCOVID,4,5,28
...,...,...,...,...,...,...,...,...,...,...,...
34857,GRSM,Great Smoky Mountains NP,2023-08-01,"Abrams Creek - Counter malfunctioning, used 20...",2023,August,8,Comment for August:\nAbrams Creek - Counter ma...,312,478,1924
34858,SARA,Saratoga NHP,2022-10-01,2022 October – Explanatory Unusual factors: 1...,2022,October,10,Comment for October:\n2022 October – Explanato...,299,482,1924
34859,SARA,Saratoga NHP,2022-09-01,2022 September – Explanatory Unusual factors:...,2022,September,9,Comment for September:\n2022 September – Expla...,322,498,2019
34860,WORI,Women's Rights NHP,2004-07-01,"July, 2005 was an exceedingly hot month, with ...",2004,July,7,"Comment for July:\nJuly, 2005 was an exceeding...",294,500,1830


In [13]:
print(commentsDataBase['WordCount'].max())

322


In [12]:
print(commentsDataBase['TokenCount'].sum())

1468147


# Filter and Sample Data For Training

In [41]:
# Filter down data
commentsDataBaseFiltered = commentsDataBase[commentsDataBase['TokenCount'] < 300].reset_index(drop=True) # Remove comments that are too long
display(commentsDataBaseFiltered)

Unnamed: 0,UnitCode,ParkName,Date,Comments,Year,MonthName,MonthNumber,Prompt,WordCount,TokenCount,CharacterCount
0,RICH,Richmond NBP,2020-09-01,COVID,2020,September,9,Comment for September:\nCOVID,4,5,28
1,RICH,Richmond NBP,2021-08-01,COVID,2021,August,8,Comment for August:\nCOVID,4,5,25
2,RICH,Richmond NBP,2022-01-01,COVID,2022,January,1,Comment for January:\nCOVID,4,5,26
3,RICH,Richmond NBP,2021-11-01,COVID,2021,November,11,Comment for November:\nCOVID,4,5,27
4,JOFI,John F. Kennedy NHS,2006-10-01,Closed,2006,October,10,Comment for October:\nClosed,4,5,27
...,...,...,...,...,...,...,...,...,...,...,...
34732,ROMO,Rocky Mountain NP,2019-01-01,The U.S. Government experienced a shutdown fro...,2019,January,1,Comment for January:\nThe U.S. Government expe...,223,294,1397
34733,NATC,Natchez NHP,2007-06-01,Special Program one is Junior Ranger History C...,2007,June,6,Comment for June:\nSpecial Program one is Juni...,211,295,1233
34734,SARA,Saratoga NHP,2024-01-01,2024 January – Explanatory General: 1. Non-au...,2024,January,1,Comment for January:\n2024 January – Explanato...,182,296,1151
34735,JECA,Jewel Cave NM,2014-08-01,Elevator #1 experienced a blown fuse at around...,2014,August,8,Comment for August:\nElevator #1 experienced a...,181,296,1107


In [61]:
# randomly sample 100 unit codes
import random
unitCodesSample = commentsDataBaseFiltered['UnitCode'].unique()
unitCodesSample = random.sample(list(unitCodesSample), 100)
sample = []
for code in unitCodesSample:
    df = commentsDataBaseFiltered[commentsDataBaseFiltered['UnitCode'] == code]
    # randomly sample 3 years
    years = df['Year'].unique()
    if len(years) < 3:
        sample.append(df)
    else:
        years = random.sample(list(years), 3)
        sample.append(df[df['Year'].isin(years)])
sample = pd.DataFrame(pd.concat(sample)).reset_index(drop=True)
sample

Unnamed: 0,UnitCode,ParkName,Date,Comments,Year,MonthName,MonthNumber,Prompt,WordCount,TokenCount,CharacterCount
0,WWIM,World War I Memorial,2021-04-01,4/17/2021 Memorial opened to the public.,2021,April,4,Comment for April:\n4/17/2021 Memorial opened...,9,17,60
1,LIBO,Lincoln Boyhood NMEM,2013-10-01,Park closed for 16 days due to government shut...,2013,October,10,Comment for October:\nPark closed for 16 days ...,12,15,72
2,LIBO,Lincoln Boyhood NMEM,2008-09-01,Sep 16: lost power due to Hurricane Ike.,2008,September,9,Comment for September:\nSep 16: lost power due...,11,16,64
3,LIBO,Lincoln Boyhood NMEM,2018-12-01,Park was closed from December 22 - December 31...,2018,December,12,Comment for December:\nPark was closed from De...,15,19,85
4,LIBO,Lincoln Boyhood NMEM,2008-06-01,"May 10-11: 4,300 Boy & Girl Scouts participate...",2008,June,6,"Comment for June:\nMay 10-11: 4,300 Boy & Girl...",14,25,82
...,...,...,...,...,...,...,...,...,...,...,...
1585,LEWI,Lewis & Clark NHP,2017-12-01,We've only had a couple icy/frosty mornings so...,2017,December,12,Comment for December:\nWe've only had a couple...,68,88,367
1586,LEWI,Lewis & Clark NHP,2017-10-01,It seems like we've been busy and had quite a ...,2017,October,10,Comment for October:\nIt seems like we've been...,68,89,390
1587,LEWI,Lewis & Clark NHP,2005-06-01,June 1-12 were counted as the rest of the scho...,2005,June,6,Comment for June:\nJune 1-12 were counted as t...,75,94,416
1588,LEWI,Lewis & Clark NHP,2017-08-01,The biggest factor impacting our visitation th...,2017,August,8,Comment for August:\nThe biggest factor impact...,125,154,680


# Estimate GPT Cost

In [21]:
n = sample.shape[0]
commentTokens = sample['TokenCount'].sum()
print("Total tokens in sample:", commentTokens)
print("Total tokens in sample + prompt:", commentTokens + openai_prompt_tokens*n)
print("Total price:", (commentTokens + openai_prompt_tokens*n) * 0.00000015 + n*150*0.0000015)

NameError: name 'sample' is not defined

In [68]:
print(sample['Comments'][1])

Park closed for 16 days due to government shutdown.


# Use GPT-3.5-Turbo to Generate Output

In [69]:
model_id = 'gpt-3.5-turbo'
client = OpenAI(api_key=API_KEY)

In [79]:
output = []
for i in tqdm.tqdm(range(0)):
    completion = completion = client.chat.completions.create(
  model="gpt-3.5-turbo",
  temperature=0.3,
  max_tokens=150,
  messages=[
    {"role": "system", "content": openai_prompt},
    {"role": "user", "content": sample['Prompt'][i]}
  ]
    )
    output.append(completion.choices[0].message.content)

100%|██████████| 1590/1590 [15:58<00:00,  1.66it/s]


# Preview Output

In [81]:
len(output)
for out in output:
    print(out)
    print("-------------------")

April: No Closures.
-------------------
October: Park closed for 16 days due to government shutdown.
-------------------
September: No Closures.
-------------------
December: Park was closed from December 22 - December 31 due to shutdown.
-------------------
June: No Closures.
-------------------
October: No Closures.
-------------------
May: No Closures.
-------------------
October: No Closures.
-------------------
April: No Closures.
-------------------
June: No Closures.
-------------------
October: No Closures.
-------------------
July: No Closures.
-------------------
December: No Closures.
-------------------
December: No Closures.
-------------------
November: Belle Haven, Daingerfield, Ft. Hunt Park, Ft. Marcy, Gravelly Point, Riverside Park, and Turkey Run were reported as closed due to malfunctioning counters. Crystal City had low visitation due to construction.
-------------------
September: No Closures.
-------------------
May: No Closures.
-------------------
April: U.S. M

# Save Output

In [80]:
output_df = sample.copy()
output_df['Output'] = output
display(output_df)
output_df.to_csv('TrainData/output.csv', index=False)

Unnamed: 0,UnitCode,ParkName,Date,Comments,Year,MonthName,MonthNumber,Prompt,WordCount,TokenCount,CharacterCount,Output
0,WWIM,World War I Memorial,2021-04-01,4/17/2021 Memorial opened to the public.,2021,April,4,Comment for April:\n4/17/2021 Memorial opened...,9,17,60,April: No Closures.
1,LIBO,Lincoln Boyhood NMEM,2013-10-01,Park closed for 16 days due to government shut...,2013,October,10,Comment for October:\nPark closed for 16 days ...,12,15,72,October: Park closed for 16 days due to govern...
2,LIBO,Lincoln Boyhood NMEM,2008-09-01,Sep 16: lost power due to Hurricane Ike.,2008,September,9,Comment for September:\nSep 16: lost power due...,11,16,64,September: No Closures.
3,LIBO,Lincoln Boyhood NMEM,2018-12-01,Park was closed from December 22 - December 31...,2018,December,12,Comment for December:\nPark was closed from De...,15,19,85,December: Park was closed from December 22 - D...
4,LIBO,Lincoln Boyhood NMEM,2008-06-01,"May 10-11: 4,300 Boy & Girl Scouts participate...",2008,June,6,"Comment for June:\nMay 10-11: 4,300 Boy & Girl...",14,25,82,June: No Closures.
...,...,...,...,...,...,...,...,...,...,...,...,...
1585,LEWI,Lewis & Clark NHP,2017-12-01,We've only had a couple icy/frosty mornings so...,2017,December,12,Comment for December:\nWe've only had a couple...,68,88,367,December: No Closures.
1586,LEWI,Lewis & Clark NHP,2017-10-01,It seems like we've been busy and had quite a ...,2017,October,10,Comment for October:\nIt seems like we've been...,68,89,390,October: No Closures.
1587,LEWI,Lewis & Clark NHP,2005-06-01,June 1-12 were counted as the rest of the scho...,2005,June,6,Comment for June:\nJune 1-12 were counted as t...,75,94,416,June: No Closures.
1588,LEWI,Lewis & Clark NHP,2017-08-01,The biggest factor impacting our visitation th...,2017,August,8,Comment for August:\nThe biggest factor impact...,125,154,680,August: No Closures.


# Convert Output to JSONL

In [36]:
output_df = pd.read_csv('TrainData/output.csv')

In [42]:
# concatenate openai prompt with comments and ouput in this format "prompt\nInput:\n<comment>\nOutput:\n<output>"
output_df_filt = output_df.copy()
output_df_filt['JSON'] = output_df.apply(lambda x: f"### INSTRUCTION:\n{openai_prompt}\n\n### INPUT:\n{x['Prompt']}\n\n### OUTPUT:\n{x['Output']}\n### END", axis=1)
output_df_filt['JSON'] = output_df.apply(lambda x: f"Input:{x['Prompt']}Output:{x['Output']}", axis=1)
display(output_df_filt)

Unnamed: 0,UnitCode,ParkName,Date,Comments,Year,MonthName,MonthNumber,Prompt,WordCount,TokenCount,CharacterCount,Output,JSON
0,WWIM,World War I Memorial,2021-04-01,4/17/2021 Memorial opened to the public.,2021,April,4,Comment for April:\n4/17/2021 Memorial opened...,9,17,60,April: No Closures.,Input:Comment for April:\n4/17/2021 Memorial ...
1,LIBO,Lincoln Boyhood NMEM,2013-10-01,Park closed for 16 days due to government shut...,2013,October,10,Comment for October:\nPark closed for 16 days ...,12,15,72,October: Park closed for 16 days due to govern...,Input:Comment for October:\nPark closed for 16...
2,LIBO,Lincoln Boyhood NMEM,2008-09-01,Sep 16: lost power due to Hurricane Ike.,2008,September,9,Comment for September:\nSep 16: lost power due...,11,16,64,September: No Closures.,Input:Comment for September:\nSep 16: lost pow...
3,LIBO,Lincoln Boyhood NMEM,2018-12-01,Park was closed from December 22 - December 31...,2018,December,12,Comment for December:\nPark was closed from De...,15,19,85,December: Park was closed from December 22 - D...,Input:Comment for December:\nPark was closed f...
4,LIBO,Lincoln Boyhood NMEM,2008-06-01,"May 10-11: 4,300 Boy & Girl Scouts participate...",2008,June,6,"Comment for June:\nMay 10-11: 4,300 Boy & Girl...",14,25,82,June: No Closures.,"Input:Comment for June:\nMay 10-11: 4,300 Boy ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1585,LEWI,Lewis & Clark NHP,2017-12-01,We've only had a couple icy/frosty mornings so...,2017,December,12,Comment for December:\nWe've only had a couple...,68,88,367,December: No Closures.,Input:Comment for December:\nWe've only had a ...
1586,LEWI,Lewis & Clark NHP,2017-10-01,It seems like we've been busy and had quite a ...,2017,October,10,Comment for October:\nIt seems like we've been...,68,89,390,October: No Closures.,Input:Comment for October:\nIt seems like we'v...
1587,LEWI,Lewis & Clark NHP,2005-06-01,June 1-12 were counted as the rest of the scho...,2005,June,6,Comment for June:\nJune 1-12 were counted as t...,75,94,416,June: No Closures.,Input:Comment for June:\nJune 1-12 were counte...
1588,LEWI,Lewis & Clark NHP,2017-08-01,The biggest factor impacting our visitation th...,2017,August,8,Comment for August:\nThe biggest factor impact...,125,154,680,August: No Closures.,Input:Comment for August:\nThe biggest factor ...


In [43]:
n = output_df_filt.shape[0]
cutoff = int(n*0.75)

In [44]:
output_df_filt['TokenCount'][:].sum()

63201

In [45]:
max(output_df_filt['JSON'].apply(lambda x: len(x)))

1301

In [46]:
print(output_df_filt['JSON'][1])

Input:Comment for October:
Park closed for 16 days due to government shutdown.Output:October: Park closed for 16 days due to government shutdown.


In [47]:
import jsonlines
n = output_df_filt.shape[0]
cutoff = int(n*0.75)
# Train data
with jsonlines.open("TrainData/Old/train.jsonl", mode='w') as writer:
    for index, row in output_df_filt.iloc[:cutoff].iterrows():
        line = row['JSON']
        writer.write({"text": line})
# Valid data
with jsonlines.open("TrainData/Old/valid.jsonl", mode='w') as writer:
    for index, row in output_df_filt.iloc[cutoff:].iterrows():
        line = row['JSON']
        writer.write({"text": line})

# Convert Model To Q4

python convert.py --hf-path meta-llama/Llama-2-13b-hf -q --mlx-path /Users/austinlackey/Documents/GitHub/llm-data-validation/Llama2-13B-Q4

# Finetune Llama3-8B

python lora.py --model /Users/austinlackey/Documents/GitHub/llm-data-validation/Quantized-Base-Models/LLama3-8B-Q4 \
               --train \
               --data /Users/austinlackey/Documents/GitHub/llm-data-validation/TrainData \
               --iters 1000 \
               --max-tokens 150 \
               --temp 0.3 \
               --batch-size 2 \
               --lora-layers 16

python lora.py --model /Users/austinlackey/Documents/GitHub/llm-data-validation/Quantized-Base-Models/LLama3-8B-Q4 \
               --adapter-file /Users/austinlackey/Documents/GitHub/llm-data-validation/Llama-3-adapters/adapters5.npz \
               --max-tokens 100 \
               --prompt ""

python lora.py --model /Users/austinlackey/Documents/GitHub/llm-data-validation/Quantized-Base-Models/LLama3-8B-Q4 \
               --train \
               --data /Users/austinlackey/Documents/GitHub/llm-data-validation/TrainData/Old \
               --iters 1000 \
               --max-tokens 100 \
               --temp 0.3 \
               --batch-size 1 \
               --lora-layers 4

python lora.py --model /Users/austinlackey/Documents/GitHub/llm-data-validation/Quantized-Base-Models/LLama3-8B-Q4 \
               --adapter-file /Users/austinlackey/Documents/GitHub/llm-data-validation/Llama-3-adapters/adapters5.npz \
               --max-tokens 100 \
               --prompt ""

# Fuse

python fuse.py --model /Users/austinlackey/Documents/GitHub/llm-data-validation/Quantized-Base-Models/LLama3-8B-Q4 \
                --adapter-file /Users/austinlackey/Documents/GitHub/llm-data-validation/Llama-3-adapters/adapters1.npz \
                --save-path /Users/austinlackey/Documents/GitHub/llm-data-validation/Finetuned-Models/Llama-3-8B-NPSClosures-v1