In [41]:
import pandas as pd

# Source table of land-cover percentages (the sampled rows below suggest one row per county).
# NOTE(review): read_pickle unpickles the file, so only point this at a trusted pickle.
environment_pickle_path = '../data/social_explorer/environment_df.pkl'
df = pd.read_pickle(environment_pickle_path)

# Display the available column names as the cell output.
df.columns

Index(['FIPS', 'Qualifying Name', 'Total Area', 'Open Water %', 'Ice/Snow %',
       'Developed Open Space %', 'Developed Low Intensity %',
       'Developed Medium Intensity %', 'Developed High Intensity %',
       'Barren Land %', 'Deciduous Forest %', 'Evergreen Forest %',
       'Mixed Forest %', 'Shrub/Scrub %', 'Grassland/Herbaceous %',
       'Pasture/Hay %', 'Cultivated Crops %', 'Woody Wetlands %',
       'Emergent Herbaceous Wetlands %'],
      dtype='object')

In [42]:
# Peek at two random rows; the fixed seed keeps the preview identical across
# Restart & Run All instead of showing different counties on every execution.
df.sample(2, random_state=42)

Unnamed: 0,FIPS,Qualifying Name,Total Area,Open Water %,Ice/Snow %,Developed Open Space %,Developed Low Intensity %,Developed Medium Intensity %,Developed High Intensity %,Barren Land %,Deciduous Forest %,Evergreen Forest %,Mixed Forest %,Shrub/Scrub %,Grassland/Herbaceous %,Pasture/Hay %,Cultivated Crops %,Woody Wetlands %,Emergent Herbaceous Wetlands %
1566,29169,"Pulaski County, Missouri",1428346000.0,0.53,0.0,4.92,1.64,0.67,0.18,0.51,67.63,2.5,1.04,0.25,1.88,17.31,0.39,0.53,0.01
304,8121,"Washington County, Colorado",6537017000.0,0.14,0.0,3.25,0.09,0.01,0.0,0.03,0.05,0.0,0.0,0.12,45.58,0.73,49.8,0.09,0.09


In [43]:
# --- Thresholds (in percent) a total must exceed for its flag to be set ---
forest_threshold = 50.0
agriculture_threshold = 25.0
wetlands_threshold = 20.0
developed_threshold = 30.0
water_threshold = 10.0
natural_land_threshold = 60.0

# --- Aggregate land-cover shares -------------------------------------------
# Operands are added left to right; a missing value in any part propagates
# to the total, exactly as with the `+` operator.
df['Total Forest %'] = (
    df['Deciduous Forest %']
    .add(df['Evergreen Forest %'])
    .add(df['Mixed Forest %'])
)
# 'Pasture/Hay %' is counted in both the grassland and the agricultural total.
df['Total Grassland %'] = df['Grassland/Herbaceous %'].add(df['Pasture/Hay %'])
df['Total Agricultural %'] = df['Cultivated Crops %'].add(df['Pasture/Hay %'])
df['Total Wetlands %'] = df['Woody Wetlands %'].add(df['Emergent Herbaceous Wetlands %'])
df['Total Developed %'] = (
    df['Developed Open Space %']
    .add(df['Developed Low Intensity %'])
    .add(df['Developed Medium Intensity %'])
    .add(df['Developed High Intensity %'])
)
df['Total Water %'] = df['Open Water %']
# "Natural land" here means forest plus shrub/scrub only.
df['Total Natural Land %'] = df['Total Forest %'].add(df['Shrub/Scrub %'])

# --- Boolean flags: strictly greater than the matching threshold -----------
df['Predominantly Forest'] = df['Total Forest %'].gt(forest_threshold)
# Set whenever the 'Ice/Snow %' land-cover share is above zero.
df['Has Snow'] = df['Ice/Snow %'].gt(0)
df['Predominantly Agricultural'] = df['Total Agricultural %'].gt(agriculture_threshold)
df['Predominantly Wetlands'] = df['Total Wetlands %'].gt(wetlands_threshold)
df['Predominantly Developed'] = df['Total Developed %'].gt(developed_threshold)
df['Predominantly Water'] = df['Total Water %'].gt(water_threshold)
df['Natural Land Dominant'] = df['Total Natural Land %'].gt(natural_land_threshold)

In [44]:
def generate_prompts(row):
    """Build the prompt/completion pairs for a single row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by column
            name. It must provide 'Qualifying Name', the 'Total ... %'
            columns read below, and the boolean flag columns
            ('Predominantly ...', 'Has Snow', 'Natural Land Dominant').

    Returns:
        A list of ``{"prompt": ..., "completion": ...}`` dicts. The snow
        question and the developed-vs-natural comparison are always present;
        every other pair is included only when its flag is truthy.
    """
    county = row['Qualifying Name']
    forest_pct = row['Total Forest %']
    farm_pct = row['Total Agricultural %']
    wetland_pct = row['Total Wetlands %']
    developed_pct = row['Total Developed %']
    water_pct = row['Total Water %']
    natural_pct = row['Total Natural Land %']

    pairs = []

    def emit(question, answer):
        # Append one training example in the order it is produced.
        pairs.append({"prompt": question, "completion": answer})

    if row['Predominantly Forest']:
        emit(
            "I'm looking for a county with a lot of forest area. What would you recommend?",
            f"I would recommend {county}, which has a total forest coverage of {forest_pct:.2f}%.",
        )

    # The snow question is asked for every row; only the answer depends on the flag.
    emit(
        f"Does {county} experience snow?",
        f"Yes, {county} experiences snow coverage."
        if row['Has Snow']
        else f"No, {county} does not have snow coverage.",
    )

    if row['Predominantly Agricultural']:
        emit(
            "I'm interested in counties with extensive agricultural areas. Any recommendations?",
            f"{county} is an excellent choice, with agricultural coverage of {farm_pct:.2f}%.",
        )

    if row['Predominantly Wetlands']:
        emit(
            "Which county has a significant amount of wetlands?",
            f"{county} stands out with its wetlands coverage of {wetland_pct:.2f}%.",
        )

    if row['Predominantly Developed']:
        emit(
            "Can you recommend a county with extensive developed areas?",
            f"{county} is heavily developed, with {developed_pct:.2f}% of its land being urbanized.",
        )

    if row['Predominantly Water']:
        emit(
            "I'm looking for a county with a significant amount of open water. What would you recommend?",
            f"{county} has a remarkable open water coverage of {water_pct:.2f}%.",
        )

    if row['Natural Land Dominant']:
        emit(
            "Which county would you recommend for someone who loves natural landscapes?",
            f"{county} offers abundant natural landscapes with {natural_pct:.2f}% covered by forests and shrublands.",
        )

    # Always close with the comparison; ties count as "predominantly natural".
    if developed_pct > natural_pct:
        character = 'strong urban presence'
    else:
        character = 'predominantly natural landscape'
    emit(
        f"How do the developed areas in {county} compare to its natural land?",
        f"In {county}, developed land covers {developed_pct:.2f}%, while natural land accounts for {natural_pct:.2f}%, indicating a {character}.",
    )

    return pairs

In [45]:
# Flatten the per-row prompt lists into one list of records, in row order.
# A comprehension keeps the loop variables local: a top-level `for _, row in ...`
# rebinds `_`, which stops IPython from updating its last-output variable, and
# leaves stray `row` / `prompts` names behind in the kernel.
all_prompts = [
    pair
    for _index, county_row in df.iterrows()
    for pair in generate_prompts(county_row)
]

In [46]:
# Preview only the first few records; echoing the whole list floods the
# notebook output and bloats the saved file.
all_prompts[:5]

[{'prompt': 'Does Autauga County, Alabama experience snow?',
  'completion': 'No, Autauga County, Alabama does not have snow coverage.'},
 {'prompt': 'Which county would you recommend for someone who loves natural landscapes?',
  'completion': 'Autauga County, Alabama offers abundant natural landscapes with 66.69% covered by forests and shrublands.'},
 {'prompt': 'How do the developed areas in Autauga County, Alabama compare to its natural land?',
  'completion': 'In Autauga County, Alabama, developed land covers 5.96%, while natural land accounts for 66.69%, indicating a predominantly natural landscape.'},
 {'prompt': 'Does Baldwin County, Alabama experience snow?',
  'completion': 'No, Baldwin County, Alabama does not have snow coverage.'},
 {'prompt': 'Which county has a significant amount of wetlands?',
  'completion': 'Baldwin County, Alabama stands out with its wetlands coverage of 27.70%.'},
 {'prompt': 'How do the developed areas in Baldwin County, Alabama compare to its natura

In [47]:
# One row per prompt/completion pair.
new_df = pd.DataFrame(all_prompts)

# Write JSON Lines: one {"prompt": ..., "completion": ...} object per line.
# `index=False` is intentionally not passed: orient='records' never emits the
# index, and pandas releases before 2.1 raise ValueError when index=False is
# combined with this orient.
new_df.to_json("../data/train/t5-fine-tune-test.jsonl", orient='records', lines=True)

In [48]:
import sagemaker

# Session object used below to upload the training file.
# NOTE(review): constructed with no arguments, so it presumably picks up AWS
# credentials and region from the environment — confirm before running elsewhere.
sagemaker_session = sagemaker.Session()

In [49]:
# Upload the JSONL training file. No bucket or key prefix is passed, so the
# session's defaults apply; the call's return value (the S3 URI of the uploaded
# object) is shown as the cell output.
# NOTE(review): the displayed bucket name appears to embed the AWS account ID —
# consider clearing this output before sharing the notebook.
sagemaker_session.upload_data("../data/train/t5-fine-tune-test.jsonl")

's3://sagemaker-us-east-1-702497914099/data/t5-fine-tune-test.jsonl'