In [None]:
import pandas as pd

SOCIAL_EXPLORER_DIR = '../data/social_explorer'


def load_clean_frame(name):
    """Load one Social Explorer pickle and keep only its complete rows.

    Args:
        name: File stem of the pickle inside ``SOCIAL_EXPLORER_DIR``
            (for example ``'crime_df'``).

    Returns:
        A new DataFrame in which every row containing a missing value has
        been removed and the index has been renumbered from zero.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    frame = pd.read_pickle(f'{SOCIAL_EXPLORER_DIR}/{name}.pkl')
    return frame.dropna().reset_index(drop=True)


# 'Qualifying Name' is deliberately kept on every frame: the question/answer
# builders in the cells below read row['Qualifying Name'], so dropping the
# column here makes them fail with a KeyError on a fresh kernel.
# Side effect: rows whose 'Qualifying Name' is missing are now removed by
# dropna() as well.
demographic_df = load_clean_frame('demographic_df')
environment_df = load_clean_frame('environment_df')
housing_df = load_clean_frame('housing_df')
crime_df = load_clean_frame('crime_df')
school_df = load_clean_frame('school_df')

In [None]:
def summarize_row(row):
    """Describe the combined violent and property crime rate of one place.

    Args:
        row: Mapping-like record providing 'Qualifying Name' and
            'Total Violent and Property Crimes %'.

    Returns:
        A single sentence fragment (ending in a space) stating the rate.
    """
    place = row['Qualifying Name']
    crime_rate = row['Total Violent and Property Crimes %']
    return f"{place} has a violent and property crime rate {crime_rate} % of total population "

def create_combined_string(row):
    """Wrap the crime question and its answer in the [INST] prompt template.

    Args:
        row: Mapping-like record providing 'Qualifying Name' plus whatever
            ``summarize_row`` reads from it.

    Returns:
        The string "[INST] <question> [/INST] <answer> </s>".
    """
    return "[INST] {} [/INST] {} </s>".format(
        f"How is crime in {row['Qualifying Name']}?",
        summarize_row(row),
    )
# 
# Build one prompt/answer training string per crime row.
crime_prompts = crime_df.apply(create_combined_string, axis=1)
crime_df['Combined'] = crime_prompts

# Preview the first 100 rows, including the new column.
crime_df.head(100)

In [None]:
def _format_rank(value):
    """Render a rank, dropping the '.0' of a whole-number float."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def summarize_row(row):
    """Summarize the elementary, middle and high school ranks of one place.

    The text contains no padding between the label and the first rank and
    no trailing newline, and whole-number float ranks are written as
    integers (``12`` rather than ``12.0``).

    Args:
        row: Mapping-like record providing 'Qualifying Name',
            'SchoolDigger Rank Elementary', 'SchoolDigger Rank Middle' and
            'SchoolDigger Rank High'.

    Returns:
        A one-sentence summary string.
    """
    elementary = _format_rank(row['SchoolDigger Rank Elementary'])
    middle = _format_rank(row['SchoolDigger Rank Middle'])
    high = _format_rank(row['SchoolDigger Rank High'])
    return (
        f"{row['Qualifying Name']} School rankings: "
        f"Elementary {elementary}, Middle {middle}, High {high}."
    )

def create_combined_string(row):
    """Wrap the school-ranking question and answer in the [INST] template.

    Args:
        row: Mapping-like record providing 'Qualifying Name' plus whatever
            ``summarize_row`` reads from it.

    Returns:
        The string "[INST] <question> [/INST] <answer> </s>".
    """
    return "[INST] {} [/INST] {} </s>".format(
        f"How are schools ranked in {row['Qualifying Name']}?",
        summarize_row(row),
    )

# Attach the formatted prompt/answer string to every school row.
school_prompts = school_df.apply(create_combined_string, axis=1)
school_df['Combined'] = school_prompts

# TODO: remove white space, convert floats to int, and add another QA row
# showing the min and max for each grouping (elementary, middle, high) in
# the US.
#
# Disabled scratch exploration (36 047 New York Kings County 35620):
# print(school_df.iloc[0]['Combined'])
# school_df.columns
# school_df = school_df[['Qualifying Name','FIPS', 'LEA', 'State']]
# filtered_df = school_df[school_df['Qualifying Name'].str.contains('')]
# filtered_df = school_df.loc[school_df['State'] == "36"]
# filtered_df

In [None]:

def create_question_answer_pairs(row):
    """Build one demographic question/answer record per population group.

    Args:
        row: Mapping-like record providing 'Qualifying Name',
            '2020 Total Population' and the seven '<group> 2020 %' columns
            listed in ``group_columns``.

    Returns:
        A list of dicts with the keys 'Qualifying Name', 'Question',
        'Answer' and 'Combined', one per group, in listing order.
    """
    # (plural label used in the text, column holding that group's percentage)
    group_columns = (
        ('Whites', 'White 2020 %'),
        ('American Indians and Alaska Native', 'American Indian and Alaska Native 2020 %'),
        ('African Americans or Blacks', 'African American or Black 2020 %'),
        ('Native Hawaiians and Other Pacific Islanders', 'Native Hawaiian and Other Pacific Islander 2020 %'),
        ('Hispanics', 'Hispanic 2020 %'),
        ('Asians', 'Asian 2020 %'),
        ('Others', 'Other 2020 %'),
    )

    def build_pair(label, column):
        """Format the question, answer and combined prompt for one group."""
        question = f"What percentage of {label} lives in {row['Qualifying Name']}?"
        answer = (
            f"{label} make up {row[column]}% of the total population in "
            f"{row['Qualifying Name']} which has a total population of "
            f"{row['2020 Total Population']}."
        )
        return {
            'Qualifying Name': row['Qualifying Name'],
            'Question': question,
            'Answer': answer,
            'Combined': f"[INST] {question} [/INST] {answer} </s>",
        }

    return [build_pair(label, column) for label, column in group_columns]

# Expand every demographic row into its question/answer records and flatten
# them into a single list.
question_answer_pairs = [
    pair
    for _, demographic_row in demographic_df.iterrows()
    for pair in create_question_answer_pairs(demographic_row)
]

# NOTE: demographic_df is rebound from the raw table to the QA table here, so
# this cell cannot be re-run without re-running the loading cell first.
demographic_df = pd.DataFrame(question_answer_pairs)

print(demographic_df.shape)
demographic_df.head(100)

In [None]:

def create_housing_question_answer_pairs(row):
    """Build the rent and income question/answer records for one place.

    Four records cover the fair market rent of one- to four-bedroom units
    (each also quoting the median family income); a fifth record covers the
    median family income on its own.

    Args:
        row: Mapping-like record providing 'Qualifying Name',
            'Median Family Income (5-year ACS)' and the four
            'Fair Market Rent for ...' columns.

    Returns:
        A list of five dicts with the keys 'Qualifying Name', 'Question',
        'Answer' and 'Combined'.
    """
    place = row['Qualifying Name']
    income = row['Median Family Income (5-year ACS)']

    def record(question, answer):
        """Package one question/answer pair with its combined prompt."""
        return {
            'Qualifying Name': place,
            'Question': question,
            'Answer': answer,
            'Combined': f"[INST] {question} [/INST] {answer} </s>",
        }

    # (unit size used in the text, column holding its fair market rent)
    rent_columns = (
        ('one-bedroom', 'Fair Market Rent for One Bedroom'),
        ('two-bedroom', 'Fair Market Rent for Two Bedrooms'),
        ('three-bedroom', 'Fair Market Rent for Three Bedrooms'),
        ('four-bedroom', 'Fair Market Rent for Four Bedrooms'),
    )

    records = [
        record(
            f"What is the fair market rent for a {room_type} in {place}?",
            f"The fair market rent for a {room_type} in {place} is ${row[column]} "
            f"with a median family income of ${income}.",
        )
        for room_type, column in rent_columns
    ]
    records.append(
        record(
            f"What is the median family income in {place}?",
            f"The median family income in {place} is ${income}.",
        )
    )
    return records

# Expand every housing row into its question/answer records and flatten them
# into a single list.
housing_question_answer_pairs = [
    pair
    for _, housing_row in housing_df.iterrows()
    for pair in create_housing_question_answer_pairs(housing_row)
]

# NOTE: housing_df is rebound from the raw table to the QA table here, so
# this cell cannot be re-run without re-running the loading cell first.
housing_df = pd.DataFrame(housing_question_answer_pairs)

housing_df.head()

In [None]:

def create_land_cover_question_answer_pairs(row):
    """Build one land-cover question/answer record per cover type.

    Args:
        row: Mapping-like record providing 'Qualifying Name' and the sixteen
            '<Cover Type> %' columns listed in ``cover_columns``.

    Returns:
        A list of dicts with the keys 'Qualifying Name', 'Question',
        'Answer' and 'Combined', one per cover type, in listing order.
    """
    # (cover type as written in the text, column holding its percentage)
    cover_columns = (
        ('open water', 'Open Water %'),
        ('ice/snow', 'Ice/Snow %'),
        ('developed open space', 'Developed Open Space %'),
        ('developed low intensity', 'Developed Low Intensity %'),
        ('developed medium intensity', 'Developed Medium Intensity %'),
        ('developed high intensity', 'Developed High Intensity %'),
        ('barren land', 'Barren Land %'),
        ('deciduous forest', 'Deciduous Forest %'),
        ('evergreen forest', 'Evergreen Forest %'),
        ('mixed forest', 'Mixed Forest %'),
        ('shrub/scrub', 'Shrub/Scrub %'),
        ('grassland/herbaceous', 'Grassland/Herbaceous %'),
        ('pasture/hay', 'Pasture/Hay %'),
        ('cultivated crops', 'Cultivated Crops %'),
        ('woody wetlands', 'Woody Wetlands %'),
        ('emergent herbaceous wetlands', 'Emergent Herbaceous Wetlands %'),
    )

    records = []
    for cover_type, column in cover_columns:
        question = f"What percentage of {cover_type} is in {row['Qualifying Name']}?"
        answer = f"{row['Qualifying Name']} has {row[column]}% of its land covered by {cover_type}"
        records.append(
            {
                'Qualifying Name': row['Qualifying Name'],
                'Question': question,
                'Answer': answer,
                'Combined': f"[INST] {question} [/INST] {answer} </s>",
            }
        )
    return records

# Expand every environment row into its land-cover question/answer records
# and flatten them into a single list.
land_cover_question_answer_pairs = [
    pair
    for _, environment_row in environment_df.iterrows()
    for pair in create_land_cover_question_answer_pairs(environment_row)
]

# NOTE: environment_df is rebound from the raw table to the QA table here, so
# this cell cannot be re-run without re-running the loading cell first.
environment_df = pd.DataFrame(land_cover_question_answer_pairs)

# Show the first generated prompt in full, then preview the table.
print(environment_df['Combined'].iloc[0])
environment_df.head()

In [None]:
# Export the land-cover question/answer pairs as prompt/completion records,
# one JSON object per line.
jsonl = environment_df[['Question', 'Answer']].rename(
    columns={'Question': 'prompt', 'Answer': 'completion'}
)

# orient='records' never writes the index, so no `index` argument is needed.
# Passing index=False together with orient='records' raises ValueError on
# older pandas releases, so it is omitted; the written file is unchanged.
jsonl.to_json('../data/train/t5_qa_pairs.jsonl', orient='records', lines=True)

In [None]:
# Stack the 'Combined' prompt strings of each dataset into one column,
# printing the running shape after each dataset is added.
combined_df = pd.concat(
    [crime_df['Combined'], environment_df['Combined']], ignore_index=True
).to_frame(name='Combined')
print(combined_df.shape)

# school_df['Combined'] is currently left out of the combined dataset.
for qa_frame in (demographic_df, housing_df):
    combined_df = pd.concat(
        [combined_df['Combined'], qa_frame['Combined']], ignore_index=True
    ).to_frame(name='Combined')
    print(combined_df.shape)

combined_df.to_pickle('../data/social_explorer/4_dataset_expanded.pkl')

In [None]:
# Spot-check one generated prompt; assumes the combined table has more than
# 60000 rows (raises IndexError otherwise).
print(combined_df['Combined'].iloc[60000])