This notebook takes the data produced by preprocessing, transforms each table into the question/answer (QA) format required for fine-tuning, and then vertically joins the results (a union, via pandas `concat`).

In [None]:
import pandas as pd

# Load each pickled table, discard every row that contains a missing value,
# and renumber the surviving rows from 0.
# The 'Qualifying Name' column is deliberately kept in all four tables: the
# cells below read it when they build questions and answers.
demographic_df = (
    pd.read_pickle('../data/social_explorer/demographic_df.pkl')
    .dropna()
    .reset_index(drop=True)
)

environment_df = (
    pd.read_pickle('../data/social_explorer/environment_df.pkl')
    .dropna()
    .reset_index(drop=True)
)

housing_df = (
    pd.read_pickle('../data/social_explorer/housing_df.pkl')
    .dropna()
    .reset_index(drop=True)
)

crime_df = (
    pd.read_pickle('../data/social_explorer/crime_df.pkl')
    .dropna()
    .reset_index(drop=True)
)

In [3]:
def summarize_row(row):
    """Return a one-sentence crime summary for a single row.

    ``row`` is anything indexable by column name (e.g. a pandas Series or a
    dict) that provides 'Qualifying Name' and
    'Total Violent and Property Crimes %'.

    The returned sentence ends with a single trailing space.
    """
    place = row['Qualifying Name']
    crime_rate = row['Total Violent and Property Crimes %']
    return f"{place} has a violent and property crime rate {crime_rate} % of total population "

def create_combined_string(row):
    """Format one crime row as an ``[INST] question [/INST] answer </s>`` string.

    The question asks how crime is in the row's 'Qualifying Name'; the answer
    is the sentence produced by ``summarize_row(row)``.
    """
    prompt = f"How is crime in {row['Qualifying Name']}?"
    return f"[INST] {prompt} [/INST] {summarize_row(row)} </s>"
# 
# Build the formatted training string for every row, then store it in a new
# 'Combined' column alongside the source values.
combined_strings = crime_df.apply(create_combined_string, axis=1)
crime_df['Combined'] = combined_strings
crime_df.head(100)

Unnamed: 0,FIPS,Qualifying Name,Total Violent and Property Crimes %,Combined
0,01001,"Autauga County, Alabama",3.19,"[INST] How is crime in Autauga County, Alabama..."
1,01003,"Baldwin County, Alabama",2.22,"[INST] How is crime in Baldwin County, Alabama..."
2,01005,"Barbour County, Alabama",2.82,"[INST] How is crime in Barbour County, Alabama..."
3,01007,"Bibb County, Alabama",1.11,"[INST] How is crime in Bibb County, Alabama? [..."
4,01009,"Blount County, Alabama",2.42,"[INST] How is crime in Blount County, Alabama?..."
...,...,...,...,...
95,04011,"Greenlee County, Arizona",2.00,"[INST] How is crime in Greenlee County, Arizon..."
96,04012,"La Paz County, Arizona",2.86,"[INST] How is crime in La Paz County, Arizona?..."
97,04013,"Maricopa County, Arizona",3.44,"[INST] How is crime in Maricopa County, Arizon..."
98,04015,"Mohave County, Arizona",3.32,"[INST] How is crime in Mohave County, Arizona?..."


In [4]:
def create_question_answer_pairs(row):
    """Build one demographic question/answer record per population group.

    ``row`` is indexed by column name and must provide 'Qualifying Name',
    '2020 Total Population' and the seven '<group> 2020 %' columns listed
    below.

    Returns a list of seven dicts, in the order of the mapping below, each
    with the keys 'Qualifying Name', 'Question', 'Answer' and 'Combined'
    (the pair wrapped as ``[INST] question [/INST] answer </s>``).
    """
    # Plural label used in the generated text -> column read for that group.
    column_for_group = {
        'Whites': 'White 2020 %',
        'American Indians and Alaska Native': 'American Indian and Alaska Native 2020 %',
        'African Americans or Blacks': 'African American or Black 2020 %',
        'Native Hawaiians and Other Pacific Islanders': 'Native Hawaiian and Other Pacific Islander 2020 %',
        'Hispanics': 'Hispanic 2020 %',
        'Asians': 'Asian 2020 %',
        'Others': 'Other 2020 %'
    }
    place = row['Qualifying Name']

    def as_record(group, col):
        question = f"What percentage of {group} lives in {place}?"
        answer = (
            f"{group} make up {row[col]}% of the total population in {place} "
            f"which has a total population of {row['2020 Total Population']}."
        )
        return {
            'Qualifying Name': place,
            'Question': question,
            'Answer': answer,
            'Combined': f"[INST] {question} [/INST] {answer} </s>",
        }

    return [as_record(group, col) for group, col in column_for_group.items()]

# Flatten the per-row lists of Q/A records into one list, one entry per pair.
question_answer_pairs = [
    pair
    for _, row in demographic_df.iterrows()
    for pair in create_question_answer_pairs(row)
]

# Note: this rebinds demographic_df from the wide input table to the long
# Q/A table, so the load cell must be re-run before this cell can be re-run.
demographic_df = pd.DataFrame(question_answer_pairs)

print(demographic_df.shape)
demographic_df.head(100)

(22547, 4)


Unnamed: 0,Qualifying Name,Question,Answer,Combined
0,"Autauga County, Alabama",What percentage of Whites lives in Autauga Cou...,Whites make up 71.69% of the total population ...,[INST] What percentage of Whites lives in Auta...
1,"Autauga County, Alabama",What percentage of American Indians and Alaska...,American Indians and Alaska Native make up 0.3...,[INST] What percentage of American Indians and...
2,"Autauga County, Alabama",What percentage of African Americans or Blacks...,African Americans or Blacks make up 19.46% of ...,[INST] What percentage of African Americans or...
3,"Autauga County, Alabama",What percentage of Native Hawaiians and Other ...,Native Hawaiians and Other Pacific Islanders m...,[INST] What percentage of Native Hawaiians and...
4,"Autauga County, Alabama",What percentage of Hispanics lives in Autauga ...,Hispanics make up 3.6% of the total population...,[INST] What percentage of Hispanics lives in A...
...,...,...,...,...
95,"Clay County, Alabama",What percentage of Hispanics lives in Clay Cou...,Hispanics make up 3.15% of the total populatio...,[INST] What percentage of Hispanics lives in C...
96,"Clay County, Alabama",What percentage of Asians lives in Clay County...,Asians make up 0.34% of the total population i...,[INST] What percentage of Asians lives in Clay...
97,"Clay County, Alabama",What percentage of Others lives in Clay County...,Others make up 1.37% of the total population i...,[INST] What percentage of Others lives in Clay...
98,"Cleburne County, Alabama",What percentage of Whites lives in Cleburne Co...,Whites make up 91.78% of the total population ...,[INST] What percentage of Whites lives in Cleb...


In [5]:

def create_housing_question_answer_pairs(row):
    """Build fair-market-rent and median-income Q/A records for one row.

    ``row`` is indexed by column name and must provide 'Qualifying Name',
    'Median Family Income (5-year ACS)' and the four
    'Fair Market Rent for ...' columns listed below.

    Returns a list of five dicts: one per bedroom size (in the order listed),
    followed by one for the median family income. Each dict has the keys
    'Qualifying Name', 'Question', 'Answer' and 'Combined' (the pair wrapped
    as ``[INST] question [/INST] answer </s>``).
    """
    place = row['Qualifying Name']

    def as_record(question, answer):
        return {
            'Qualifying Name': place,
            'Question': question,
            'Answer': answer,
            'Combined': f"[INST] {question} [/INST] {answer} </s>",
        }

    # (label used in the generated text, column holding that rent figure)
    rent_columns = (
        ('one-bedroom', 'Fair Market Rent for One Bedroom'),
        ('two-bedroom', 'Fair Market Rent for Two Bedrooms'),
        ('three-bedroom', 'Fair Market Rent for Three Bedrooms'),
        ('four-bedroom', 'Fair Market Rent for Four Bedrooms'),
    )

    records = [
        as_record(
            f"What is the fair market rent for a {room_type} in {place}?",
            f"The fair market rent for a {room_type} in {place} is ${row[col]} with a median family income of ${row['Median Family Income (5-year ACS)']}.",
        )
        for room_type, col in rent_columns
    ]

    # One extra pair per row that asks about the income figure on its own.
    records.append(
        as_record(
            f"What is the median family income in {place}?",
            f"The median family income in {place} is ${row['Median Family Income (5-year ACS)']}.",
        )
    )
    return records

# Flatten the per-row lists of housing Q/A records into one list.
housing_question_answer_pairs = [
    pair
    for _, row in housing_df.iterrows()
    for pair in create_housing_question_answer_pairs(row)
]

# Note: this rebinds housing_df from the wide input table to the long Q/A
# table, so the load cell must be re-run before this cell can be re-run.
housing_df = pd.DataFrame(housing_question_answer_pairs)

housing_df.head()

Unnamed: 0,Qualifying Name,Question,Answer,Combined
0,"Autauga County, Alabama",What is the fair market rent for a one-bedroom...,The fair market rent for a one-bedroom in Auta...,[INST] What is the fair market rent for a one-...
1,"Autauga County, Alabama",What is the fair market rent for a two-bedroom...,The fair market rent for a two-bedroom in Auta...,[INST] What is the fair market rent for a two-...
2,"Autauga County, Alabama",What is the fair market rent for a three-bedro...,The fair market rent for a three-bedroom in Au...,[INST] What is the fair market rent for a thre...
3,"Autauga County, Alabama",What is the fair market rent for a four-bedroo...,The fair market rent for a four-bedroom in Aut...,[INST] What is the fair market rent for a four...
4,"Autauga County, Alabama",What is the median family income in Autauga Co...,"The median family income in Autauga County, Al...",[INST] What is the median family income in Aut...


In [6]:

def create_land_cover_question_answer_pairs(row):
    """Build one land-cover question/answer record per cover type.

    ``row`` is indexed by column name and must provide 'Qualifying Name' and
    the sixteen '<cover type> %' columns listed below.

    Returns a list of sixteen dicts, in the order listed, each with the keys
    'Qualifying Name', 'Question', 'Answer' and 'Combined' (the pair wrapped
    as ``[INST] question [/INST] answer </s>``). The answer sentence has no
    closing period.
    """
    # (label used in the generated text, column holding that percentage)
    cover_columns = (
        ('open water', 'Open Water %'),
        ('ice/snow', 'Ice/Snow %'),
        ('developed open space', 'Developed Open Space %'),
        ('developed low intensity', 'Developed Low Intensity %'),
        ('developed medium intensity', 'Developed Medium Intensity %'),
        ('developed high intensity', 'Developed High Intensity %'),
        ('barren land', 'Barren Land %'),
        ('deciduous forest', 'Deciduous Forest %'),
        ('evergreen forest', 'Evergreen Forest %'),
        ('mixed forest', 'Mixed Forest %'),
        ('shrub/scrub', 'Shrub/Scrub %'),
        ('grassland/herbaceous', 'Grassland/Herbaceous %'),
        ('pasture/hay', 'Pasture/Hay %'),
        ('cultivated crops', 'Cultivated Crops %'),
        ('woody wetlands', 'Woody Wetlands %'),
        ('emergent herbaceous wetlands', 'Emergent Herbaceous Wetlands %'),
    )
    place = row['Qualifying Name']

    def as_record(cover_type, col):
        question = f"What percentage of {cover_type} is in {place}?"
        answer = f"{place} has {row[col]}% of its land covered by {cover_type}"
        return {
            'Qualifying Name': place,
            'Question': question,
            'Answer': answer,
            'Combined': f"[INST] {question} [/INST] {answer} </s>",
        }

    return [as_record(cover_type, col) for cover_type, col in cover_columns]

# Flatten the per-row lists of land-cover Q/A records into one list.
land_cover_question_answer_pairs = [
    pair
    for _, row in environment_df.iterrows()
    for pair in create_land_cover_question_answer_pairs(row)
]

# Note: this rebinds environment_df from the wide input table to the long Q/A
# table, so the load cell must be re-run before this cell can be re-run.
environment_df = pd.DataFrame(land_cover_question_answer_pairs)
# Show the first formatted example in full; the table preview truncates it.
print(environment_df['Combined'].iloc[0])
environment_df.head()

[INST] What percentage of open water is in Autauga County, Alabama? [/INST] Autauga County, Alabama has 1.52% of its land covered by open water </s>


Unnamed: 0,Qualifying Name,Question,Answer,Combined
0,"Autauga County, Alabama",What percentage of open water is in Autauga Co...,"Autauga County, Alabama has 1.52% of its land ...",[INST] What percentage of open water is in Aut...
1,"Autauga County, Alabama",What percentage of ice/snow is in Autauga Coun...,"Autauga County, Alabama has 0.0% of its land c...",[INST] What percentage of ice/snow is in Autau...
2,"Autauga County, Alabama",What percentage of developed open space is in ...,"Autauga County, Alabama has 4.08% of its land ...",[INST] What percentage of developed open space...
3,"Autauga County, Alabama",What percentage of developed low intensity is ...,"Autauga County, Alabama has 1.34% of its land ...",[INST] What percentage of developed low intens...
4,"Autauga County, Alabama",What percentage of developed medium intensity ...,"Autauga County, Alabama has 0.43% of its land ...",[INST] What percentage of developed medium int...


In [7]:
# Export the land-cover question/answer pairs as JSON Lines, one
# {"prompt": ..., "completion": ...} object per line.
# NOTE(review): only environment_df is exported here, although the demographic
# and housing frames also carry 'Question'/'Answer' columns -- confirm that
# this is intended.
# rename() returns a new frame, so environment_df keeps its own column names.
jsonl = environment_df[['Question', 'Answer']].rename(
    columns={'Question': 'prompt', 'Answer': 'completion'}
)

# orient='records' never writes the index, so index=False is redundant here;
# it is omitted because some older pandas releases reject index=False for this
# orient with a ValueError. The written file is the same either way.
jsonl.to_json('../data/train/t5_qa_pairs.jsonl', orient='records', lines=True)

In [8]:
# Stack the 'Combined' training strings of all four datasets into one column,
# starting from crime and appending environment, demographic and housing in
# that order. The running shape is printed after each append.
combined_series = crime_df['Combined']
for source_df in (environment_df, demographic_df, housing_df):
    combined_series = pd.concat(
        [combined_series, source_df['Combined']], ignore_index=True
    )
    print(combined_series.to_frame(name='Combined').shape)

combined_df = combined_series.to_frame(name='Combined')

combined_df.to_pickle('../data/social_explorer/4_dataset_expanded.pkl')

(54623, 1)
(77170, 1)
(101000, 1)


In [9]:
# Spot-check one formatted example (position 100) from the combined dataset.
print(combined_df['Combined'].iloc[100])

[INST] How is crime in Pima County, Arizona? [/INST] Pima County, Arizona has a violent and property crime rate 4.91 % of total population  </s>


In [11]:
# Sanity check: show the demographic Q/A rows whose 'Qualifying Name'
# contains the substring 'Avery'.
name_has_avery = demographic_df['Qualifying Name'].str.contains('Avery')
filtered_df_partial = demographic_df[name_has_avery]
filtered_df_partial.head()

Unnamed: 0,Qualifying Name,Question,Answer,Combined
13272,"Avery County, North Carolina",What percentage of Whites lives in Avery Count...,Whites make up 88.43% of the total population ...,[INST] What percentage of Whites lives in Aver...
13273,"Avery County, North Carolina",What percentage of American Indians and Alaska...,American Indians and Alaska Native make up 0.5...,[INST] What percentage of American Indians and...
13274,"Avery County, North Carolina",What percentage of African Americans or Blacks...,African Americans or Blacks make up 3.73% of t...,[INST] What percentage of African Americans or...
13275,"Avery County, North Carolina",What percentage of Native Hawaiians and Other ...,Native Hawaiians and Other Pacific Islanders m...,[INST] What percentage of Native Hawaiians and...
13276,"Avery County, North Carolina",What percentage of Hispanics lives in Avery Co...,Hispanics make up 5.54% of the total populatio...,[INST] What percentage of Hispanics lives in A...
