In [38]:
import pandas as pd

crime = pd.read_csv("../data/cleaned/crime_cleaned.csv")
edu = pd.read_csv("../data/cleaned/education_cleaned.csv")

crime.head(), edu.head()

(   year state_abbr       state  population  violent_crime  homicide  robbery  \
 0  2015         AK      Alaska      737709           5391        59      761   
 1  2015         AL     Alabama     4853875          22957       348     4612   
 2  2015         AR    Arkansas     2977853          15769       189     2117   
 3  2015         AZ     Arizona     6817565          27968       306     6360   
 4  2015         CA  California    38993940         166883      1861    52862   
 
    aggravated_assault  property_crime  burglary  larceny  motor_vehicle_theft  \
 0                3671           20806      3511    15249                 2046   
 1               15960          144785     35265    99182                10338   
 2               11455           97391     22851    68720                 5820   
 3               18193          207184     38010   152388                16786   
 4               99349         1024914    197404   656517               170993   
 
    violent_crime_

In [39]:
print("Crime years:", crime['year'].unique())
print("Education years:", edu['year'].unique())

print("\nNumber of crime states:", crime['state'].nunique())
print("Number of education states:", edu['state'].nunique())

Crime years: [2015 2016 2017 2018 2019 2020 2021]
Education years: [2015 2017 2019 2021 2023]

Number of crime states: 51
Number of education states: 58


In [40]:
# Step 2: keep only years that appear in BOTH datasets
common_years = sorted(set(crime['year']).intersection(edu['year']))
print("Common years:", common_years)

crime_sub = crime[crime['year'].isin(common_years)].copy()
edu_sub   = edu[edu['year'].isin(common_years)].copy()

print("Crime shape after year filter:", crime_sub.shape)
print("Education shape after year filter:", edu_sub.shape)

crime_sub.head()

Common years: [2015, 2017, 2019, 2021]
Crime shape after year filter: (204, 17)
Education shape after year filter: (4332, 4)


Unnamed: 0,year,state_abbr,state,population,violent_crime,homicide,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,violent_crime_rate,property_crime_rate,homicide_rate,robbery_rate,aggravated_assault_rate
0,2015,AK,Alaska,737709,5391,59,761,3671,20806,3511,15249,2046,730.775956,2820.353283,7.997734,103.157207,497.621691
1,2015,AL,Alabama,4853875,22957,348,4612,15960,144785,35265,99182,10338,472.962324,2982.874507,7.169529,95.016868,328.809456
2,2015,AR,Arkansas,2977853,15769,189,2117,11455,97391,22851,68720,5820,529.542593,3270.510667,6.346855,71.091488,384.673119
3,2015,AZ,Arizona,6817565,27968,306,6360,18193,207184,38010,152388,16786,410.234446,3038.973592,4.488406,93.288439,266.854808
4,2015,CA,California,38993940,166883,1861,52862,99349,1024914,197404,656517,170993,427.971628,2628.393027,4.772536,135.564654,254.780615


In [41]:
crime_states = set(crime_sub['state'].str.upper())
edu_states = set(edu_sub['state'].str.upper())

print("States only in education but NOT in crime:")
print(sorted(edu_states - crime_states))

print("\nStates only in crime but NOT in education:")
print(sorted(crime_states - edu_states))

States only in education but NOT in crime:
['AMERICAN SAMOA', 'BUREAU OF INDIAN EDUCATION', 'DEPARTMENT OF DEFENSE EDUCATION ACTIVITY', 'GUAM', 'NORTHERN MARIANAS', 'PUERTO RICO', 'U.S. VIRGIN ISLANDS']

States only in crime but NOT in education:
[]


In [42]:
invalid_states = [
    'AMERICAN SAMOA',
    'BUREAU OF INDIAN EDUCATION',
    'DEPARTMENT OF DEFENSE EDUCATION ACTIVITY',
    'GUAM',
    'NORTHERN MARIANAS',
    'PUERTO RICO',
    'U.S. VIRGIN ISLANDS'
]

edu_sub_clean = edu_sub[~edu_sub['state'].isin(invalid_states)].copy()

print("Education rows before:", edu_sub.shape)
print("Education rows after removing invalid states:", edu_sub_clean.shape)
print("Unique states now:", edu_sub_clean['state'].nunique())

Education rows before: (4332, 4)
Education rows after removing invalid states: (3919, 4)
Unique states now: 51


In [43]:
# Fix state formatting
crime_sub["state"] = crime_sub["state"].str.upper()
edu_sub_clean["state"] = edu_sub_clean["state"].str.upper()

# Convert edu_staff_total to numeric (invalid values -> NaN)
edu_sub_clean["edu_staff_total"] = pd.to_numeric(
    edu_sub_clean["edu_staff_total"],
    errors="coerce"
)

# Aggregate education data to state-year level
edu_state_year = (
    edu_sub_clean
    .groupby(["state", "year"], as_index=False)
    .agg({"edu_staff_total": "sum"})
)

print("Aggregated education shape:", edu_state_year.shape)
print(edu_state_year.head())

# Merge
final = crime_sub.merge(
    edu_state_year,
    on=["state", "year"],
    how="inner"
)

print("Final merged shape:", final.shape)
final.head()

Aggregated education shape: (204, 3)
     state  year  edu_staff_total
0  ALABAMA  2015         71628.43
1  ALABAMA  2017             0.00
2  ALABAMA  2019             0.00
3  ALABAMA  2021             0.00
4   ALASKA  2015         16982.39
Final merged shape: (204, 18)


Unnamed: 0,year,state_abbr,state,population,violent_crime,homicide,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,violent_crime_rate,property_crime_rate,homicide_rate,robbery_rate,aggravated_assault_rate,edu_staff_total
0,2015,AK,ALASKA,737709,5391,59,761,3671,20806,3511,15249,2046,730.775956,2820.353283,7.997734,103.157207,497.621691,16982.39
1,2015,AL,ALABAMA,4853875,22957,348,4612,15960,144785,35265,99182,10338,472.962324,2982.874507,7.169529,95.016868,328.809456,71628.43
2,2015,AR,ARKANSAS,2977853,15769,189,2117,11455,97391,22851,68720,5820,529.542593,3270.510667,6.346855,71.091488,384.673119,73658.2
3,2015,AZ,ARIZONA,6817565,27968,306,6360,18193,207184,38010,152388,16786,410.234446,3038.973592,4.488406,93.288439,266.854808,103174.6
4,2015,CA,CALIFORNIA,38993940,166883,1861,52862,99349,1024914,197404,656517,170993,427.971628,2628.393027,4.772536,135.564654,254.780615,577836.1


In [44]:
# Save integrated dataset
final.to_csv("../data/merged.csv", index=False)

print("Integrated dataset saved to: data/merged.csv")

Integrated dataset saved to: data/merged.csv
