## Region Type seperation

In [83]:
#import libraries
import pandas as pd
from pydantic import BaseModel, ValidationError #for validation

In [84]:
# Pydantic library utilized for validation (utilized for every new API pull forward)
#Resources: https://docs.pydantic.dev/latest/#pydantic-examples, https://docs.pydantic.dev/latest/concepts/dataclasses/#json-dumping, Xpert Learning Assistant
class Region(BaseModel):
    region_id: int #must be an integerFil
    region_type: str # These are all named types
    region: str # string data (mixture of varaibles and strings)

In [85]:
#Region type counts
regions= pd.read_csv('Clean_data/Regions.csv')

regions['region_type'].value_counts()

city      10417
neigh     10122
zip        3701
county     3006
metro       632
Name: region_type, dtype: int64

In [86]:
# Validate each row using Pydantic
#Resources: https://docs.pydantic.dev/latest/#pydantic-examples
errors = []
for index, row in regions.iterrows(): #itterate through all rows in Regions.csv
    try:
        # Convert row to dictionary and validate
        region = Region(**row.to_dict())
        #print("Validation successful for row:", index)
    except ValidationError as e:
        print(f"Validation error in row {index}: {e}")
        errors.append((index, str(e)))

# errors found, needed to correct string before we split
if errors:
    print(f"There were validation errors in {len(errors)} rows.")

In [87]:
#Splitting by type
city_df= regions[regions['region_type']=='city'].copy()
neigh_df= regions[regions['region_type']=='neigh'].copy()
zip_df= regions[regions['region_type']=='zip'].copy()
county_df= regions[regions['region_type']=='county'].copy()
metro_df= regions[regions['region_type']=='metro'].copy()


In [88]:
#city
city_df[['City', 'State', 'Metro_Area', 'County']] = city_df['region'].str.split(';', expand=True)
#replacing NA's
city_df['Metro_Area'] = city_df['Metro_Area'].replace('nan', pd.NA)
# Drop the 'region' column
city_df.drop(columns='region', inplace=True)

city_df.head()

Unnamed: 0,region_id,region_type,City,State,Metro_Area,County
1,26591,city,Port Leyden,NY,,Lewis County
2,3986,city,Carson,VA,"Richmond, VA",Dinwiddie County
3,6626,city,Princeton,IA,"Davenport-Moline-Rock Island, IA-IL",Scott County
4,49285,city,Jackson,NH,,Carroll County
5,42441,city,Dorset,OH,"Ashtabula, OH",Ashtabula County


In [89]:
city_df.dtypes

region_id       int64
region_type    object
City           object
State          object
Metro_Area     object
County         object
dtype: object

In [90]:
#Neighborhood
neigh_df[['Neighborhood', 'State', 'Metro_Area', 'City', 'County']] = neigh_df['region'].str.split(';', expand=True)
# Drop the 'region' column
neigh_df.drop(columns='region', inplace=True)

neigh_df.head()

Unnamed: 0,region_id,region_type,Neighborhood,State,Metro_Area,City,County
0,403211,neigh,Longwood,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York,Bronx County
18,271495,neigh,North Dallas,TX,"Dallas-Fort Worth-Arlington, TX",Dallas County,Dallas
36,343208,neigh,Highbridge,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York,Bronx County
48,270891,neigh,Morningside Heights,NY,"New York-Newark-Jersey City, NY-NJ-PA",New York,New York County
69,403295,neigh,Lake View East,IL,Chicago-Naperville-Elgin,Cook County,Chicago


In [91]:
neigh_df.dtypes

region_id        int64
region_type     object
Neighborhood    object
State           object
Metro_Area      object
City            object
County          object
dtype: object

In [92]:
#zipcode
zip_df[['Zipcode', 'State', 'Metro_Area', 'City', 'County']] = zip_df['region'].str.split(';', expand=True)
#replacing NA's
zip_df['City'] = zip_df['City'].replace('nan', pd.NA)
# Drop the 'region' column
zip_df.drop(columns='region', inplace=True)

zip_df.head()

Unnamed: 0,region_id,region_type,Zipcode,State,Metro_Area,City,County
2246,58924,zip,2826,RI,"Providence-Warwick, RI-MA",Burrillville,Providence County
2572,58472,zip,1745,MA,"Worcester, MA-CT",,Worcester County
3008,98400,zip,95721,CA,"Sacramento-Roseville-Folsom, CA",South Lake Tahoe,El Dorado County
3062,98402,zip,95724,CA,"Sacramento-Roseville-Folsom, CA",,Placer County
3129,95851,zip,89155,NV,"Las Vegas-Henderson-Paradise, NV",Las Vegas,Clark County


In [93]:
#Clean zipcodes, some of these are missing a digit.
zip_df['Zipcode'] = zip_df['Zipcode'].astype(int).apply(lambda x: f'{x:05d}')
zip_df.head()

Unnamed: 0,region_id,region_type,Zipcode,State,Metro_Area,City,County
2246,58924,zip,2826,RI,"Providence-Warwick, RI-MA",Burrillville,Providence County
2572,58472,zip,1745,MA,"Worcester, MA-CT",,Worcester County
3008,98400,zip,95721,CA,"Sacramento-Roseville-Folsom, CA",South Lake Tahoe,El Dorado County
3062,98402,zip,95724,CA,"Sacramento-Roseville-Folsom, CA",,Placer County
3129,95851,zip,89155,NV,"Las Vegas-Henderson-Paradise, NV",Las Vegas,Clark County


In [94]:
zip_df.dtypes

region_id       int64
region_type    object
Zipcode        object
State          object
Metro_Area     object
City           object
County         object
dtype: object

In [95]:
#county
county_df[['County_Name', 'State', 'Metro_Area']] = county_df['region'].str.split(';', expand=True)
county_df['Metro_Area'] = county_df['Metro_Area'].replace('nan', pd.NA)
# Drop the 'region' column
county_df.drop(columns='region', inplace=True)
county_df.head()

Unnamed: 0,region_id,region_type,County_Name,State,Metro_Area
40,2375,county,Lamoille County,VT,
41,865,county,Benton County,IA,"Cedar Rapids, IA"
42,2481,county,Plymouth County,IA,
43,703,county,Salem City,VA,"Roanoke, VA"
44,568,county,Iosco County,MI,


In [96]:
county_df.dtypes

region_id       int64
region_type    object
County_Name    object
State          object
Metro_Area     object
dtype: object

In [97]:
#metro
metro_df[['Metro_Name', 'State']] = metro_df['region'].str.split(', ', expand=True)
# Drop the 'region' column
metro_df.drop(columns='region', inplace=True)
metro_df.head()

Unnamed: 0,region_id,region_type,Metro_Name,State
403,394859,metro,Midland,MI
475,395082,metro,Selinsgrove,PA
626,394801,metro,Logansport,IN
764,394419,metro,Brookings,SD
1133,394642,metro,Grants,NM


In [98]:
metro_df.dtypes

region_id       int64
region_type    object
Metro_Name     object
State          object
dtype: object

## CSV File extractions

In [99]:
#Paths to CSV files
Split_path = 'Region_split/City.csv'
Split_path2 = 'Region_split/Neighborhood.csv'
Split_path3 = 'Region_split/Zip.csv'
Split_path4 = 'Region_split/County.csv'
Split_path5 = 'Region_split/Metro.csv'

In [100]:
city_df.to_csv(Split_path, index=False)
neigh_df.to_csv(Split_path2, index=False)
zip_df.to_csv(Split_path3, index=False)
county_df.to_csv(Split_path4, index=False)
metro_df.to_csv(Split_path5, index=False)
