In [4]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub

path = kagglehub.dataset_download("salvatoresaia/ev-charging-stations-us")

print("Path to dataset files:", path)

import os
files = os.listdir(path)
print("\nAvailable files:")
for file in files:
    print(f"  - {file}")

Path to dataset files: /Users/sho/.cache/kagglehub/datasets/salvatoresaia/ev-charging-stations-us/versions/2

Available files:
  - EV_Charging_Stations_Feb82024.xlsx
  - EV_Charging_Stations_Jan312023.xlsx


In [5]:
# Load the Excel files using pd.read_excel()
import pandas as pd

ev_charging_station_feb = os.path.join(path, 'EV_Charging_Stations_Feb82024.xlsx')
ev_charging_station_jan = os.path.join(path, 'EV_Charging_Stations_Jan312023.xlsx')

# Read the Excel files
df_feb = pd.read_excel(ev_charging_station_feb)
df_jan = pd.read_excel(ev_charging_station_jan)

print(f"February 2024 dataset shape: {df_feb.shape}")
print(f"January 2023 dataset shape: {df_jan.shape}")
print("\nFirst 5 records from February 2024:")
print(df_feb.head())


February 2024 dataset shape: (65134, 13)
January 2023 dataset shape: (54238, 13)

First 5 records from February 2024:
                        Station Name      Street Address         City State  \
0           LADWP - Truesdale Center  11797 Truesdale St   Sun Valley    CA   
1      Los Angeles Convention Center  1201 S Figueroa St  Los Angeles    CA   
2      LADWP - John Ferraro Building       111 N Hope St  Los Angeles    CA   
3         LADWP - Haynes Power Plant       6801 E 2nd St   Long Beach    CA   
4  LADWP - Harbor Generating Station    161 N Island Ave   Wilmington    CA   

     ZIP  EV Level1 EVSE Num  EV Level2 EVSE Num  EV DC Fast Count  \
0  91352                 NaN                57.0               2.0   
1  90015                 NaN                 7.0               NaN   
2  90012                 NaN               338.0              12.0   
3  90803                 NaN                19.0               1.0   
4  90744                 NaN                10.0         

In [6]:
df_feb.columns

Index(['Station Name', 'Street Address', 'City', 'State', 'ZIP',
       'EV Level1 EVSE Num', 'EV Level2 EVSE Num', 'EV DC Fast Count',
       'EV Network', 'EV Connector Types', 'Access Code', 'Access Detail Code',
       'Facility Type'],
      dtype='object')

In [7]:
df_jan.columns

Index(['Station Name', 'Street Address', 'City', 'State', 'ZIP',
       'EV Level1 EVSE Num', 'EV Level2 EVSE Num', 'EV DC Fast Count',
       'EV Network', 'EV Connector Types', 'Access Code', 'Access Detail Code',
       'Facility Type'],
      dtype='object')

In [8]:
df_feb[df_feb['Access Code'] == 'public']
# These are .xlsx files, so we need to use read_excel()

Unnamed: 0,Station Name,Street Address,City,State,ZIP,EV Level1 EVSE Num,EV Level2 EVSE Num,EV DC Fast Count,EV Network,EV Connector Types,Access Code,Access Detail Code,Facility Type
1,Los Angeles Convention Center,1201 S Figueroa St,Los Angeles,CA,90015,,7.0,,Non-Networked,J1772,public,,PARKING_GARAGE
8,California Air Resources Board,9530 Telstar Ave,El Monte,CA,91731,,3.0,,Non-Networked,J1772,public,,STATE_GOV
10,Scripps Green Hospital,10666 N Torrey Pines Rd,La Jolla,CA,92037,,1.0,,Non-Networked,J1772,public,,HOSPITAL
11,Galpin Motors,15421 Roscoe Blvd,Sepulveda,CA,91343,,2.0,,Non-Networked,J1772,public,CALL,CAR_DEALER
12,Galleria at Tyler,1299 Galleria at Tyler,Riverside,CA,92503,,4.0,,Non-Networked,J1772,public,,SHOPPING_MALL
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65129,Davis Chevrolet of Delano,"505 Babcock Blvd,",Delano,MN,55328,,,1.0,EV Connect,J1772COMBO,public,,
65130,9000 N Division St,9000 N Division St,Spokane,WA,99218,,1.0,6.0,EV Connect,J1772 J1772COMBO,public,,
65131,Truist Park Purple Deck,TBD,Atlanta,GA,30339,,4.0,,FLASH,J1772,public,,PUBLIC
65132,One Victory Park Garage,2323 Victory Ave.,Dallas,TX,75219,,5.0,,FLASH,J1772,public,,WORKPLACE


In [9]:
# Download US ZIP code population dataset
import kagglehub

# Download the most recent ZIP code dataset (Oct 2024)
zip_path = kagglehub.dataset_download("bwandowando/us-zip-codes-database-from-simplemaps-com")

print("Path to ZIP code dataset:", zip_path)

# List files in the dataset
import os
zip_files = os.listdir(zip_path)
print("\nAvailable files:")
for file in zip_files:
    print(f"  - {file}")

KeyboardInterrupt: 

In [None]:
zip_pop_df = pd.read_csv(f'{zip_path}/uszips.csv')

df_feb = df_feb.merge(
    zip_pop_df[['zip', 'population']], 
    left_on='ZIP',      
    right_on='zip',     
    how='left'         
)

#finding population of zip code for each charger to create "new" feature
df_feb = df_feb.rename(columns={'population': 'ZIP_Population'})

df_feb = df_feb.drop(['zip'], axis=1)

#turning NaNs to 0s for quantitative columns
df_feb['EV Level1 EVSE Num'] = df_feb['EV Level1 EVSE Num'].fillna(0)
df_feb['EV Level2 EVSE Num'] = df_feb['EV Level2 EVSE Num'].fillna(0)
df_feb['EV DC Fast Count'] = df_feb['EV DC Fast Count'].fillna(0)

#Mapping EV Network Column
set_networks = df_feb['EV Network'].unique()
network_mapping = {}
for i, network in enumerate(set_networks, start=1): 
    if pd.notna(network):  # only map non-NaN values
        network_mapping[network] = i

df_feb['EV Network Numeric'] = df_feb['EV Network'].map(network_mapping).fillna(0) #fill NaN with 0


#Mapping Facility Types
set_fac = df_feb['Facility Type'].unique()
fac_mapping = {}
for i, type in enumerate(set_fac, start=1): 
    if pd.notna(type):  # only map non-NaN values
        fac_mapping[type] = i

df_feb['EV CFacility Type'] = df_feb['Facility Type'].map(fac_mapping).fillna(0) #fill NaN with 0


#processing Access Detail Code (0 is None, 1 else)
df_feb['Access Detail Code'] = df_feb['Access Detail Code'].notna().astype(int)

#processing EV Connector Types 
df_feb['Connector_List'] = df_feb['EV Connector Types'].apply(
    lambda x: x.split() if pd.notna(x) else []
)
connector_mapping = {}
counter = 1
for elem in df_feb['Connector_List']:
    for conn in elem:
        if conn not in connector_mapping:
            connector_mapping[conn] = counter
            counter += 1

df_feb['Connector_Numeric_List'] = df_feb['Connector_List'].apply(
    lambda x: [connector_mapping[conn] for conn in x] if x else []
)






NameError: name 'pd' is not defined

In [13]:
print(type(df_feb['EV Connector Types'][0]))

<class 'str'>
