In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os

os.makedirs("../data", exist_ok=True)
df = pd.read_csv("../data/raw/Electric_Vehicle_Population_Data.csv")

print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")

print("\nDataset Overview:")
print(df.info())

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values:")
print(df.isnull().sum())

def clean_data(df):
    cleaned_df = df.copy()
    columns_to_drop = [
    'VIN (1-10)', 'DOL Vehicle ID', 'Vehicle Location', '2020 Census Tract'
    ]
    cleaned_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
    #afv_mapping = {
    #'Clean Alternative Fuel Vehicle Eligible': 1,
    #'Eligibility unknown as battery range has not been researched': 0,
    #'Not eligible due to low battery range': 0
    #}
    #cleaned_df['CAFV_Class'] = cleaned_df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].map(afv_mapping)
    #cleaned_df.drop(columns=['Clean Alternative Fuel Vehicle (CAFV) Eligibility'], inplace=True)
    cleaned_df['CAFV_Class'] = cleaned_df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].apply(lambda x: 1 if x == 'Clean Alternative Fuel Vehicle Eligible' else 0)
    cleaned_df.drop('Clean Alternative Fuel Vehicle (CAFV) Eligibility', axis=1, inplace=True)


    
    cleaned_df['Electric Range'] = cleaned_df['Electric Range'].replace(0, np.nan)
    cleaned_df['Electric Range'].fillna(cleaned_df['Electric Range'].median(), inplace=True)

    # Base MSRP: 0 → NaN → median
    if 'Base MSRP' in cleaned_df.columns:
        cleaned_df['Base MSRP'] = cleaned_df['Base MSRP'].replace(0, np.nan)
        cleaned_df['Base MSRP'].fillna(cleaned_df['Base MSRP'].median(), inplace=True)
    
    return cleaned_df



print("\nCleaning data...")
cleaned_df = clean_data(df)

print(f"Cleaned dataset has {cleaned_df.shape[0]} rows and {cleaned_df.shape[1]} columns")
print("\nMissing values after cleaning:")
print(cleaned_df.isnull().sum())

output_path = "../data/Electric_Vehicle_Population_Data_cleaned.parquet"
cleaned_df.to_parquet(output_path, index=False)
print(f"\nCleaned dataset saved as: {output_path}")

print("\nSample of cleaned dataset:")
print(cleaned_df.head())

print("\nColumn descriptions:")
for col in cleaned_df.columns:
    print(f"- {col}: {cleaned_df[col].dtype}")

print("\nData preparation complete!")

Dataset loaded with 235692 rows and 17 columns

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235692 entries, 0 to 235691
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         235692 non-null  object 
 1   County                                             235689 non-null  object 
 2   City                                               235689 non-null  object 
 3   State                                              235692 non-null  object 
 4   Postal Code                                        235689 non-null  float64
 5   Model Year                                         235692 non-null  int64  
 6   Make                                               235692 non-null  object 
 7   Model                                              235692 non-null  object 
 8   Electric

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_df['Electric Range'].fillna(cleaned_df['Electric Range'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_df['Base MSRP'].fillna(cleaned_df['Base MSRP'].median(), inplace=True)


Cleaned dataset has 235692 rows and 13 columns

Missing values after cleaning:
County                     3
City                       3
State                      0
Postal Code                3
Model Year                 0
Make                       0
Model                      0
Electric Vehicle Type      0
Electric Range             0
Base MSRP                  0
Legislative District     494
Electric Utility           3
CAFV_Class                 0
dtype: int64

Cleaned dataset saved as: ../data/Electric_Vehicle_Population_Data_cleaned.parquet

Sample of cleaned dataset:
     County     City State  Postal Code  Model Year     Make       Model  \
0      King  Seattle    WA      98178.0        2019    TESLA     MODEL 3   
1    Kitsap  Poulsbo    WA      98370.0        2020    TESLA     MODEL Y   
2    Kitsap   Olalla    WA      98359.0        2023  HYUNDAI     IONIQ 5   
3    Kitsap  Seabeck    WA      98380.0        2021      BMW          X5   
4  Thurston  Rainier    WA      98576.0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Base MSRP'].fillna(df['Base MSRP'].median(), inplace=True)
