In [None]:
import pandas as pd

In [None]:
# Preview the raw EV registration file; the bare expression lets Jupyter render the frame.
pd.read_csv(filepath_or_buffer="Electric_Vehicle_Population_Data.csv")

In [None]:
# Data cleaning: audit of missing values
import pandas as pd

# Read the raw EV registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# 1. Null count for every column
missing_per_column = df.isna().sum()
print(missing_per_column)

# 2. Keep just the columns where at least one value is missing
missing_only = missing_per_column.loc[missing_per_column.gt(0)]
print(missing_only)

# 3. Grand total of missing cells (the per-column counts added together)
total_missing = missing_per_column.sum()
print("Total missing values:", total_missing)

In [None]:
#Data cleaning
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('Electric_Vehicle_Population_Data.csv')

# 1. Identify missing and zero values
missing_count = df['Electric Range'].isnull().sum()
zero_count = (df['Electric Range'] == 0).sum()
print(f"Before Imputation - Missing: {missing_count}, Zeros: {zero_count}")

# 2. Handling Missing/Zero values by Imputation
# Reference table: mean of the known (> 0) ranges for each (Make, Model) pair.
# Built as one chain so nothing is mutated in place.
range_ref = (
    df[df['Electric Range'] > 0]
    .groupby(['Make', 'Model'])['Electric Range']
    .mean()
    .reset_index()
    .rename(columns={'Electric Range': 'Avg_Range'})
)

# Attach each row's model average; the left join keeps every original row
df = df.merge(range_ref, on=['Make', 'Model'], how='left')

# Rows whose range is 0 or NaN receive the average for their model
mask = (df['Electric Range'] == 0) | (df['Electric Range'].isnull())
# The averages are floats: cast first so the assignment below never has to
# silently upcast an integer column (pandas >= 2.1 warns about that).
df['Electric Range'] = df['Electric Range'].astype(float)
df.loc[mask, 'Electric Range'] = df.loc[mask, 'Avg_Range']

# Models with no known range at all still have NaN here; fall back to 0
df['Electric Range'] = df['Electric Range'].fillna(0)

# Avg_Range was only a helper for the imputation; do not leave it in `df`
df = df.drop(columns='Avg_Range')

# 3. Verification
final_zeros = (df['Electric Range'] == 0).sum()
print(f"After Imputation - Zeros remaining: {final_zeros}")

In [None]:
 duplicate_count = df.duplicated().sum()
duplicate_count

In [None]:
import pandas as pd

# Reload the raw file so the VIN column can be inspected in its original form
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Schema overview (info() prints itself and returns None, which print then echoes)
print(df.info())
# Leading rows of the truncated-VIN column
vin_preview = df.loc[:, ['VIN (1-10)']].head()
print(vin_preview)

In [None]:
import hashlib
import uuid

# Start again from the raw CSV
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Method 1: Hashing with SHA-256
def hash_vin(vin, salt="secret_salt"):
    """Return the hex SHA-256 digest of ``vin`` followed by ``salt``.

    Args:
        vin: VIN value to hash. Non-string values are converted with ``str``.
        salt: Text appended to the VIN before hashing.

    Returns:
        The 64-character hexadecimal digest, or ``None`` when ``vin`` is
        missing (``None``/``NaN``/``pd.NA``), so that missing VINs stay
        missing instead of raising ``TypeError`` on ``vin + salt``.
    """
    # Missing values have nothing to hash; propagate them as missing.
    if pd.isna(vin):
        return None
    # NOTE(review): the default salt is a literal in this notebook, so anyone
    # who can read the notebook can recompute these hashes -- supply a private
    # salt if the output must not be reversible by enumeration.
    return hashlib.sha256((str(vin) + salt).encode()).hexdigest()

# Method 2: Pseudonymization -- give every distinct VIN a sequential label
vin_column = df['VIN (1-10)']
unique_vins = vin_column.unique()
vin_to_id = dict(zip(unique_vins, (f"VEH_{i:06d}" for i in range(len(unique_vins)))))

# Add both derived identifiers next to the original column
df['Hashed_VIN'] = vin_column.apply(hash_vin)
df['Anonymized_ID'] = vin_column.map(vin_to_id)

# Write a 20-row sample of the distinct (VIN, hash, label) triples to disk
# NOTE(review): this sample still contains the original VIN column.
lookup_columns = ['VIN (1-10)', 'Hashed_VIN', 'Anonymized_ID']
anonymized_df = df[lookup_columns].drop_duplicates().head(20)
anonymized_df.to_csv('anonymized_vins_sample.csv', index=False)

print(anonymized_df.head())

In [None]:
import pandas as pd

# Reload the raw data and look at how 'Vehicle Location' is encoded
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")
location_col = df['Vehicle Location']
print(location_col.head())
print(location_col.iloc[0])

In [None]:
# Five most frequent manufacturers
top_5_makes = df["Make"].value_counts().iloc[:5]
print("Top 5 EV Makes:", top_5_makes, sep="\n")

# Five most frequent model names
top_5_models = df["Model"].value_counts().iloc[:5]
print("\nTop 5 EV Models:", top_5_models, sep="\n")

In [None]:
# Registrations per county, largest first
county_counts = df['County'].value_counts()

# Show the ten largest counties
top_10_counties = county_counts.iloc[:10]
print(top_10_counties)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the ten counties with the most registrations
ax = county_counts.iloc[:10].plot.bar(figsize=(10, 5))
ax.set_title("Top 10 Counties by EV Registrations")
ax.set_xlabel("County")
ax.set_ylabel("Number of EVs")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Discard rows that have no model year (this rebinds `df`)
df = df.dropna(subset=['Model Year'])

# Registrations per model year, in chronological order
ev_by_year = df['Model Year'].value_counts().sort_index()

# Show the yearly counts
print(ev_by_year)

# Line chart of the yearly counts
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ev_by_year.index, ev_by_year.values, marker='o')
ax.set_xlabel("Model Year")
ax.set_ylabel("Number of EV Registrations")
ax.set_title("EV Adoption Trend Over Model Years")
ax.grid(True)
plt.show()

In [None]:
# Coerce Electric Range to numbers; entries that cannot be parsed become NaN
df['Electric Range'] = pd.to_numeric(df['Electric Range'], errors='coerce')

# Keep only rows with a positive range (this drops both NaN and zero)
valid_ev_range = df.loc[df['Electric Range'].gt(0)]

# Mean over the remaining rows
average_range = valid_ev_range['Electric Range'].mean()

print("Average Electric Range of EVs:", round(average_range, 2), "miles")

In [None]:
# String copy of the eligibility column (missing values become the text "nan")
df['CAFV Eligibility'] = df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].astype(str)
status = df['CAFV Eligibility']

# Denominator for all percentages
total = len(df)

# Exact match for the eligible label; case-insensitive substring match for the rest
eligible = int((status == "Clean Alternative Fuel Vehicle Eligible").sum())
unknown = int(status.str.contains("unknown", case=False).sum())
not_eligible = int(status.str.contains("Not eligible", case=False).sum())

# Shares of the whole dataset, in percent
percent_eligible = (eligible / total) * 100
percent_not_eligible = (not_eligible / total) * 100
percent_unknown = (unknown / total) * 100

print(f"Total EVs: {total}")
print(f"CAFV Eligible: {eligible} ({percent_eligible:.2f}%)")
print(f"Not Eligible: {not_eligible} ({percent_not_eligible:.2f}%)")
print(f"Unknown Eligibility: {unknown} ({percent_unknown:.2f}%)")

In [None]:
import pandas as pd

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Leading rows first, then the schema (info() prints itself; print echoes its None)
print(df.head())
print(df.info())

In [None]:
# How many rows report a range of exactly zero?
total_rows = len(df)
zero_range_count = (df['Electric Range'] == 0).sum()
print(f"Total rows: {total_rows}")
print(f"Rows with Electric Range = 0: {zero_range_count} ({zero_range_count/total_rows*100:.2f}%)")

# Variation analysis uses only positive ranges (0 is treated as "not available")
df_filtered = df.loc[df['Electric Range'] > 0]

range_stats = ['mean', 'max', 'count']

# Per-make statistics, highest mean first
make_range = (
    df_filtered.groupby('Make')['Electric Range']
    .agg(range_stats)
    .sort_values(by='mean', ascending=False)
)
print("\nTop 10 Makes by Average Electric Range:")
print(make_range.head(10))

# Per-(make, model) statistics, highest mean first
model_range = (
    df_filtered.groupby(['Make', 'Model'])['Electric Range']
    .agg(range_stats)
    .sort_values(by='mean', ascending=False)
)
print("\nTop 10 Models by Average Electric Range:")
print(model_range.head(10))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Ten most common makes among the rows with a positive range
top_makes = df_filtered['Make'].value_counts().head(10).index
df_top_makes = df_filtered[df_filtered['Make'].isin(top_makes)]

# Plot 1: mean range for the 20 makes with the highest mean
avg_range = df_filtered.groupby('Make')['Electric Range'].mean().sort_values(ascending=False).head(20)
fig_avg, ax_avg = plt.subplots(figsize=(12, 6))
sns.barplot(x=avg_range.values, y=avg_range.index, palette='viridis', ax=ax_avg)
ax_avg.set_title('Top 20 Makes by Average Electric Range (where range > 0)')
ax_avg.set_xlabel('Average Electric Range (miles)')
ax_avg.set_ylabel('Make')
fig_avg.tight_layout()
fig_avg.savefig('avg_range_by_make.png')

# Plot 2: range distribution for the ten highest-volume makes
fig_dist, ax_dist = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df_top_makes, x='Electric Range', y='Make', order=top_makes, palette='Set3', ax=ax_dist)
ax_dist.set_title('Distribution of Electric Range for Top 10 Most Common Makes')
ax_dist.set_xlabel('Electric Range (miles)')
ax_dist.set_ylabel('Make')
fig_dist.tight_layout()
fig_dist.savefig('range_distribution_top_makes.png')

# Plot 3: per-model spread for a hand-picked set of makes
makes_to_inspect = ['TESLA', 'CHEVROLET', 'NISSAN', 'FORD', 'BMW']
df_subset = df_filtered[df_filtered['Make'].isin(makes_to_inspect)]

fig_models, ax_models = plt.subplots(figsize=(14, 10))
sns.boxplot(data=df_subset, x='Electric Range', y='Model', hue='Make', dodge=False, ax=ax_models)
ax_models.set_title('Electric Range Variation by Model for Selected Makes')
ax_models.set_xlabel('Electric Range (miles)')
ax_models.set_ylabel('Model')
# Place the legend outside the axes, to the right
ax_models.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig_models.tight_layout()
fig_models.savefig('range_by_model_selected_makes.png')

# Persist the 20 (make, model) pairs with the highest mean range
summary_columns = ['mean', 'min', 'max', 'count']
summary_stats = (
    df_filtered.groupby(['Make', 'Model'])['Electric Range']
    .agg(summary_columns)
    .sort_values(by='mean', ascending=False)
    .head(20)
)
summary_stats.to_csv('range_summary_by_model.csv')

In [None]:
import pandas as pd

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Column names followed by the leading rows
print(df.columns.to_list())
print(df.head())

In [None]:
# Column names that mention MSRP, price or base, ignoring case
price_keywords = ('MSRP', 'PRICE', 'BASE')
msrp_cols = [col for col in df.columns if any(key in col.upper() for key in price_keywords)]
print(f"Columns matching search: {msrp_cols}")

In [None]:
# Print the first line of the CSV exactly as it is stored in the file
with open('Electric_Vehicle_Population_Data.csv', mode='r') as csv_file:
    header = csv_file.readline()
    print(header)

In [None]:
# Summary statistics as computed by DataFrame.describe() with default arguments
numeric_summary = df.describe()
print(numeric_summary)

In [None]:
# Compare the number of fields in the header row and in the first data row.
# csv.reader is used instead of str.split(',') so that a quoted field that
# itself contains commas is counted as one field.
import csv

with open('Electric_Vehicle_Population_Data.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    # next(..., []) yields an empty row instead of raising on an empty/short file
    header = next(reader, [])
    data = next(reader, [])
    print(f"Header length: {len(header)}")
    print(f"Data length: {len(data)}")
    print(f"Header: {header}")

In [None]:

#Urban areas dominate EV adoption, often accounting for 70–85% of registrations

#Rural areas lag behind due to:

#Limited charging infrastructure

#Longer travel distances

#Lower EV model availability

#Cities benefit from:

#Government incentives

#Higher fuel costs

#Environmental awareness

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# --- Five most common makes ---
top_5_makes = df['Make'].value_counts().head(5)

fig_makes, ax_makes = plt.subplots(figsize=(8, 5))
top_5_makes.plot(kind='bar', ax=ax_makes)
ax_makes.set_xlabel("EV Make")
ax_makes.set_ylabel("Number of Vehicles")
ax_makes.set_title("Top 5 EV Makes by Count")
plt.show()

# --- Five most common models ---
top_5_models = df['Model'].value_counts().head(5)

fig_models, ax_models = plt.subplots(figsize=(8, 5))
top_5_models.plot(kind='bar', ax=ax_models)
ax_models.set_xlabel("EV Model")
ax_models.set_ylabel("Number of Vehicles")
ax_models.set_title("Top 5 EV Models by Count")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Twenty counties with the most registrations
county_counts = df['County'].value_counts().head(20)

# One-column frame so the counts can be drawn as a heatmap
heatmap_data = county_counts.to_frame(name='EV_Count')

# Annotated single-column heatmap (integer cell labels)
fig, ax = plt.subplots(figsize=(6, 10))
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlGnBu', ax=ax)
ax.set_title("Top 20 Counties by EV Count")
ax.set_ylabel("County")
ax.set_xlabel("EV Count")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Discard rows that have no model year
df = df.dropna(subset=['Model Year'])

# Registrations per model year, in chronological order
ev_by_year = df['Model Year'].value_counts().sort_index()

# Line chart of the yearly counts
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ev_by_year.index, ev_by_year.values, marker='o')
ax.set_xlabel("Model Year")
ax.set_ylabel("Number of EV Registrations")
ax.set_title("Trend of EV Adoption by Model Year")
ax.grid(True)
plt.show()

In [None]:
import pandas as pd

# Reload the raw file to review column names and dtypes
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")
print(df.columns.to_list())
print(df.head())
print(df.info())

In [None]:
# Read only the first line of the CSV, then print it once the file is closed
with open('Electric_Vehicle_Population_Data.csv', mode='r') as csv_file:
    header = csv_file.readline()
print(header)

In [None]:
# List the numeric columns and summarise them, to see whether any could hold MSRP
numeric_cols = list(df.select_dtypes(include=['number']).columns)
print("Numeric columns:", numeric_cols)
numeric_summary = df[numeric_cols].describe()
print(numeric_summary)

In [None]:
# Last look for a price-like column: any name containing "msrp" or "price", ignoring case
cols = df.columns.tolist()
print("All columns:", cols)
price_terms = ('msrp', 'price')
msrp_col = [name for name in cols if any(term in name.lower() for term in price_terms)]
print("Matching columns:", msrp_col)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Collapse eligibility into two buckets. The test is a case-sensitive substring
# match on 'Eligible', so only values containing that exact capitalisation
# count as eligible; everything else (including missing values) does not.
is_eligible = (
    df['Clean Alternative Fuel Vehicle (CAFV) Eligibility']
    .astype(str)
    .str.contains('Eligible', regex=False)
)
df['CAFV_Status'] = is_eligible.map({True: 'Eligible', False: 'Not Eligible / Unknown'})

# Rows per bucket
cafv_counts = df['CAFV_Status'].value_counts()

# Pie chart of the two buckets with one-decimal percentage labels
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    cafv_counts.values,
    labels=cafv_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title("Proportion of CAFV-Eligible vs Non-Eligible EVs")
plt.show()

In [None]:
import pandas as pd
import plotly.express as px

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Rows without a location cannot be placed on the map
df = df.dropna(subset=['Vehicle Location'])

# Strip the "POINT (" prefix and ")" suffix, split on the space, convert to float.
# The first number is stored as Longitude and the second as Latitude.
coordinates = (
    df['Vehicle Location']
    .str.replace('POINT \\(|\\)', '', regex=True)
    .str.split(' ', expand=True)
    .astype(float)
)
df[['Longitude', 'Latitude']] = coordinates

# One marker per remaining row; hover shows city, county, make and model
map_options = dict(
    lat='Latitude',
    lon='Longitude',
    hover_name='City',
    hover_data=['County', 'Make', 'Model'],
    zoom=6,
    height=600,
    title='Geospatial Distribution of EV Registrations',
)
fig = px.scatter_mapbox(df, **map_options)

fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [None]:
# Linear regression: start from a fresh copy of the raw data
import pandas as pd

df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Leading rows and schema, to get a feel for the available columns
print(df.head())
print(df.info())

In [None]:
# Distribution of Electric Range, including how many zeros and NaNs it holds
electric_range = df['Electric Range']
range_counts = electric_range.value_counts()
zeros_count = electric_range.eq(0).sum()
nans_count = electric_range.isna().sum()

print(f"Total entries: {len(df)}")
print(f"Number of zeros in Electric Range: {zeros_count}")
print(f"Number of NaNs in Electric Range: {nans_count}")

# Mean of the positive ranges for each Electric Vehicle Type
print("\nAverage Electric Range by Vehicle Type:")
positive_range_rows = df[df['Electric Range'] > 0]
print(positive_range_rows.groupby('Electric Vehicle Type')['Electric Range'].mean())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Train only on rows that have a positive range
data = df.loc[df['Electric Range'] > 0].copy()

# 2. Predictors (Model Year, Make, Electric Vehicle Type) and the target column
features = ['Model Year', 'Make', 'Electric Vehicle Type']
target = 'Electric Range'

X = data[features]
y = data[target]

# 3. Preprocessing: the numeric column passes through untouched and comes first;
#    the categorical columns are one-hot encoded (unseen categories are ignored)
categorical_features = ['Make', 'Electric Vehicle Type']
numeric_features = ['Model Year']

one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', one_hot, categorical_features),
    ]
)

# 4. Preprocessing followed by an ordinary least-squares regressor
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# 5. 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Fit on the training portion
model_pipeline.fit(X_train, y_train)

# 7. Score on the held-out portion
y_pred = model_pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"R-squared: {r2:.4f}")

# 'num' is the first transformer and holds a single column, so the first
# coefficient of the fitted regressor is the one for Model Year
regressor = model_pipeline.named_steps['regressor']
model_year_coef = regressor.coef_[0]
print(f"\nModel Year Coefficient: {model_year_coef:.2f}")

In [None]:
import pandas as pd

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Schema first, then the leading rows
print(df.info())
print(df.head())

In [None]:
# Candidate predictor columns: cardinality and the five most common values of each
features = ['Model Year', 'Make', 'Model', 'Electric Vehicle Type', 'Clean Alternative Fuel Vehicle (CAFV) Eligibility']

for feature in features:
    column = df[feature]
    print(f"\n--- {feature} ---")
    print(column.nunique())
    print(column.value_counts().head(5))

# Target column overview. Zeros are included here; what a zero means is not
# established by this notebook (possibly unknown or very new models).
print("\nSummary of Electric Range:")
range_summary = df['Electric Range'].describe()
print(range_summary)

# Mean range per vehicle type (zeros included)
print("\nAverage Electric Range by Electric Vehicle Type:")
range_by_type = df.groupby('Electric Vehicle Type')['Electric Range'].mean()
print(range_by_type)

In [None]:
import pandas as pd

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Number of distinct manufacturers and model names
make_unique = df['Make'].nunique()
model_unique = df['Model'].nunique()

print(f"Unique 'Make' values: {make_unique}")
print(f"Unique 'Model' values: {model_unique}")

# Leading rows of the two columns for context
make_model_preview = df.loc[:, ['Make', 'Model']].head()
print(make_model_preview)

In [None]:
# Null counts for the two identifying columns
missing_make = df['Make'].isna().sum()
missing_model = df['Model'].isna().sum()

print(f"Missing 'Make': {missing_make}")
print(f"Missing 'Model': {missing_model}")

# First ten distinct makes, in order of first appearance
sample_makes = df['Make'].unique()[:10]
print(f"Sample Makes: {sample_makes}")

In [None]:
import pandas as pd

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Schema and leading rows
print(df.info())
print(df.head())

# Null count per column
print(df.isna().sum())

# Summary statistics and the ten most frequent values of 'Electric Range'
electric_range = df['Electric Range']
print(electric_range.describe())
print(electric_range.value_counts().head(10))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

# Keep rows with a positive range and no nulls in the columns used below
required_columns = ['Electric Range', 'Model Year', 'Make', 'Electric Vehicle Type']
df_filtered = df[df['Electric Range'] > 0].dropna(subset=required_columns)

# Predictors and target
features = ['Model Year', 'Make', 'Electric Vehicle Type']
X = df_filtered[features]
y = df_filtered['Electric Range']

# Replace each categorical column with integer codes (one encoder per column)
le_make = LabelEncoder()
le_type = LabelEncoder()
X_encoded = X.assign(**{
    'Make': le_make.fit_transform(X['Make']),
    'Electric Vehicle Type': le_type.fit_transform(X['Electric Vehicle Type']),
})

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# R2 on the held-out rows
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"R2 Score: {r2}")

In [None]:
from sklearn.metrics import mean_absolute_error

# Same experiment with 'Model' added as a predictor
features_with_model = ['Model Year', 'Make', 'Model', 'Electric Vehicle Type']
X_full = df_filtered[features_with_model]

# Integer-encode the three categorical columns; the existing make/type
# encoders are refitted here and a new encoder handles 'Model'
le_model = LabelEncoder()
X_full_encoded = X_full.assign(**{
    'Make': le_make.fit_transform(X_full['Make']),
    'Electric Vehicle Type': le_type.fit_transform(X_full['Electric Vehicle Type']),
    'Model': le_model.fit_transform(X_full['Model']),
})

# 80/20 split with the same seed as before
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_full_encoded, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest
model_f = RandomForestRegressor(n_estimators=100, random_state=42)
model_f.fit(X_train_f, y_train_f)

# R2 and MAE on the held-out rows
y_pred_f = model_f.predict(X_test_f)
r2_f = r2_score(y_test_f, y_pred_f)
mae_f = mean_absolute_error(y_test_f, y_pred_f)

print(f"R2 Score with Model: {r2_f}")
print(f"MAE: {mae_f}")

In [None]:
import pandas as pd

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Leading rows first, then the schema
print(df.head())
print(df.info())

In [None]:
# All column names as a plain list
column_names = df.columns.tolist()
print(column_names)

In [None]:
# Echo the first five raw lines of the CSV (each keeps its own trailing newline)
with open('Electric_Vehicle_Population_Data.csv', mode='r') as f:
    for _ in range(5):
        raw_line = f.readline()
        print(raw_line)

In [None]:
# Summary of every column, numeric or not, to look for values resembling an MSRP
full_summary = df.describe(include='all')
print(full_summary)

In [None]:
# Column names that contain 'MSRP', ignoring case
msrp_cols = list(filter(lambda name: 'MSRP' in name.upper(), df.columns))
print(f"MSRP columns found: {msrp_cols}")

In [None]:
import pandas as pd

# Fresh load, then schema, leading rows and summary statistics
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")
print(df.info())
print(df.head())
print(df.describe())

In [None]:
# Split the rows into zero-range and positive-range
is_zero_range = df['Electric Range'] == 0
zero_range_count = is_zero_range.sum()
non_zero_range_count = (df['Electric Range'] > 0).sum()
print(f"Zero Range: {zero_range_count}")
print(f"Non-Zero Range: {non_zero_range_count}")
# CAFV eligibility labels found on the zero-range rows
zero_range_eligibility = df.loc[is_zero_range, 'Clean Alternative Fuel Vehicle (CAFV) Eligibility']
print(zero_range_eligibility.value_counts())

In [None]:
import pandas as pd

# Fresh load of the raw registrations
df = pd.read_csv("Electric_Vehicle_Population_Data.csv")

# Schema and leading rows
print(df.info())
print(df.head())
# Summary statistics for Electric Range
range_summary = df['Electric Range'].describe()
print(range_summary)

In [None]:
# Share of rows whose range is exactly zero
total_count = len(df)
zero_range_count = df['Electric Range'].eq(0).sum()
print(f"Total entries: {total_count}")
print(f"Entries with 0 range: {zero_range_count} ({zero_range_count/total_count:.2%})")

# Rows with a positive range
non_zero_range = df.loc[df['Electric Range'] > 0]
print(f"Entries with range > 0: {len(non_zero_range)}")

# Mean range per model year (zeros included)
mean_range_by_year = df.groupby('Model Year')['Electric Range'].mean()
print(mean_range_by_year)