In [1]:
import pandas as pd
import numpy as np

# Load the dataset
# Read the raw charging-station CSV (relative path, resolved against the working directory)
df = pd.read_csv(filepath_or_buffer='ev-charging-stations-india.csv')

In [4]:
# Step 1.1: Handle missing values
# Report the number of null entries in each column
print("Missing values before cleaning:", df.isnull().sum(), sep="\n")

Missing values before cleaning:
name         1
state        1
city         1
address      0
lattitude    1
longitude    0
dtype: int64


In [3]:
# Fill missing addresses with a placeholder
# The dict form of fillna restricts the fill to the 'address' column only
df = df.fillna({'address': 'Unknown Address'})

In [5]:

# Remove rows with missing latitude or longitude
# A row is kept only when both coordinate cells are non-null
df = df.loc[df[['lattitude', 'longitude']].notna().all(axis=1)]

In [6]:

# Step 1.2: Correct data types
# Convert both coordinate columns to a numeric dtype; with errors='coerce',
# values that cannot be parsed become NaN instead of raising
df = df.assign(
    lattitude=pd.to_numeric(df['lattitude'], errors='coerce'),
    longitude=pd.to_numeric(df['longitude'], errors='coerce'),
)

# Step 1.3: Validate latitude and longitude
# Keep rows with latitude in [-90, 90] and longitude in [-180, 180], bounds inclusive.
# Comparisons against NaN evaluate to False, so rows with NaN coordinates are dropped too.
df = df.loc[
    df['lattitude'].ge(-90) & df['lattitude'].le(90)
    & df['longitude'].ge(-180) & df['longitude'].le(180)
]

# Step 1.4: Remove duplicates based on name, latitude, and longitude
# duplicated() flags every repeat after the first occurrence; keep the unflagged rows
df = df.loc[~df.duplicated(subset=['name', 'lattitude', 'longitude'], keep='first')]

In [7]:

# Step 1.5: Standardize state and city names (convert to title case)
# Apply the same transform to both columns: title-case, then trim surrounding whitespace
df = df.assign(**{
    place_col: df[place_col].str.title().str.strip()
    for place_col in ('state', 'city')
})

# Step 1.6: Save cleaned dataset
# Write the frame to CSV without the index column
df.to_csv(path_or_buf='cleaned_ev_charging_stations.csv', index=False)

# Confirm the save, then show the remaining null counts and the final shape
print(
    "Cleaned dataset saved as 'cleaned_ev_charging_stations.csv'",
    "Missing values after cleaning:",
    df.isnull().sum(),
    sep="\n",
)
print("Dataset shape:", df.shape)

Cleaned dataset saved as 'cleaned_ev_charging_stations.csv'
Missing values after cleaning:
name         0
state        0
city         0
address      0
lattitude    0
longitude    0
dtype: int64
Dataset shape: (1220, 6)
