In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# Steps 2 & 3: read the source CSV from disk into a DataFrame.
# The location below is machine-specific; edit it to point at your copy of the file.
file_path = "D:\\dml\\Project\\Electric_Vehicle_Population_Data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Discard rows that exactly repeat an earlier row (the first occurrence is kept).
df = df[~df.duplicated(keep="first")]

In [4]:
# Impute gaps in the numeric columns with each column's own median.
# The median is used rather than the mean because it is less affected by extreme values.
numerical_cols = [
    'Postal Code',
    'Electric Range',
    'Base MSRP',
    'Legislative District',
    '2020 Census Tract',
]
for col in numerical_cols:
    col_median = df[col].median()
    df[col] = df[col].fillna(col_median)

In [5]:
# Upper-case the Make and Model columns so values that differ only by letter case compare equal.
for text_col in ('Make', 'Model'):
    df[text_col] = df[text_col].str.upper()

In [6]:
def remove_outliers_iqr(data, column, multiplier=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where ``Q1``/``Q3`` are the 25th/75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Name of the numeric column the fences are computed on.
    multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value may lie and still be kept.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index labels are
        kept). Returning a copy, rather than the bare boolean-indexed slice,
        lets callers assign into the result without triggering
        ``SettingWithCopyWarning``.

    Notes
    -----
    * Rows where ``column`` is NaN are dropped, because NaN fails both
      comparisons.
    * If the IQR is 0 (more than half of the values are identical), both
      fences collapse onto that value and every other row is removed.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    lower_bound = q1 - multiplier * spread
    upper_bound = q3 + multiplier * spread
    within_fences = (values >= lower_bound) & (values <= upper_bound)
    return data[within_fences].copy()

In [7]:
# Run the IQR outlier filter once per numeric column.
# The passes are sequential: each column's quartiles are computed on the rows
# that survived the previous columns' filters.
numerical_cols = [
    'Postal Code',
    'Electric Range',
    'Base MSRP',
    'Legislative District',
    '2020 Census Tract',
]
for col in numerical_cols:
    df = remove_outliers_iqr(data=df, column=col)

In [8]:
# Reduce right skew: any numeric column whose sample skewness exceeds 1 is
# replaced by log(1 + x). Columns with skewness <= 1 (or undefined) are left as-is.
right_skewed = [name for name in numerical_cols if df[name].skew() > 1]
for col in right_skewed:
    df[col] = np.log1p(df[col])

In [17]:
# After filtering/transforming, a numeric column may be left with a single
# distinct value; such a column carries no signal, so drop it from both the
# frame and the working column list.
constant_features = []
for col in numerical_cols:
    if df[col].nunique() <= 1:
        constant_features.append(col)

df = df.drop(constant_features, axis="columns")
numerical_cols = [name for name in numerical_cols if name not in constant_features]

In [19]:
# Inputs for SelectKBest (classification):
#   X - the remaining numeric feature columns
#   y - the class label to predict
X = df.loc[:, numerical_cols]
y = df.loc[:, 'Electric Vehicle Type']

In [25]:
# Rebuild the feature matrix from the current numeric column list, then keep
# at most three columns, ranked by f_classif score against y.
X = df[numerical_cols]

n_features = X.shape[1]
if n_features == 0:
    # No numeric columns survived: nothing to rank, pass the matrix through unchanged.
    X_new = X.values
    selected_features = numerical_cols
else:
    selector = SelectKBest(score_func=f_classif, k=min(3, n_features))
    X_new = selector.fit_transform(X, y)
    # get_support() is a boolean mask aligned with the columns of X.
    kept_mask = selector.get_support()
    selected_features = [name for name, kept in zip(numerical_cols, kept_mask) if kept]
    print("Selected Features:", selected_features)

Selected Features: ['Postal Code', 'Electric Range', 'Legislative District']


In [27]:
# Standardize the selected features (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row of X_new, i.e. before any
# train/test split has been made.
scaler = StandardScaler()
scaler.fit(X_new)
X_scaled = scaler.transform(X_new)

In [29]:
# Train-test split (80/20, stratified on the class label).
#
# The split is made on the *unscaled* selected features and the scaler is then
# fit on the training partition only. Splitting an array that was standardized
# with statistics from all rows would let test-set mean/variance leak into the
# training features. Row assignment is unchanged from splitting the pre-scaled
# array: it depends only on y, the row count and random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler().fit(X_train)  # statistics come from training rows only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)       # reuse the training statistics on held-out rows

In [31]:
# Report the dimensions of each partition as a quick sanity check on the split.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

X_train shape: (143428, 3)
X_test shape: (35857, 3)
y_train shape: (143428,)
y_test shape: (35857,)


In [35]:
# Persist the cleaned frame to CSV; the row index is not written out.
# The destination is machine-specific, so adjust it for your environment.
cleaned_file_path = "D:\\dml\\Project\\Cleaned_Electric_Vehicle_Data1.csv"
df.to_csv(path_or_buf=cleaned_file_path, index=False)

print("Data cleaning completed. Cleaned file saved at:", cleaned_file_path)

Data cleaning completed. Cleaned file saved at: D:\dml\Project\Cleaned_Electric_Vehicle_Data1.csv
