In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import zipfile
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# 1) Load
# The dataset is stored as a zip archive; the CSV member is read straight out
# of the archive without extracting it to disk.
print("Current folder:", os.getcwd())
print("Files here:", os.listdir())

# Raw string literal: in a normal string "\E" and "\d" are invalid escape
# sequences (a SyntaxWarning on recent Python versions). The location can be
# overridden with the EV_DATA_ZIP environment variable so the notebook is not
# tied to a single machine; the default is the original path.
zip_path = os.environ.get(
    "EV_DATA_ZIP",
    r"E:\EV_Price_Prediction\data\Electric_Vehicle_Population_Data.zip",
)
csv_filename = "Electric_Vehicle_Population_Data.csv"

# Both context managers close their handles even if parsing fails.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    with zip_ref.open(csv_filename) as f:
        df = pd.read_csv(f)

print("Data loaded successfully")
# A bare `df.head()` that is not the last expression of a cell renders nothing,
# so the preview is shown explicitly.
display(df.head())
# 2) Clean target - Base MSRP -> numeric
# Strip "$" and thousands separators from any non-numeric column before parsing.
# Testing for "not numeric" (instead of dtype == 'O') also covers pandas'
# dedicated string dtypes, where currency-formatted text would otherwise be
# coerced to NaN below and the rows silently dropped.
if not pd.api.types.is_numeric_dtype(df['Base MSRP']):
    df['Base MSRP'] = df['Base MSRP'].astype(str).str.replace(r'[\$,]', '', regex=True)
# Anything that still cannot be parsed becomes NaN and its row is removed.
df['Base MSRP'] = pd.to_numeric(df['Base MSRP'], errors='coerce')
df = df.dropna(subset=['Base MSRP']).reset_index(drop=True)
# NOTE(review): rows whose MSRP parses to 0 are kept as valid targets — confirm
# whether 0 means "price unknown" in this dataset and should be excluded.

# 3) Explicit features (the 10 features your pipeline & app expect)
# Order matters: it is the column order of X further down.
features = (
    ["County", "City", "State", "Postal Code"]
    + ["Model Year", "Make", "Model"]
    + [
        "Electric Vehicle Type",
        "Clean Alternative Fuel Vehicle (CAFV) Eligibility",
        "Electric Range",
    ]
)

# Fail fast, naming every expected column the loaded frame does not have.
available = set(df.columns)
missing = [col for col in features if col not in available]
if len(missing) > 0:
    raise RuntimeError(f"Missing expected columns in CSV: {missing}")

# 4) Force appropriate dtypes
cat_cols = ["County","City","State","Postal Code","Make","Model","Electric Vehicle Type",
            "Clean Alternative Fuel Vehicle (CAFV) Eligibility"]

# Postal codes are labels, not quantities. If pandas parsed the column as float
# (it does so when an integer column contains missing values), a plain
# astype(str) would yield "98101.0"; go through a nullable integer first so the
# category reads "98101".
postal = df['Postal Code']
if pd.api.types.is_float_dtype(postal) and (postal.dropna() % 1 == 0).all():
    df['Postal Code'] = postal.astype('Int64')

# Normalise every categorical column to stripped strings while keeping missing
# values missing: astype(str) alone turns NaN into the literal category "nan",
# which would stop the pipeline's constant 'Unknown' imputer from ever firing.
for c in cat_cols:
    present = df[c].notna()
    df[c] = df[c].astype(str).str.strip().where(present)

# Numeric predictors: unparseable entries become NaN ...
for num_col in ['Model Year', 'Electric Range']:
    df[num_col] = pd.to_numeric(df[num_col], errors='coerce')

# ... and rows missing either numeric predictor are dropped.
df = df.dropna(subset=['Model Year','Electric Range']).reset_index(drop=True)

# 5) Quick sanity checks
cleaned_shape = df.shape
print("Dataset shape after cleaning:", cleaned_shape)
target_summary = df['Base MSRP'].describe()
print("Target stats:", target_summary)

# 6) Prepare X, y and train-test split
# Independent copies, so later edits to df cannot leak into the model inputs.
X = df.loc[:, features].copy()
y = df.loc[:, 'Base MSRP'].copy()

# 80/20 hold-out with a fixed seed for a repeatable split.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# 7) Preprocessing: numeric and categorical explicit lists
numeric_features = ['Model Year', 'Electric Range']
# Everything in `features` that is not numeric is treated as categorical.
categorical_features = [c for c in features if c not in numeric_features]

# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become the explicit category 'Unknown';
# categories unseen during fit are ignored by the encoder instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Columns not listed in either group are dropped.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
], remainder='drop')

# 8) Baseline models (quick)
# Untuned estimators, keyed by the label used in the printed report.
models = dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(random_state=42, n_jobs=-1),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
)

for name, estimator in models.items():
    # Preprocessing and model are fitted together on the training split only.
    baseline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', estimator)])
    baseline.fit(X_train, y_train)
    baseline_preds = baseline.predict(X_test)

    mae = mean_absolute_error(y_test, baseline_preds)
    rmse = np.sqrt(mean_squared_error(y_test, baseline_preds))
    r2 = r2_score(y_test, baseline_preds)
    print(f"\n{name} -> MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

# 9) Hyperparameter tuning: RandomForest and GradientBoosting
# Settings shared by both searches: 3-fold CV, scored on R^2, all cores.
search_settings = dict(cv=3, scoring='r2', n_jobs=-1, verbose=1)

# Parameter grids; the "regressor__" prefix addresses the pipeline's final step.
rf_params = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[10, 20, None],
    regressor__min_samples_split=[2, 5],
    regressor__min_samples_leaf=[1, 2],
)
gb_params = dict(
    regressor__n_estimators=[100, 200],
    regressor__learning_rate=[0.05, 0.1],
    regressor__max_depth=[3, 5],
    regressor__subsample=[0.8, 1.0],
)

rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1)),
])
gb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

print("\nStarting GridSearchCV for RandomForest...")
rf_grid = GridSearchCV(rf_pipe, rf_params, **search_settings)
rf_grid.fit(X_train, y_train)
print("RF best:", rf_grid.best_params_, "best_cv_r2:", rf_grid.best_score_)

print("\nStarting GridSearchCV for GradientBoosting...")
gb_grid = GridSearchCV(gb_pipe, gb_params, **search_settings)
gb_grid.fit(X_train, y_train)
print("GB best:", gb_grid.best_params_, "best_cv_r2:", gb_grid.best_score_)

# 10) Choose best by CV score then evaluate on test
candidates = {
    'rf': rf_grid,
    'gb': gb_grid
}

# Start from -inf rather than an arbitrary sentinel such as -999: R^2 has no
# lower bound, so a fixed sentinel could reject every candidate.
best_cv_name, best_cv_score, best_estimator = None, float('-inf'), None
for k, g in candidates.items():
    # Strict '>' keeps the earlier candidate on a tie; NaN scores never win.
    if g.best_score_ > best_cv_score:
        best_cv_score = g.best_score_
        best_cv_name = k
        best_estimator = g.best_estimator_

# Fail here with a clear message instead of later on `None.predict(...)`.
if best_estimator is None:
    raise RuntimeError("No candidate produced a usable CV score (all scores are NaN or -inf).")

print(f"\nBest by CV: {best_cv_name} (cv_r2={best_cv_score:.4f})")
best_pipe = best_estimator

# 11) Test evaluation and sanity-check predictions
preds_test = best_pipe.predict(X_test)
test_mae = mean_absolute_error(y_test, preds_test)
test_rmse = np.sqrt(mean_squared_error(y_test, preds_test))
test_r2 = r2_score(y_test, preds_test)
print("Final TEST metrics -> MAE: {:.2f}, RMSE: {:.2f}, R2: {:.4f}".format(test_mae, test_rmse, test_r2))

# Print a few predictions so a degenerate (e.g. all-zero) model is easy to spot.
sample_preds = best_pipe.predict(X_test.head(10))
print("Sample predictions (first 10):", sample_preds)

# 12) Persist the fitted pipeline
# Nothing in the code gates this step: inspect the metrics and sample
# predictions printed above before relying on the saved file.
model_dir = Path("model")
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(best_pipe, "model/ev_price_model.pkl")
print("Saved trained pipeline to model/ev_price_model.pkl")

# 13) Extra: show top feature importances if available
final_regressor = best_pipe.named_steps['regressor']
importances = getattr(final_regressor, 'feature_importances_', None)
if importances is not None:
    # Column order after preprocessing: numeric features first, then the
    # one-hot columns produced by the categorical branch.
    cat_branch = best_pipe.named_steps['preprocessor'].named_transformers_['cat']
    onehot = cat_branch.named_steps['onehot']
    cat_names = onehot.get_feature_names_out(categorical_features)
    feature_names = np.concatenate([numeric_features, cat_names])

    # Keep the 20 most important features, largest first.
    top_importances = (
        pd.DataFrame({'feature': feature_names, 'importance': importances})
        .sort_values('importance', ascending=False)
        .head(20)
    )
    display(top_importances)
