<a href="https://colab.research.google.com/github/arssite/Datalysis/blob/main/EV_RMSE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from zipfile import ZipFile
import pandas as pd

# Unzip and load the files
with ZipFile('/content/Dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

# Load datasets
train_df = pd.read_csv('/content/Dataset/Train.csv')
test_df = pd.read_csv('/content/Dataset/Test.csv')
submission_df = pd.read_csv('/content/Dataset/submission.csv')


In [5]:
# Basic EDA
print(train_df.head())
print(train_df.info())
print(train_df.describe())
print(train_df.isnull().sum())

# Drop columns not useful for modeling (like IDs or names)
useless_columns = ['ID', 'Model_Name'] if 'Model_Name' in train_df.columns else ['ID']
train_df.drop(columns=useless_columns, inplace=True, errors='ignore')
test_df.drop(columns=useless_columns, inplace=True, errors='ignore')

# Impute missing values
train_df.fillna(train_df.median(numeric_only=True), inplace=True)
test_df.fillna(test_df.median(numeric_only=True), inplace=True)

# Encode categorical features
cat_cols = train_df.select_dtypes(include='object').columns
train_df = pd.get_dummies(train_df, columns=cat_cols)
test_df = pd.get_dummies(test_df, columns=cat_cols)

# Align train and test
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)


   Unnamed: 0  VIN (1-10)     County         City State  Postal Code  \
0           0  5YJSA1DN0C   Thurston      Olympia    WA      98502.0   
1           1  WBY1Z6C30H       King     Bellevue    WA      98004.0   
2           2  WBY7Z6C52J       King         Kent    WA      98031.0   
3           3  WBY1Z2C58F       King  Woodinville    WA      98072.0   
4           4  5YJSA1E45G  Snohomish      Bothell    WA      98012.0   

   Model Year   Make    Model           Electric Vehicle Type  \
0        2012  TESLA  MODEL S  Battery Electric Vehicle (BEV)   
1        2017    BMW       I3  Battery Electric Vehicle (BEV)   
2        2018    BMW       I3  Battery Electric Vehicle (BEV)   
3        2015    BMW       I3  Battery Electric Vehicle (BEV)   
4        2016  TESLA  MODEL S  Battery Electric Vehicle (BEV)   

  Clean Alternative Fuel Vehicle (CAFV) Eligibility  Base MSRP  \
0           Clean Alternative Fuel Vehicle Eligible    59900.0   
1           Clean Alternative Fuel Vehicle E

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Define features and target
X = train_df.drop('Electric_Range', axis=1)
y = train_df['Electric_Range']

# Split for local validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest as baseline
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")


Validation RMSE: 1.1415062377623595


In [7]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}

search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)
search.fit(X, y)
best_model = search.best_estimator_


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [9]:
# Use the best model for prediction
# Drop the target column from the test set if it exists
if 'Electric_Range' in test_df.columns:
    test_df = test_df.drop(columns=['Electric_Range'])

final_preds = best_model.predict(test_df)

# Create submission
submission_df['Electric_Range'] = final_preds
submission_df.to_csv('final_submission.csv', index=False)