In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import TruncatedSVD
import joblib

In [2]:
# Read the two CSV inputs used by this notebook.
# `train` holds selected_features.csv (its 'Selected_Features' column is read
# further down); `test` holds the rows that are preprocessed and scored.
# NOTE(review): both paths are absolute and machine-specific, so the notebook
# only runs where these files exist at exactly these locations.
selected_features_path = r"C:\Users\HP\Desktop\vechiles\selected_features.csv"
test_path = r"C:\Users\HP\Desktop\vechiles\test.csv"
train = pd.read_csv(selected_features_path)
test = pd.read_csv(test_path)


In [3]:
# Load the saved per-column fill values (a dict keyed by column name, as the
# printed output shows). They are passed to fill_nulls() in the next cell.
imputer_values = joblib.load('imputer_values.pkl')
print(imputer_values)

{'name': '2023 Dodge Durango Pursuit', 'description': 'nan', 'make': 'Jeep', 'model': 'Hornet', 'year': 2024, 'price': 50729.68974358974, 'engine': '16V GDI DOHC Turbo', 'cylinders': 4.0, 'fuel': 'Gasoline', 'mileage': 8.0, 'transmission': '8-Speed Automatic', 'trim': 'Limited', 'body': 'SUV', 'doors': 4.0, 'exterior_color': 'Bright White Clearcoat', 'interior_color': 'Black', 'drivetrain': 'All-wheel Drive'}


In [4]:
def fill_nulls(df, default_values):
    """Fill missing values in ``df`` with per-column defaults.

    Every column of ``df`` that has an entry in ``default_values`` gets its
    missing values replaced by that entry. Columns without an entry are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is updated in place (columns are reassigned on
        ``df`` itself).
    default_values : mapping
        Maps column name -> value used to replace missing entries.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the result can be assigned or chained.
        Previously the function returned ``None``, which made
        ``result = fill_nulls(...)`` silently bind ``None``.
    """
    for col in df.columns:
        if col in default_values:
            df[col] = df[col].fillna(default_values[col])
    return df
    
# fill_nulls() writes the filled columns back onto `test` itself, so the later
# cells keep working with `test`.
# NOTE(review): `test1` is not referenced again in the cells shown here.
test1 = fill_nulls(df=test, default_values=imputer_values)

In [5]:
# Load the saved mileage outlier statistics. The next cell reads the keys
# 'lower_bound', 'upper_bound' and 'median' from this dict.
stats = joblib.load('mileage_outlier_stats.pkl')
stats

{'lower_bound': -8.0, 'upper_bound': 24.0, 'median': 8.0}

In [6]:
def _replace_mileage_outlier(value):
    """Return the saved median when ``value`` lies outside the saved bounds."""
    if value < stats['lower_bound'] or value > stats['upper_bound']:
        return stats['median']
    # In-range values pass through unchanged; NaN does too, because both
    # comparisons are False for it.
    return value


test['mileage'] = test['mileage'].apply(_replace_mileage_outlier)

In [7]:
# Load the saved cylinder-count outlier statistics. The next cell reads the
# keys 'lower_bound', 'upper_bound' and 'median' from this dict.
stat = joblib.load('cylinders_outlier_stats.pkl')
stat

{'lower_bound': 1.0, 'upper_bound': 9.0, 'median': 4.0}

In [8]:
def _replace_cylinders_outlier(value):
    """Return the saved median when ``value`` lies outside the saved bounds."""
    if value < stat['lower_bound'] or value > stat['upper_bound']:
        return stat['median']
    # In-range values pass through unchanged; NaN does too, because both
    # comparisons are False for it.
    return value


test['cylinders'] = test['cylinders'].apply(_replace_cylinders_outlier)


In [9]:
def interior_colors(color):
    """Collapse a free-text interior color into a coarse color label.

    The input is converted with ``str()`` and lower-cased, then checked
    against an ordered list of keywords using substring matching. The first
    keyword that matches decides the label, so e.g. "dark blue" becomes
    'Dark' because 'dark' is checked before 'blue'. If nothing matches, the
    lower-cased text is returned title-cased.

    Parameters
    ----------
    color : any
        Raw color value; non-strings are stringified (``None`` -> 'None').

    Returns
    -------
    str
        The normalized color label.
    """
    text = str(color).lower()

    # (keywords, label) pairs in priority order -- do not reorder.
    rules = (
        (('black',), 'Black'),
        (('brown',), 'Brown'),
        (('gray', 'grey'), 'Gray'),
        (('sandstone',), 'Sandstone'),
        (('java',), 'Java'),
        (('palazzo',), 'Palazzo'),
        (('volcano',), 'Volcano'),
        (('ebony',), 'Ebony'),
        (('red',), 'Red'),
        (('dark',), 'Dark'),
        (('blue',), 'Blue'),
        (('premium',), 'Premium'),
    )
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            return label

    # No known keyword: keep the value, just title-cased.
    return text.title()

# Normalize every interior color value element-wise with interior_colors().
test['interior_color'] = test['interior_color'].map(interior_colors)

In [10]:
def exterior_colors(color):
    """Collapse a free-text exterior color into a coarse color label.

    The input is converted with ``str()`` and lower-cased, then checked
    against an ordered list of keywords using substring matching. The first
    keyword that matches decides the label, so e.g. "silver/white" becomes
    'White' because 'white' is checked before 'silver'. If nothing matches,
    the lower-cased text is returned title-cased.

    Parameters
    ----------
    color : any
        Raw color value; non-strings are stringified (``None`` -> 'None').

    Returns
    -------
    str
        The normalized color label.
    """
    text = str(color).lower()

    # (keywords, label) pairs in priority order -- do not reorder.
    rules = (
        (('black',), 'Black'),
        (('brown',), 'Brown'),
        (('gray', 'grey'), 'Gray'),
        (('sandstone',), 'Sandstone'),
        (('java',), 'Java'),
        (('palazzo',), 'Palazzo'),
        (('volcano',), 'Volcano'),
        (('ebony',), 'Ebony'),
        (('red',), 'Red'),
        (('dark',), 'Dark'),
        (('blue',), 'Blue'),
        (('premium',), 'Premium'),
        (('white',), 'White'),
        (('silver',), 'Silver'),
        (('yellow',), 'Yellow'),
        (('bronze',), 'Bronze'),
        (('green',), 'Green'),
        (('metallic',), 'Metallic'),
    )
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            return label

    # No known keyword: keep the value, just title-cased.
    return text.title()

# Normalize every exterior color value element-wise with exterior_colors().
test['exterior_color'] = test['exterior_color'].map(exterior_colors)

In [11]:
def model(model):
    """Collapse a vehicle model name into its base model label.

    The input is converted with ``str()`` and lower-cased. Names containing
    'grand cherokee', 'wagoneer' or 'wrangler' (checked in that order) are
    mapped to the matching base label; any other name is returned
    title-cased.

    Parameters
    ----------
    model : any
        Raw model value; non-strings are stringified.

    Returns
    -------
    str
        The normalized model label.
    """
    text = str(model).lower()

    # (keyword, label) pairs in priority order.
    families = (
        ('grand cherokee', 'Grand Cherokee'),
        ('wagoneer', 'Wagoneer'),
        ('wrangler', 'Wrangler'),
    )
    for keyword, label in families:
        if keyword in text:
            return label

    # Unknown family: keep the name, just title-cased.
    return text.title()

# Normalize every model name element-wise with model().
test['model'] = test['model'].map(model)

In [12]:
def engine(engine):
    """Collapse a free-text engine description into a coarse engine label.

    The input is converted with ``str()`` and lower-cased, then checked for
    'turbo', 'sohc', 'dohc' and 'ohv' in that order; the first match decides
    the label (so a "DOHC Turbo" engine is labelled 'Turbo'). Any other text
    is returned title-cased.

    Parameters
    ----------
    engine : any
        Raw engine value; non-strings are stringified.

    Returns
    -------
    str
        The normalized engine label.
    """
    text = str(engine).lower()

    # (keyword, label) pairs in priority order. The 'Ohv' spelling is kept
    # exactly: the notebook's recorded output has an 'engine_Ohv' column.
    kinds = (
        ('turbo', 'Turbo'),
        ('sohc', 'SOHC'),
        ('dohc', 'DOHC'),
        ('ohv', 'Ohv'),
    )
    for keyword, label in kinds:
        if keyword in text:
            return label

    # Unknown engine type: keep the text, just title-cased.
    return text.title()

# Normalize every engine description element-wise with engine().
test['engine'] = test['engine'].map(engine)

In [13]:
def transmission_type(trans):
    """Collapse a free-text transmission description into a coarse type.

    The input is converted with ``str()``, lower-cased and stripped, then
    matched by substring in this priority order:

    1. 'battery' / 'kwh'                    -> 'Automatic'
    2. 'cvt' / 'variable'                   -> 'CVT'
    3. 'dual clutch'                        -> 'Dual-Clutch'
    4. 'manual'                             -> 'Manual'
    5. 'automatic' / 'a/t' / 'auto trans'   -> 'Automatic'

    Anything else is returned title-cased.

    Parameters
    ----------
    trans : any
        Raw transmission value; non-strings are stringified.

    Returns
    -------
    str
        The normalized transmission label.
    """
    text = str(trans).lower().strip()

    def mentions(*keywords):
        """True when any of ``keywords`` occurs in the cleaned text."""
        return any(keyword in text for keyword in keywords)

    # Guard clauses in priority order; the first hit wins.
    if mentions('battery', 'kwh'):
        return 'Automatic'
    if mentions('cvt', 'variable'):
        return 'CVT'
    if mentions('dual clutch'):
        return 'Dual-Clutch'
    if mentions('manual'):
        return 'Manual'
    if mentions('automatic', 'a/t', 'auto trans'):
        return 'Automatic'

    # Unknown type: keep the text, just title-cased.
    return text.title()

# Normalize every transmission description element-wise with transmission_type().
test['transmission'] = test['transmission'].map(transmission_type)


In [14]:
# Replace each mapped categorical column with the value stored for it in the
# saved mappings. Categories missing from a mapping become NaN after .map()
# and are then set to 0.
freq_mappings = joblib.load('freq_mappings.pkl')
for col in freq_mappings:
    if col not in test.columns:
        # Mapping exists for a column this frame does not have; skip it.
        continue
    encoded = test[col].map(freq_mappings[col])
    test[col] = encoded.fillna(0)

In [15]:
# Show which columns have a saved mapping (the keys of the loaded dict).
freq_mappings.keys()

dict_keys(['make', 'model', 'trim', 'exterior_color', 'interior_color'])

In [16]:
# Restore the saved one-hot encoder and encode the low-cardinality
# categorical columns of `test` with it.
ohe = joblib.load('onehot_encoder.pkl')
low_card_cat_features = ['engine', 'fuel', 'transmission', 'body', 'drivetrain']

# NOTE(review): the transform result is handed straight to pd.DataFrame, so
# this assumes the encoder returns a dense array -- confirm how it was fitted.
encoded_values = ohe.transform(test[low_card_cat_features])
encoded_names = ohe.get_feature_names_out(low_card_cat_features)
low_card_encoded_test = pd.DataFrame(
    encoded_values,
    columns=encoded_names,
    index=test.index,  # keep the row index so the join below lines up
)

# Swap the raw categorical columns for their encoded counterparts.
test = test.drop(columns=low_card_cat_features).join(low_card_encoded_test)

In [17]:
# Remove the 'description' and 'name' columns in a single drop.
test = test.drop(columns=['description', 'name'])

In [18]:
# Restore the saved scaler.
scaler = joblib.load('scaler.pkl')

# 'year' and 'price' are left unscaled; every other column is transformed.
# NOTE(review): Index.difference returns the remaining labels sorted by
# default, so this presumably relies on the scaler having been fitted on the
# columns in that same order -- confirm against the training notebook.
exclude_cols = ['year', 'price']
cols_to_scale = test.columns.difference(exclude_cols)

# Transform with the already-fitted scaler and write the result back.
scaled_values = scaler.transform(test[cols_to_scale])
test[cols_to_scale] = scaled_values

In [19]:
# Feature names listed in the 'Selected_Features' column of `train`, followed
# by the target column 'price'.
selected_columns = [*train['Selected_Features'].tolist(), 'price']


In [20]:
# Keep only the selected feature columns plus 'price', in that order.
test = test.loc[:, selected_columns]

In [21]:
# Display the column labels that remain after the selection above.
test.columns

Index(['make', 'model', 'cylinders', 'mileage', 'trim', 'exterior_color',
       'interior_color', 'engine_DOHC', 'engine_Ohv', 'engine_Turbo',
       'fuel_Diesel', 'fuel_Gasoline', 'drivetrain_All-wheel Drive',
       'drivetrain_Four-wheel Drive', 'drivetrain_Front-wheel Drive', 'price'],
      dtype='object')

In [22]:
import pickle

# Split the prepared frame into the target and the feature matrix.
y = test['price']
X = test.drop(columns='price')

# Load the saved feature transformer and the regression model trained on its
# output. NOTE(review): pickle.load executes code from the file -- only load
# artifacts from a trusted source.
with open('poly.pkl', 'rb') as poly_file:
    poly = pickle.load(poly_file)
with open('poly_regression_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Expand the features with the loaded transformer, then predict and score.
X = poly.transform(X)
y_pred = loaded_model.predict(X)
r2 = r2_score(y_true=y, y_pred=y_pred)

# NOTE(review): the recorded run of this cell printed an R² of about -3.3e16.
# The cause is not visible in this notebook -- investigate before relying on
# this model.
print(f"R² Score: {r2:.4f}")

R² Score: -32715922636736532.0000




In [23]:
import pickle

# Split the prepared frame into the target and the feature matrix.
y = test['price']
X = test.drop(columns='price')

# Load the saved tree regression model.
# NOTE(review): pickle.load executes code from the file -- only load
# artifacts from a trusted source.
with open('tree_regression_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict on the prepared features and score against the actual prices.
y_pred = loaded_model.predict(X)
r2 = r2_score(y_true=y, y_pred=y_pred)

print(f"R² Score: {r2:.4f}")

R² Score: 0.5804




In [24]:
import pickle

# Split the prepared frame into the target and the feature matrix.
y = test['price']
X = test.drop(columns='price')

# Load the saved random forest model.
# NOTE(review): pickle.load executes code from the file -- only load
# artifacts from a trusted source.
with open('random_forest_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict on the prepared features and score against the actual prices.
y_pred = loaded_model.predict(X)
r2 = r2_score(y_true=y, y_pred=y_pred)

print(f"R² Score: {r2:.4f}")

R² Score: 0.7621




In [25]:
import pickle

# Split the prepared frame into the target and the feature matrix.
y = test['price']
X = test.drop(columns='price')

# Load the saved gradient boosting model.
# NOTE(review): pickle.load executes code from the file -- only load
# artifacts from a trusted source.
with open('gradient_boosting_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict on the prepared features and score against the actual prices.
y_pred = loaded_model.predict(X)
r2 = r2_score(y_true=y, y_pred=y_pred)

print(f"R² Score: {r2:.4f}")

R² Score: 0.8013


