In [306]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [307]:
filepath = '../data/data.csv'

# Load data
dt = pd.read_csv(filepath)

print(dt.columns)
print(dt.shape)
dt.head()


Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')
(11914, 16)


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [308]:

for row in dt['Market Category']:
    if (type(row) == type(1.1254)):
        row = 'Unknown'


In [309]:
dt.describe()

Unnamed: 0,Year,Engine HP,Engine Cylinders,Number of Doors,highway MPG,city mpg,Popularity,MSRP
count,11914.0,11845.0,11884.0,11908.0,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,249.38607,5.628829,3.436093,26.637485,19.733255,1554.911197,40594.74
std,7.57974,109.19187,1.780559,0.881315,8.863001,8.987798,1441.855347,60109.1
min,1990.0,55.0,0.0,2.0,12.0,7.0,2.0,2000.0
25%,2007.0,170.0,4.0,2.0,22.0,16.0,549.0,21000.0
50%,2015.0,227.0,6.0,4.0,26.0,18.0,1385.0,29995.0
75%,2016.0,300.0,6.0,4.0,30.0,22.0,2009.0,42231.25
max,2017.0,1001.0,16.0,4.0,354.0,137.0,5657.0,2065902.0


In [310]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity

dt['Age'] = 2025 - dt['Year']
dt['Price_per_HP'] = dt['MSRP'] / dt['Engine HP']
dt['Efficiency'] = ((dt['highway MPG'] + dt['city mpg']) / 2) / dt['Engine HP'] * 100


numerical_features = [
    'Year', 'Engine HP', 'Engine Cylinders', 'highway MPG', 'city mpg', 'Efficiency', 'Age', 'MSRP', 'Number of Doors', 'Price_per_HP'
]

categorical_features = [
    'Market Category', 'Vehicle Size', 'Vehicle Style', 'Transmission Type', 'Driven_Wheels', 'Engine Fuel Type'
]

# Create a new DataFrame with only the selected features
df_rec = dt[numerical_features + categorical_features].copy()


# Fill missing numerical values with the median
for col in numerical_features:
    df_rec[col] = pd.to_numeric(df_rec[col], errors='coerce')
    median_val = df_rec[col].median()
    df_rec[col].fillna(median_val, inplace=True)


# Fill missing categorical values with a placeholder
for col in categorical_features:
    df_rec[col].fillna('Unknown', inplace=True)
    
# Replace all string variants AND true NaN at once
df_rec['Market Category'] = (df_rec['Market Category']
    .replace(['', ' ' , 'NaN', 'N/A', None, np.nan], 'Unknown')         # Replace common string markers
    .fillna('Unknown')                                # Replace true np.nan
)


preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)


feature_matrix = preprocessor.fit_transform(df_rec)

print("Preprocessing complete.")
print("Shape of the final feature matrix:", feature_matrix.shape)


Preprocessing complete.
Shape of the final feature matrix: (11914, 121)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_rec[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_rec[col].fillna('Unknown', inplace=True)


In [311]:
from sklearn.metrics.pairwise import cosine_similarity
dt = dt.reset_index()
indices = pd.Series(dt.index, index=dt['Model'])


df_rec['Make'] = dt['Make']
df_rec['Model'] = dt['Model']


def get_recommendations(model_name, feature_matrix=feature_matrix, dataframe=df_rec, indices_map=indices):
    # Normalize model names in the indices map for case-insensitive and trimmed matching
    normalized_indices = pd.Series(indices_map.values, index=indices_map.index.str.strip().str.lower())

    # Normalize the input model name
    model_name_norm = model_name.strip().lower()

    try:
        # Get all indices of the car that matches the normalized model name.
        idx_list = normalized_indices[model_name_norm]
        # If multiple, pick the first one for similarity calculation
        if isinstance(idx_list, (pd.Series, np.ndarray, list)):
            idx = idx_list.iloc[0] if hasattr(idx_list, "iloc") else idx_list[0]
        else:
            idx = idx_list
    except KeyError:
        return f"Model '{model_name}' not found in the dataset."

    # Get the feature vector for the specific model
    model_vector = feature_matrix[idx]

    # Calculate the cosine similarity of our car against ALL other cars.
    sim_scores = cosine_similarity(model_vector, feature_matrix)

    # Get the similarity scores as a list Sof (index, score) tuples
    sim_scores = list(enumerate(sim_scores[0]))

    # Sort the cars based on the similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar cars, excluding the car itself (which is at index 0)
    sim_scores = sim_scores[1:]

    # Get the original indices of the top similar cars, avoiding duplicates by 'Make' and 'Model'
    seen = set()
    recommendations = []
    for i, score in sim_scores:
        make = dataframe.iloc[i]['Make']
        model = dataframe.iloc[i]['Model']
        key = (make, model)
        if key not in seen:
            seen.add(key)
            recommendations.append(i)
        if len(recommendations) == 10:
            break

    # Return the top 10 most similar cars
    return dataframe[['Make', 'Model', 'Year', 'Market Category', 'Transmission Type', 'Vehicle Size', 'Engine HP', 'Engine Fuel Type', 'Efficiency', 'Price_per_HP']].iloc[recommendations]



# --- Test the new function ---
print("\nCars similar to 'Prius':")
display(get_recommendations('Prius'))

print("\nCars similar to 'Camry':")
display(get_recommendations('Camry'))



Cars similar to 'Prius':


Unnamed: 0,Make,Model,Year,Market Category,Transmission Type,Vehicle Size,Engine HP,Engine Fuel Type,Efficiency,Price_per_HP
7695,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,36.940299,201.380597
7669,Toyota,Prius c,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,99.0,regular unleaded,50.0,197.373737
5916,Honda,Insight,2014,"Hatchback,Hybrid",AUTOMATIC,Compact,98.0,regular unleaded,43.367347,191.071429
7676,Toyota,Prius Prime,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,44.628099,223.966942
3175,Lexus,CT 200h,2017,"Hatchback,Luxury,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,30.970149,233.208955
6841,Mitsubishi,Mirage,2017,Hatchback,AUTOMATIC,Compact,78.0,regular unleaded,51.282051,211.474359
4720,Honda,Fit,2017,Hatchback,AUTOMATIC,Compact,130.0,regular unleaded,28.076923,129.153846
9880,Chevrolet,Spark,2016,Hatchback,AUTOMATIC,Compact,98.0,regular unleaded,36.734694,176.377551
11345,Nissan,Versa Note,2017,Hatchback,AUTOMATIC,Compact,109.0,regular unleaded,32.110092,171.651376
11830,Toyota,Yaris,2017,Hatchback,AUTOMATIC,Compact,106.0,regular unleaded,30.660377,169.811321



Cars similar to 'Camry':


Unnamed: 0,Make,Model,Year,Market Category,Transmission Type,Vehicle Size,Engine HP,Engine Fuel Type,Efficiency,Price_per_HP
2245,Toyota,Camry,2015,Unknown,AUTOMATIC,Midsize,178.0,regular unleaded,16.853933,146.910112
9719,Hyundai,Sonata,2015,Unknown,AUTOMATIC,Midsize,185.0,regular unleaded,15.945946,143.378378
7456,Volkswagen,Passat,2015,Unknown,AUTOMATIC,Midsize,170.0,regular unleaded,17.647059,143.382353
6097,Volkswagen,Jetta,2015,Unknown,AUTOMATIC,Midsize,170.0,regular unleaded,18.235294,149.294118
705,Mazda,6,2015,Unknown,AUTOMATIC,Midsize,184.0,regular unleaded,17.391304,135.298913
1220,Honda,Accord,2015,Unknown,AUTOMATIC,Midsize,189.0,regular unleaded,16.137566,130.502646
1403,Nissan,Altima,2015,Unknown,AUTOMATIC,Midsize,182.0,regular unleaded,17.857143,154.67033
3492,Dodge,Dart,2015,Unknown,AUTOMATIC,Midsize,184.0,regular unleaded,15.76087,129.320652
6671,Chevrolet,Malibu,2015,Unknown,AUTOMATIC,Midsize,196.0,regular unleaded,15.561224,143.852041
5032,Ford,Fusion,2015,Unknown,AUTOMATIC,Midsize,175.0,regular unleaded,16.0,137.342857


In [312]:
# --- Feature Engineering ---
# Ensure columns are numeric before division, coercing errors to NaN
df_rec['Engine HP'] = pd.to_numeric(df_rec['Engine HP'], errors='coerce')

# Calculate Power-to-Weight Ratio (HP per KG)
# We fill any resulting NaNs from the division with the median of the new column
df_rec['price_to_efficiency'] = (df_rec['Efficiency'] / df_rec['MSRP'] )* 10000  # Scale for better interpretability
df_rec['price_to_efficiency'].fillna(df_rec['price_to_efficiency'].median(), inplace=True)

df_rec['Make'] = dt['Make']
df_rec['Model'] = dt['Model']

df_rec['Market Category'] = df_rec['Market Category'].astype(str)

numerical_features.append('price_to_efficiency')

# Separate features by importance
# High importance features for recommendation (based on domain knowledge and dataset columns)
high_importance_num = [
    'Engine HP',           # Power is a key differentiator
    'Efficiency',          # Fuel efficiency is crucial for many users
    'MSRP',                # Price is a major factor in recommendations
    'Year',                # Newer cars are often preferred
    'price_to_efficiency',  # Value for money in terms of performance
    'Age'                  # Age of the car (derived from Year)
]
high_importance_cat = [
    'Market Category',     # Luxury, Performance, etc.
    'Vehicle Size',        # Compact, Midsize, etc.
    'Vehicle Style',       # Coupe, Sedan, etc.
    'Transmission Type',   # Automatic/Manual
    'Driven_Wheels',       # FWD/RWD/AWD
    'Engine Fuel Type'     # Fuel type can be a strong preference
]

medium_importance_num = ['Engine Cylinders', 'highway MPG', 'city mpg', 'Number of Doors', 'Price_per_HP']
medium_importance_cat = ['Make', 'Model']

# Create a preprocessor that applies different weights
# We do this by chaining a scaler with a custom weighting function (via FunctionTransformer)
from sklearn.preprocessing import FunctionTransformer

# Define weighting functions
def apply_high_weight(x):
    return x * 2.0  # Double the importance

def apply_medium_weight(x):
    return x * 1.0 # Keep importance as is

preprocessor = ColumnTransformer(
    transformers=[
        # High importance features
        ('num_high', MinMaxScaler(), high_importance_num),
        ('cat_high', OneHotEncoder(handle_unknown='ignore'), high_importance_cat),
        
        # Medium importance features
        ('num_medium', MinMaxScaler(), medium_importance_num),
        ('cat_medium', OneHotEncoder(handle_unknown='ignore'), medium_importance_cat)
    ],
    remainder='passthrough'
)

feature_matrix = preprocessor.fit_transform(df_rec)

print("Preprocessing V2 complete.")
print("Shape of the final feature matrix:", feature_matrix.shape)


Preprocessing V2 complete.
Shape of the final feature matrix: (11914, 1085)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_rec['price_to_efficiency'].fillna(df_rec['price_to_efficiency'].median(), inplace=True)


In [313]:
# Finds the best matching cars based on a dictionary of user preferences.
def get_recommendations_from_prefs(prefs, preprocessor, feature_matrix, dataframe, top_n=10):
    # Create a DataFrame with a single row representing the user's ideal car
    ideal_car = pd.DataFrame([prefs])

    # Ensure all required columns are present and in the same order as df_rec
    for col in df_rec.columns:
        if col not in ideal_car.columns:
            ideal_car[col] = np.nan
    ideal_car = ideal_car[df_rec.columns]

    # Fill missing values as in df_rec
    for col in df_rec.columns:
        if col in df_rec.select_dtypes(include=[np.number]).columns:
            median_val = df_rec[col].median()
            ideal_car[col] = pd.to_numeric(ideal_car[col], errors='coerce').fillna(median_val)
        else:
            ideal_car[col] = ideal_car[col].fillna('Unknown')

    # Use the *already fitted* preprocessor to transform the ideal car into a feature vector
    ideal_vector = preprocessor.transform(ideal_car)

    # Calculate similarity and find top matches (same logic as before)
    sim_scores = cosine_similarity(ideal_vector, feature_matrix)
    sim_scores = list(enumerate(sim_scores[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top N car indices
    car_indices = [i[0] for i in sim_scores[:top_n]]

    return dataframe[['Make', 'Model', 'Year', 'Market Category', 'Transmission Type', 'Vehicle Size', 'Engine HP', 'Engine Fuel Type', 'Efficiency', 'Price_per_HP', 'price_to_efficiency']].iloc[car_indices]

print("\n--- Recommendations based on User Preferences ---")

feature_matrix = preprocessor.fit_transform(df_rec)

user_preferences = {
    'Vehicle Style': 'Compact',
    'Engine HP': 100,
    'Year': 2014,
    'Transmission Type': 'Automatic',
    'Market Category': 'Hatchback,Hybrid'
}

display(get_recommendations_from_prefs(user_preferences, preprocessor, feature_matrix, df_rec))



--- Recommendations based on User Preferences ---


Unnamed: 0,Make,Model,Year,Market Category,Transmission Type,Vehicle Size,Engine HP,Engine Fuel Type,Efficiency,Price_per_HP,price_to_efficiency
7704,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,42.975207,248.057851,14.31791
7705,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,42.975207,240.785124,14.750371
7677,Toyota,Prius Prime,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,44.628099,273.553719,13.482809
7703,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,42.975207,232.355372,15.285508
7706,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,42.975207,220.950413,16.074512
7678,Toyota,Prius Prime,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,44.628099,238.016529,15.495868
7707,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,42.975207,204.008264,17.409442
7676,Toyota,Prius Prime,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,44.628099,223.966942,16.467933
7708,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,45.867769,207.975207,18.22681
7700,Toyota,Prius,2016,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,42.975207,247.933884,14.325069


In [314]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

# Create a pipeline for categorical features: OneHotEncode then reduce dimension
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    # Reduce the many encoded columns down to 50 "latent feature" components
    ('svd', TruncatedSVD(n_components=50, random_state=42)) 
])

# Create the master preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        # Apply the entire pipeline to the categorical features
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='passthrough'
)

#Now the output will be more compact and potentially capture more meaningful patterns.
feature_matrix = preprocessor.fit_transform(df_rec)


user_preferences = {
    'Vehicle Style': 'Compact',
    'Engine HP': 100,
    'Year': 2014,
    'Transmission Type': 'Automatic',
    'Market Category': 'Luxury'
}

# print("\n--- Recommendations based on User Preferences ---")
# display(get_recommendations_from_prefs(user_preferences, preprocessor, feature_matrix, df_rec))


print("\nCars similar to 'Prius':")
display(get_recommendations('Prius'))


Cars similar to 'Prius':


Unnamed: 0,Make,Model,Year,Market Category,Transmission Type,Vehicle Size,Engine HP,Engine Fuel Type,Efficiency,Price_per_HP
7695,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,36.940299,201.380597
7669,Toyota,Prius c,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,99.0,regular unleaded,50.0,197.373737
5916,Honda,Insight,2014,"Hatchback,Hybrid",AUTOMATIC,Compact,98.0,regular unleaded,43.367347,191.071429
7676,Toyota,Prius Prime,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,44.628099,223.966942
3175,Lexus,CT 200h,2017,"Hatchback,Luxury,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,30.970149,233.208955
6841,Mitsubishi,Mirage,2017,Hatchback,AUTOMATIC,Compact,78.0,regular unleaded,51.282051,211.474359
4720,Honda,Fit,2017,Hatchback,AUTOMATIC,Compact,130.0,regular unleaded,28.076923,129.153846
9880,Chevrolet,Spark,2016,Hatchback,AUTOMATIC,Compact,98.0,regular unleaded,36.734694,176.377551
11345,Nissan,Versa Note,2017,Hatchback,AUTOMATIC,Compact,109.0,regular unleaded,32.110092,171.651376
11830,Toyota,Yaris,2017,Hatchback,AUTOMATIC,Compact,106.0,regular unleaded,30.660377,169.811321


In [315]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier # Import the model

In [316]:
# Create preprocessing pipelines
# Using median for numerical imputation is often more robust to outliers
numerical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
])

categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Create the master preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
], remainder='drop')

# Create the feature matrix
feature_matrix = preprocessor.fit_transform(df_rec)

In [317]:
def get_recommendations_tree(car_name, preprocessor, feature_matrix, df, top_n=10):
    try:
        # Find the index of the input car
        car_index = df[df['Model'].str.lower() == car_name.lower()].index[0]
    except IndexError:
        return f"Car '{car_name}' not found in the dataset."

    # --- Create the Target Variable ---
    # We create a target array where the selected car is '1' and all others are '0'
    y = np.zeros(feature_matrix.shape[0])
    y[car_index] = 1

    # --- Train the Model ---
    # We train a classifier to distinguish the target car from the others.
    # 'class_weight="balanced"' is crucial because we have a huge imbalance (one '1' vs. many '0's)
    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced", n_jobs=-1)
    model.fit(feature_matrix, y)

    # --- Get Similarity Scores ---
    # The model's prediction probability for class '1' serves as our similarity score.
    probabilities = model.predict_proba(feature_matrix)[:, 1]

    # --- Find Top Recommendations ---
    # We sort the probabilities and get the indices of the top N cars.
    # We exclude the input car itself from the recommendations.
    similar_car_indices = probabilities.argsort()[-top_n-1:-1]

    # Return the recommended cars in descending order of similarity
    recommended_cars = df.iloc[similar_car_indices][::-1]

    # Add similarity score to the output
    recommended_cars['similarity_score'] = probabilities[similar_car_indices][::-1]

    return recommended_cars[['Make', 'Model', 'Year', 'Market Category', 'Transmission Type', 'Vehicle Size', 'Engine HP', 'Engine Fuel Type', 'Efficiency', 'Price_per_HP', 'price_to_efficiency', 'similarity_score']]

In [318]:
print(df_rec['price_to_efficiency'].describe())

count    11914.000000
mean        13.075413
std         28.483608
min          0.005319
25%          1.667178
50%          3.172317
75%          8.492717
max        345.454545
Name: price_to_efficiency, dtype: float64


In [319]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer

# Separate features and original target without log transform
x = df_rec.drop(columns=['price_to_efficiency']).reset_index(drop=True)
target = df_rec['price_to_efficiency'].fillna(0).reset_index(drop=True).values

# Split into training and testing sets using the original target
x_train, x_test, y_train, y_test = train_test_split(x, target, test_size=0.2, random_state=42)

# Define columns (replace numerical_pipeline and categorical_pipeline with your actual transformers)
numerical_features = ['Year', 'Engine HP', 'Engine Cylinders', 'highway MPG', 'city mpg', 'Efficiency', 'Age',
                      'MSRP', 'Number of Doors', 'Price_per_HP']

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
], remainder='drop')

# Fit preprocessor on training data only
feature_matrix_train = preprocessor.fit_transform(x_train)

# Transform test data using the fitted preprocessor
feature_matrix_test = preprocessor.transform(x_test)

# Train model on original target
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(feature_matrix_train, y_train)

# Predict on test data directly
y_pred = rf.predict(feature_matrix_test)

# Evaluate on original scale
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.6f}')
print(f'Mean Absolute Error: {mae:.6f}')
print(f'R^2 Score: {r2:.6f}')


Mean Squared Error: 0.239992
Mean Absolute Error: 0.110817
R^2 Score: 0.999698


In [320]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

def filter_by_preferences(df, user_preferences):
    numerical_features = [
        'Year', 'Engine HP', 'Engine Cylinders', 'highway MPG', 'city mpg',
        'Efficiency', 'Age', 'MSRP', 'Number of Doors', 'Price_per_HP', 'price_to_efficiency'
    ]

    categorical_features = [
        'Market Category', 'Vehicle Size', 'Vehicle Style', 'Transmission Type',
        'Driven_Wheels', 'Engine Fuel Type'
    ]

    filtered_df = df.copy()

    for key, value in user_preferences.items():
        if value is not None and key in filtered_df.columns:
            if key in numerical_features:
                # For numerical features, filter where feature >= value
                filtered_df = filtered_df[filtered_df[key] >= value]
            elif key in categorical_features:
                # For categorical features, filter where feature matches value (case-insensitive)
                filtered_df = filtered_df[filtered_df[key].str.lower().str.contains(value.lower())]
    return filtered_df


def get_recommendations_by_preference(user_preferences, preprocessor, feature_matrix, df, top_n=10):
    # Filter the dataframe based on user preferences
    df_filtered = filter_by_preferences(df, user_preferences)
    if df_filtered.empty:
        return pd.DataFrame()  # No matches found

    # Recreate the feature matrix for filtered data
    filtered_feature_matrix = preprocessor.transform(df_filtered)

    # Create target vector aligned with filtered df
    target = df_filtered['price_to_efficiency'].fillna(0).values

    # Train RandomForest on filtered data
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(filtered_feature_matrix, target)

    # Prepare full user preference vector with expected columns
    expected_columns = df.columns.tolist()
    user_pref_df = pd.DataFrame([{col: None for col in expected_columns}])
    for k, v in user_preferences.items():
        if k in expected_columns:
            user_pref_df.at[0, k] = v

    # Transform user preferences to feature vector
    user_feature_vector = preprocessor.transform(user_pref_df)

    # Predict similarity scores for filtered vehicles and for user preferences
    vehicle_scores = rf.predict(filtered_feature_matrix)
    user_score = rf.predict(user_feature_vector)[0]

    # Calculate similarity as negative absolute difference from user score (higher is better)
    similarity_scores = np.abs(vehicle_scores - user_score)

    df_filtered = df_filtered.copy()
    df_filtered['similarity_score'] = similarity_scores

    # Return top_n closest matches
    columns = ['Make', 'Model', 'Year', 'Market Category', 'Transmission Type', 'Vehicle Size',
               'Engine HP', 'Engine Fuel Type', 'MSRP', 'Efficiency', 'Price_per_HP',
               'price_to_efficiency', 'similarity_score']
    result_df = df_filtered.sort_values(by='similarity_score', ascending=False)[columns].head(top_n)

    return result_df

In [321]:
# Reset index so that feature_matrix and df_rec are aligned
df_rec_reset = df_rec.reset_index(drop=True)

print("\n--- Tree-Based Recommendations for 'Prius' ---")
display(get_recommendations_tree('Prius', preprocessor, feature_matrix, df_rec_reset))


user_preferences = {
    'Vehicle Style': '4dr Hatchback',
    'Vehicle Size': 'Compact',
    'Engine HP': 100,
    'Year': 2015,
    'Transmission Type': 'Automatic',
    'Market Category': 'Hatchback,Hybrid'
}

print("\n--- Tree-Based Recommendations by user preference ---")
display(get_recommendations_by_preference(user_preferences, preprocessor, feature_matrix, df_rec_reset))


--- Tree-Based Recommendations for 'Prius' ---


Unnamed: 0,Make,Model,Year,Market Category,Transmission Type,Vehicle Size,Engine HP,Engine Fuel Type,Efficiency,Price_per_HP,price_to_efficiency,similarity_score
7695,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,36.940299,201.380597,13.689197,0.08
7693,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,36.940299,180.597015,15.264586,0.03
7696,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,36.940299,173.246269,15.912254,0.03
7694,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,36.940299,212.201493,12.991137,0.02
7692,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,36.940299,223.91791,12.311381,0.02
1983,Chevrolet,Bolt EV,2017,Hatchback,DIRECT_DRIVE,Compact,200.0,electric,59.5,204.525,14.545899,0.02
5312,Mercedes-Benz,GLE-Class Coupe,2017,"Crossover,Factory Tuner,Luxury,Performance",AUTOMATIC,Midsize,362.0,premium unleaded (required),5.524862,192.403315,0.793232,0.01
3892,Honda,Element,2009,Crossover,AUTOMATIC,Compact,166.0,regular unleaded,13.554217,155.240964,5.259688,0.01
3085,Honda,CR-Z,2016,"Hatchback,Hybrid",AUTOMATIC,Compact,130.0,regular unleaded,28.846154,193.0,11.497072,0.01
1119,Audi,A6,2017,Luxury,AUTOMATED_MANUAL,Midsize,252.0,premium unleaded (recommended),75.0,204.761905,14.534884,0.01



--- Tree-Based Recommendations by user preference ---


Unnamed: 0,Make,Model,Year,Market Category,Transmission Type,Vehicle Size,Engine HP,Engine Fuel Type,MSRP,Efficiency,Price_per_HP,price_to_efficiency,similarity_score
7701,Toyota,Prius,2016,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,24700,45.867769,204.132231,18.569947,4.626278
7708,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,25165,45.867769,207.975207,18.22681,4.555171
7697,Toyota,Prius,2016,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,24200,42.975207,200.0,17.75835,3.918423
7707,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,24685,42.975207,204.008264,17.409442,3.823329
7698,Toyota,Prius,2016,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,26250,42.975207,216.942149,16.371507,2.70112
7676,Toyota,Prius Prime,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,27100,44.628099,223.966942,16.467933,2.602457
7706,Toyota,Prius,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,26735,42.975207,220.950413,16.074512,2.489382
7696,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,23215,36.940299,173.246269,15.912254,2.362733
7693,Toyota,Prius,2015,"Hatchback,Hybrid",AUTOMATIC,Compact,134.0,regular unleaded,24200,36.940299,180.597015,15.264586,2.102696
7678,Toyota,Prius Prime,2017,"Hatchback,Hybrid",AUTOMATIC,Compact,121.0,regular unleaded,28800,44.628099,238.016529,15.495868,1.706818


In [323]:
# Run this cell to import everything you need
import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import display, clear_output

# Assume all your previous code for data loading, preprocessing, and the
# get_recommendations_tree() function is already in memory.

def show_similar_cars(car_name):
    #####  Wrapper function to display recommendations.
    # Call your tree-based recommendation function
    recommendations = get_recommendations(car_name)
    display(recommendations)

# Use interact to instantly create a UI
interact(show_similar_cars, car_name=df_rec['Model'].unique().tolist());

interactive(children=(Dropdown(description='car_name', options=('1 Series M', '1 Series', '100', '124 Spider',…