In [16]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import SGDRegressor
from joblib import Memory
import pickle

# Import data

In [2]:
#Load the cleaned used-car listings (path is relative to the notebook's working directory)
cars = pd.read_csv('used_cars_cleaned.csv')

In [3]:
#Preview the first rows to confirm the file loaded as expected
cars.head()

Unnamed: 0,brand,base_model,trim,model_year,mileage,engine_type,fuel_type,horsepower,engine_size,ext_col,transmission_type,number_of_speeds,dual_shift_mode,int_col,accident_damage_reported,clean_title,price
0,Ford,Utility Police,Interceptor Base,2013,51000,V6 Cylinder Engine,Flex Fuel Capability,300,3.7,Black,A/T,6,0,Black,1,1,10300
1,INFINITI,Q50 Hybrid,Sport,2015,88900,V6 Cylinder Engine,Gas/Electric Hybrid,354,3.5,Black,A/T,7,0,Black,0,1,15500
2,Audi,S3 2,.0T Premium Plus,2017,84000,4 Cylinder Engine,Gasoline Fuel,292,2.0,Blue,A/T,6,0,Black,0,1,31000
3,BMW,740 Il,,2001,242000,8 Cylinder Engine,Gasoline Fuel,282,4.4,Green,A/T,Unknown,0,Green,0,1,7300
4,Lexus,Rc 350,F Sport,2021,23436,V6 Cylinder Engine,Gasoline Fuel,311,3.5,Black,A/T,6,0,Black,0,1,41927


In [4]:
#Check the dtype pandas inferred for each column
cars.dtypes

brand                        object
base_model                   object
trim                         object
model_year                    int64
mileage                       int64
engine_type                  object
fuel_type                    object
horsepower                    int64
engine_size                 float64
ext_col                      object
transmission_type            object
number_of_speeds             object
dual_shift_mode               int64
int_col                      object
accident_damage_reported     object
clean_title                  object
price                         int64
dtype: object

The categorical columns have come back as `object` again (a CSV file does not store the `category` dtype). Let's fix that.

In [5]:
#Cast every column whose dtype is not float64/int64 to the pandas category dtype
cars = cars.astype({
    name: 'category'
    for name, dtype in cars.dtypes.items()
    if dtype not in ['float64', 'int64']
})

In [6]:
#Confirm the non-numeric columns are now category dtype
cars.dtypes

brand                       category
base_model                  category
trim                        category
model_year                     int64
mileage                        int64
engine_type                 category
fuel_type                   category
horsepower                     int64
engine_size                  float64
ext_col                     category
transmission_type           category
number_of_speeds            category
dual_shift_mode                int64
int_col                     category
accident_damage_reported    category
clean_title                 category
price                          int64
dtype: object

## Preprocessing and Feature Selection

In [7]:
#Separate the target (price) from the feature matrix (every other column)
y = cars['price']
X = cars.drop(columns='price')

In [8]:
#Column names split by dtype; used to route columns to the matching preprocessing branch
categorical_features = X.select_dtypes(include='category').columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

Categorical columns will be one-hot encoded. That will produce a large number of columns, greatly increasing computing time.

To work around this issue, I will create a function that groups the rarer categories into an "Other" category, which reduces the number of encoded columns.

In [9]:
def group_rare_categories(df, categorical_features, threshold=0.05):
    """Collapse infrequent categories into a single 'Other' label.

    For every column named in ``categorical_features``, each value whose share
    of the column's non-missing entries is strictly below ``threshold`` is
    replaced by the string ``'Other'``. Missing values are left as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    categorical_features : iterable of column labels
        Columns of ``df`` to process; all other columns are copied unchanged.
    threshold : float, default 0.05
        Relative-frequency cut-off (0-1). Values with a frequency below it
        are grouped.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with rare values grouped. A column that was
        ``category`` dtype stays ``category``; any other column in which
        something was grouped is returned as ``object``. Columns with nothing
        to group are returned untouched.
    """
    df_copy = df.copy()
    for col in categorical_features:
        column = df[col]

        #Relative frequency of each value (value_counts leaves missing values out)
        frequencies = column.value_counts(normalize=True)
        rare_categories = frequencies[frequencies < threshold].index.tolist()

        #Vectorized membership test instead of a Python-level apply per row/category
        is_rare = column.isin(rare_categories)
        if not is_rare.any():
            continue  #Nothing to group in this column

        #Go through object dtype: a categorical column cannot hold 'Other'
        #until that label is one of its categories
        grouped = column.astype(object).mask(is_rare, 'Other')
        if isinstance(column.dtype, pd.CategoricalDtype):
            grouped = grouped.astype('category')
        df_copy[col] = grouped

    return df_copy

#Apply function: values making up less than 5% of a categorical column's non-missing entries become "Other"
X_grouped = group_rare_categories(X, categorical_features, threshold=0.05)

In [10]:
#Numeric branch: fill missing values with 0, then scale with RobustScaler
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('scaler', RobustScaler()),
])

#Categorical branch: label missing values 'Unknown', then one-hot encode to a dense array;
#categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value='Unknown', strategy='constant')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

In [11]:
#Route each group of columns through its matching preprocessing branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [14]:
#Cache fitted pipeline steps on disk to avoid expensive re-computing
memory = Memory(location='cache_dir', verbose=0)

#Create pipeline -- RFECV drops one feature per iteration (step=1) and uses 5-fold
#cross-validated R^2 to choose the feature subset that gives the best model performance.
#SGDRegressor shuffles the data between epochs, so it is seeded to make the selection reproducible.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', RFECV(estimator=SGDRegressor(random_state=42), step=1, cv=5, scoring='r2', n_jobs=-1))
], memory=memory)

#Fit pipeline
pipeline.fit(X_grouped, y)

#Extract the fitted steps from the pipeline
fitted_preprocessor = pipeline.named_steps['preprocessor']
selector = pipeline.named_steps['selector']

#Get transformed data -- use the same grouped frame the pipeline was fitted on.
#Transforming the ungrouped X would present the rare categories as unknown values,
#which the encoder ignores, leaving the "Other" columns empty.
X_transformed = fitted_preprocessor.transform(X_grouped)

#Get selected feature names (support_ is a boolean mask over the preprocessor's output columns)
selected_features = selector.support_
all_features = fitted_preprocessor.get_feature_names_out()
selected_feature_names = all_features[selected_features].tolist()

#Create DataFrame for the selected features
selected_features_df = pd.DataFrame(X_transformed[:, selected_features], columns=selected_feature_names)

#Add the target variable back to the DataFrame (index reset so rows align by position)
selected_features_df['price'] = y.reset_index(drop=True)

print(selected_features_df)

      num__model_year  num__mileage  num__horsepower  num__engine_size  \
0              -0.375     -0.151515        -0.065789          0.090909   
1              -0.125      0.370523         0.289474          0.000000   
2               0.125      0.303030        -0.118421         -0.681818   
3              -1.875      2.479339        -0.184211          0.409091   
4               0.625     -0.531185         0.006579          0.000000   
...               ...           ...              ...               ...   
3196            0.875     -0.826446         1.927632          0.227273   
3197            0.250     -0.114256        -0.453947         -0.681818   
3198            0.750     -0.703857         0.256579         -0.227273   
3199            0.500     -0.399449         0.921053          0.000000   
3200            0.500     -0.261708        -0.407895         -0.681818   

      num__dual_shift_mode  cat__brand_BMW  cat__brand_Chevrolet  \
0                      0.0             0.0 

That's a lot of rows with a ranking of "1"! (In RFECV, a ranking of 1 means the feature was selected.)

In [15]:
#Save the selected features together with the target column
#NOTE(review): with the default index=True the row index is also written as an unnamed first column;
#pass index=False if downstream readers do not expect it
selected_features_df.to_csv('selected_features.csv')

## Export the preprocessor and pipeline

In [18]:
#The pipeline was created with `memory=...`, so scikit-learn fitted *clones* of its steps;
#the module-level `preprocessor` object is therefore still unfitted. Export the fitted
#instance stored inside the pipeline instead.
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(pipeline.named_steps['preprocessor'], f)

#NOTE(review): the pipeline was fitted on data already passed through group_rare_categories,
#so new data should receive the same grouping before being transformed
with open('pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)