In [2]:
import numpy as np
import pandas as pd
import matplotlib as plt
import re
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

In [5]:
os.listdir("./data")

['used-cars.csv', '.gitignore']

In [6]:
raw = pd.read_csv("./data/used-cars.csv")

In [7]:
# initial cleanup
cars = raw.copy()
cars.drop("Unnamed: 0", axis=1, inplace=True)
cars.columns = [x.lower() for x in cars.columns]

In [8]:
cars.head()

Unnamed: 0,name,location,year,kilometers_driven,fuel_type,transmission,owner_type,mileage,engine,power,seats,new_price,price
0,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
1,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,13 km/kg,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
2,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
3,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
4,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5


In [56]:
# will be passed a dataframe and should return a dataframe

def car_make(df):
    return df.apply(lambda x: x.str.split(" ").str[0])

def remove_units(df):
    return df.apply(lambda x: x.str.split(" ").str[0].astype("float"))

def convert_years(df):
    return df.apply(lambda x: 2019 - x)

def bin_owner(df):
    return df.map(lambda x: "Third" if x not in ["First", "Second"] else x)

def ordinal_owner(df):
    owner_map = {
        "First": 1,
        "Second": 2,
        "Third": 3
    }
    
    return df.map(lambda x: owner_map[x])

In [76]:
owner_pipeline = Pipeline(
    [
        ('high_val_bin', FunctionTransformer(bin_owner)),
        ('ordinal_encode', FunctionTransformer(ordinal_owner))
    ]
)

car_make_pipeline = Pipeline(
    [
        ('car_make', FunctionTransformer(car_make)),
        ('one_hot', OneHotEncoder())
    ]
)

cols_with_units_pipeline = Pipeline(
    [
        ("remove_units", FunctionTransformer(remove_units)),
        ("standard_scaler", StandardScaler())
    ]
)

ct = ColumnTransformer(
    [
         # seems to only work when passed a list of column names and expects function to take and return a dataframe
        ("owner_pipeline", owner_pipeline, ["owner_type"]),
        ("one_hot", OneHotEncoder(handle_unknown="ignore"), ["location", "fuel_type"]),
        ("cols_with_units", cols_with_units_pipeline, ["mileage", "engine", "power"]),
        ("ordinal", OrdinalEncoder(), ["transmission"]),
        ("convert_years", FunctionTransformer(convert_years), ["year"]),
        ("car_make_pipeline", car_make_pipeline, ["name"]),
    ],
    # remainder='passthrough'
)

pipe = Pipeline(
        [
            ("column transformers", ct),
            ("regression", LinearRegression()),
        ]
)

In [77]:
X_train = cars.drop(["new_price", "price"], axis = 1)
y_train = cars["price"]

In [79]:
pipe.fit_transform(X_train)

<5847x50 sparse matrix of type '<class 'numpy.float64'>'
	with 50810 stored elements in Compressed Sparse Row format>

In [51]:
scores = cross_val_score(pipe, X_train, y_train, cv=5)

  return df.applymap(lambda x: "Third" if x not in ["First", "Second"] else x)
  return df.applymap(lambda x: owner_map[x])
  return df.applymap(lambda x: "Third" if x not in ["First", "Second"] else x)
  return df.applymap(lambda x: owner_map[x])
  return df.applymap(lambda x: "Third" if x not in ["First", "Second"] else x)
  return df.applymap(lambda x: owner_map[x])
  return df.applymap(lambda x: "Third" if x not in ["First", "Second"] else x)
  return df.applymap(lambda x: owner_map[x])
  return df.applymap(lambda x: "Third" if x not in ["First", "Second"] else x)
  return df.applymap(lambda x: owner_map[x])


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 681, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 65, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/joblib/parallel.py", line 1863, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/joblib/parallel.py", line 1792, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 127, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/pipeline.py", line 479, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/base.py", line 919, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py", line 985, in fit
    self._fit(
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py", line 78, in _fit
    X_list, n_samples, n_features = self._check_X(
                                    ^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/preprocessing/_encoders.py", line 44, in _check_X
    X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/utils/validation.py", line 938, in check_array
    raise ValueError(
ValueError: Expected 2D array, got 1D array instead:
array=['Hyundai' 'Honda' 'Maruti' ... 'Mahindra' 'Maruti' 'Chevrolet'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/pipeline.py", line 377, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/joblib/memory.py", line 353, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/pipeline.py", line 957, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 157, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 754, in fit_transform
    result = self._fit_transform(X, y, _fit_transform_one)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/calebcrouse/venvs/basic/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 694, in _fit_transform
    raise ValueError(_ERR_MSG_1DCOLUMN) from e
ValueError: 1D data passed to a transformer that expects 2D data. Try to specify the column selection as a list of one item instead of a scalar.


In [98]:
# one hot encode catagorical variables