In [2]:
# initial setup
import polars as pl
import polars.selectors as cs
import sklearn
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
# Ask scikit-learn to hand back polars DataFrames from transform()/fit_transform()
# so the transformed features keep their column names.
set_config(transform_output='polars')

import warnings
# NOTE(review): this silences every warning from every library for the rest of
# the session, deprecation notices included — consider a narrower filter.
warnings.filterwarnings('ignore')

In [3]:
# Load the listings table.
# NOTE(review): the location is an absolute, machine-specific path; keeping it
# in one named constant makes it easy to repoint. The file carries an .xls
# extension but is parsed here as delimited text by read_csv.
DATA_PATH = "/home/carlos/projects/ai/bmw.xls"
raw = pl.read_csv(DATA_PATH)
raw

model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
str,i64,i64,str,i64,str,i64,f64,f64
""" 5 Series""",2014,11200,"""Automatic""",67068,"""Diesel""",125,57.6,2.0
""" 6 Series""",2018,27000,"""Automatic""",14827,"""Petrol""",145,42.8,2.0
""" 5 Series""",2016,16000,"""Automatic""",62794,"""Diesel""",160,51.4,3.0
""" 1 Series""",2017,12750,"""Automatic""",26676,"""Diesel""",145,72.4,1.5
""" 7 Series""",2014,14500,"""Automatic""",39554,"""Diesel""",160,50.4,3.0
…,…,…,…,…,…,…,…,…
""" X3""",2016,19000,"""Automatic""",40818,"""Diesel""",150,54.3,2.0
""" 5 Series""",2016,14600,"""Automatic""",42947,"""Diesel""",125,60.1,2.0
""" 3 Series""",2017,13100,"""Manual""",25468,"""Petrol""",200,42.8,2.0
""" 1 Series""",2014,9930,"""Automatic""",45000,"""Diesel""",30,64.2,2.0


In [4]:
# pre-process to cast columns to correct type

def preprocess(df):
    """Cast the label-like columns of the listings frame to ``pl.Categorical``.

    ``model``, ``transmission`` and ``fuelType`` are cast directly. ``year``
    is cast to ``pl.String`` first and then to ``pl.Categorical``. Every other
    column is left as ``with_columns`` returns it.

    Args:
        df: frame exposing ``with_columns`` and containing the columns
            ``model``, ``year``, ``transmission`` and ``fuelType``.

    Returns:
        The result of ``df.with_columns`` with those four columns replaced by
        their categorical versions.
    """
    def to_category(expr):
        # Final step shared by all four columns.
        return expr.cast(pl.Categorical)

    casts = {
        'model': to_category(pl.col('model')),
        # year goes through String first, so its categories are the year's text form
        'year': to_category(pl.col('year').cast(pl.String)),
        'transmission': to_category(pl.col('transmission')),
        'fuelType': to_category(pl.col('fuelType')),
    }
    return df.with_columns(**casts)

# Preview the cast result: model, year, transmission and fuelType should now show dtype `cat`.
preprocess(raw)

model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
cat,cat,i64,cat,i64,cat,i64,f64,f64
""" 5 Series""","""2014""",11200,"""Automatic""",67068,"""Diesel""",125,57.6,2.0
""" 6 Series""","""2018""",27000,"""Automatic""",14827,"""Petrol""",145,42.8,2.0
""" 5 Series""","""2016""",16000,"""Automatic""",62794,"""Diesel""",160,51.4,3.0
""" 1 Series""","""2017""",12750,"""Automatic""",26676,"""Diesel""",145,72.4,1.5
""" 7 Series""","""2014""",14500,"""Automatic""",39554,"""Diesel""",160,50.4,3.0
…,…,…,…,…,…,…,…,…
""" X3""","""2016""",19000,"""Automatic""",40818,"""Diesel""",150,54.3,2.0
""" 5 Series""","""2016""",14600,"""Automatic""",42947,"""Diesel""",125,60.1,2.0
""" 3 Series""","""2017""",13100,"""Manual""",25468,"""Petrol""",200,42.8,2.0
""" 1 Series""","""2014""",9930,"""Automatic""",45000,"""Diesel""",30,64.2,2.0


# Data Processing

In [5]:
# Wrap the dtype-casting function so it can run as the first step of a Pipeline.
pre_transformer = FunctionTransformer(preprocess)

# Cast once and derive both feature lists from the same frame. The target
# column `price` is dropped up front so it can never be selected as a feature.
features_cast = preprocess(raw.drop('price'))
numeric_features = features_cast.select(cs.numeric()).columns
cat_features = features_cast.select(cs.categorical()).columns

# Numeric columns are standardised.
num_pipeline = Pipeline([
    ('std', StandardScaler())
])

# Categorical columns are one-hot encoded into a dense block; categories that
# were not seen during fit are ignored rather than raising.
cat_pipeline = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each feature list to its own sub-pipeline.
ct = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_features),
    ('cat', cat_pipeline, cat_features)
])

# Sanity check: fit on the full feature frame and display the encoded columns.
ct.fit_transform(features_cast.select([*numeric_features, *cat_features]))

num__mileage,num__tax,num__mpg,num__engineSize,cat__model_ 1 Series,cat__model_ 2 Series,cat__model_ 3 Series,cat__model_ 4 Series,cat__model_ 5 Series,cat__model_ 6 Series,cat__model_ 7 Series,cat__model_ 8 Series,cat__model_ M2,cat__model_ M3,cat__model_ M4,cat__model_ M5,cat__model_ M6,cat__model_ X1,cat__model_ X2,cat__model_ X3,cat__model_ X4,cat__model_ X5,cat__model_ X6,cat__model_ X7,cat__model_ Z3,cat__model_ Z4,cat__model_ i3,cat__model_ i8,cat__year_1996,cat__year_1997,cat__year_1998,cat__year_1999,cat__year_2000,cat__year_2001,cat__year_2002,cat__year_2003,cat__year_2004,cat__year_2005,cat__year_2006,cat__year_2007,cat__year_2008,cat__year_2009,cat__year_2010,cat__year_2011,cat__year_2012,cat__year_2013,cat__year_2014,cat__year_2015,cat__year_2016,cat__year_2017,cat__year_2018,cat__year_2019,cat__year_2020,cat__transmission_Automatic,cat__transmission_Manual,cat__transmission_Semi-Auto,cat__fuelType_Diesel,cat__fuelType_Electric,cat__fuelType_Hybrid,cat__fuelType_Other,cat__fuelType_Petrol
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.653447,-0.108963,0.038326,-0.303911,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
-0.424388,0.216199,-0.433982,-0.303911,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1.483453,0.46007,-0.159533,1.507591,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0.046894,0.216199,0.510634,-1.209662,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0.559104,0.46007,-0.191445,1.507591,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.609379,0.297489,-0.066986,-0.303911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0.694058,-0.108963,0.118108,-0.303911,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
-0.001153,1.110393,-0.433982,-0.303911,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
0.775714,-1.65348,0.24895,-0.303911,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [6]:
# Fit an ordinary least-squares baseline on a train split and score it on the
# held-out rows.
linear = LinearRegression()

# Features are every column except the target. The target is kept as a
# one-column frame, which is why predict() returns an (n, 1) array.
x = raw.drop('price')
y = raw.select('price')

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Full chain: dtype casting -> scaling / one-hot encoding -> regression.
pipe = Pipeline([
    ('pre', pre_transformer),
    ('ct', ct),
    ('model', linear)
])

pipe.fit(X_train, y_train)
# For a regressor, score() reports R^2 on the data it is given.
pipe.score(X_test, y_test)

0.8807085987784302

In [7]:
# Predicted prices for every held-out row; the result is a 2-D array with one column.
pipe.predict(X_test)

array([[13725.50551084],
       [13311.94377535],
       [39130.0722473 ],
       ...,
       [23489.03570078],
       [45853.76138838],
       [43503.28736544]], shape=(2157, 1))

In [8]:
# Actual prices of the first 8 held-out rows, for comparison with the model's predictions.
y_test.head(8)

price
i64
15300
15495
39875
21730
13799
24499
42202
32400


In [9]:
# Predicted prices for the first 8 held-out rows.
pipe.predict(X_test.head(8))

array([[13725.50551084],
       [13311.94377535],
       [39130.0722473 ],
       [23534.16429859],
       [14754.59855195],
       [26484.48657778],
       [41065.15222211],
       [29364.09048829]])