In [1]:
import pandas as pd
import pickle

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import make_column_selector as selector

In [2]:
df = pd.read_csv('argentina_cars.csv')

In [3]:
df['money_change'] = df.apply(lambda row: row['money']/0.0057 if row['currency'] == 'dólares' else row['money'], axis=1)

# df['motor'] = df['motor'].fillna(method='bfill')
df['motor'] = df['motor'].astype(float)

df.color = df.color.fillna(method='bfill')
df.gear = df.gear.fillna(method='bfill')
df.body_type = df.body_type.fillna(method='bfill')

df.rename(columns={'kilometres': 'km'}, inplace=True)


In [4]:
y = df['money_change']
X = df.drop(columns=['money', 'currency', 'money_change', 'color', 'model', 'body_type', 'motor'])
# X = df.drop(columns=['money', 'currency', 'money_change'])

X.head(2)

Unnamed: 0,brand,year,fuel_type,door,gear,km
0,Toyota,2022,Nafta,5.0,Automática,500
1,Jeep,2022,Nafta,5.0,Automática,500


In [5]:

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_transformer = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="object")),
        ("cat", categorical_transformer, selector(dtype_include="object")),
    ]
)

In [6]:
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearRegression())]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline.fit(X_train, y_train)


In [7]:
print("model score: %.3f" % pipeline.score(X_test, y_test))

model score: 0.444


In [8]:
user = pd.DataFrame({'brand': ['toyota'],'year': [2022], 'fuel_type': ['Nafta'] ,'door': [5], 'gear': ['Automática'], 'km': [0]})

In [9]:
pipeline.predict(user)

array([9621842.57696275])

In [10]:
# pickle.dump(pipeline, open('car_estimation_pipeline_v1.pkl', 'wb'))