In [110]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [116]:
data = pd.read_csv('./cars_v2(ModelReady).csv', index_col='Unnamed: 0')

In [111]:
class SelectModel:
    
    def __init__(self,make,model,data):
        self.model = model
        self.make = make
        self.data = data
    
    def filter_data(self):
        cond = (self.data['manufacturer']==self.make) & (self.data['model']==self.model)
        return self.data[cond]
        
    def fmtvs(self):
        model_data = self.filter_data()
        return (model_data.drop(columns=['price','manufacturer','model','state','paint_color','drive',
                                         'type']),model_data['price'])
    
    def fit_predict(self, sample_case):
        self.X, self.y = self.fmtvs()
        transformer = ColumnTransformer([('imputer', SimpleImputer(strategy='most_frequent'),['fuel', 'transmission']),
                                         ('scaler', StandardScaler(),['miles_per_year','odometer','age'])])
        model = make_pipeline(
                    transformer,
                    OneHotEncoder(),
                    Ridge(random_state=42)
                    )
        model.fit(self.X,self.y)
        #return model.predict(sample_case)[0]
        params = {
                'ridge__alpha': range(1,2),
            }

        search = RandomizedSearchCV(model,
                                    params,
                                    n_iter=30,
                                    n_jobs=-1,
                                    cv=5,
                                    verbose=True,
                                    random_state=42)
        search.fit(self.X,self.y)
        
        self.model = search.best_estimator_
        self.params = search.best_params_
        self.score = search.best_score_
        self.prediction = int(self.model.predict(sample_case))
        self.mae = int(mean_absolute_error(self.y, self.model.predict(self.X)))
        
        return self.prediction


In [112]:
sample = pd.DataFrame({'condition': ['fair'], 'cylinders': '4 cylinders', 'fuel': 'hybrid','odometer': [168000], 'title_status': ['clean'],
                       'transmission': 'automatic', 'size': 'compact','age': [14], 'miles_per_year': [12000]})

In [118]:
data_c = data[['manufacturer','model','condition','age','odometer', 'miles_per_year','price']]

In [120]:
data_c.isnull().sum()

manufacturer      0
model             0
condition         0
age               0
odometer          0
miles_per_year    0
price             0
dtype: int64

In [119]:
X = data_c.drop(columns=['price'])
y = data_c['price']

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

In [123]:
transformer = ColumnTransformer([('scaler', StandardScaler(),['miles_per_year','odometer','age'])])

In [124]:
model = make_pipeline(
            transformer,
            OneHotEncoder(),
            RandomForestRegressor(random_state=42))
model.fit(X_train,y_train)


  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  ['miles_per_year', 'odometer',
                                                   'age'])])),
                ('onehotencoder', OneHotEncoder(cols=[])),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

In [127]:
data_c.iloc[0,:]

manufacturer        hyundai
model                sonata
condition         excellent
age                       7
odometer              90821
miles_per_year        12974
price                  8750
Name: 0, dtype: object

In [126]:
model.predict(data_c.iloc[0].drop())

IndexError: tuple index out of range

In [125]:
data_c

Unnamed: 0,manufacturer,model,condition,age,odometer,miles_per_year,price
0,hyundai,sonata,excellent,7.0,90821.0,12974,8750
1,toyota,prius,good,7.0,92800.0,13257,10900
2,toyota,corolla,good,13.0,160600.0,12353,3400
3,jeep,cherokee,excellent,1.0,23772.0,23772,28990
4,subaru,wrx,excellent,6.0,102000.0,17000,15000
...,...,...,...,...,...,...,...
78099,subaru,forester,excellent,8.0,82000.0,10250,9885
78100,ford,gt,good,18.0,58000.0,3222,4800
78101,hyundai,sonata,fair,14.0,159980.0,11427,1600
78102,toyota,sequoia,excellent,17.0,160000.0,9411,9000
