In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [439]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np


class Kickit_dataframe_preprocessor (BaseEstimator, TransformerMixin):
    """ 
    This is transformer class that inherits from the BaseEstimator,
    and TransformerMixin of Sklearn library. 
    The class have been modified to provide to instances 
    all the preprocessing capacity specific to the  kick dataset
    It is compatible with sklearn Pipeline and provide insights as messages when ..... 
    
    parameters =             X (required)  : the dataframe
                  choosen_axis (optional)  : a subset of features of the previous dataframe
   
   mother class = Sklearn.Base.BaseEstimator , TransformerMixin
    """

    def fit(self, X, choosen_axis = None, y=None):
        return  self  
        
    
    def transform(self, X, choosen_axis = None , y=None):       
        
        
        # seting the list of column
        catList = ["Model", "Make" ,"Transmission"]
        # looping for the imputing strategy for each column
        for cat in catList:
            # replace each null value by the most frequent value
            X[cat] = X[cat].astype(str)
            X[cat]= X[cat].replace( ['?'], X[cat].mode )
            
        # seting the list of column
        numList = ["VehOdo","MMRAcquisitionAuctionAveragePrice"]
        
        # Special transition of "MMRAcquisitionAuctionAveragePrice"  to float with '?' as n
        X['MMRAcquisitionAuctionAveragePrice'] = X['MMRAcquisitionAuctionAveragePrice'].replace(['?'], '10')
        X['MMRAcquisitionAuctionAveragePrice'].astype('str')
        X['MMRAcquisitionAuctionAveragePrice'] = X['MMRAcquisitionAuctionAveragePrice'].astype('float')
        
        # looping for the imputing strategy for each column
        for num in numList:
            # replace each null value by the mean value
            X[num]= X[num].replace( ['?'], X[num].mean )
            #Converting the numerical olumns to int
           
        
        # selecting the features to keep
        if choosen_axis == None :
            pass
        else : 
            X = X[choosen_axis]
        return  X
    
    
    
class Kickit_dataframe_balancer (BaseEstimator, TransformerMixin):
    """ 
    This is transformer class that inherits from the BaseEstimator,
    and TransformerMixin of Sklearn library. 
    The class have been modified to provide to instances 
    capacity  to balance the  kick dataset
    It is compatible with sklearn Pipeline and provide insights as messages when ..... 
    parameters =             X (required)  : the dataframe
    mother class = Sklearn.Base.BaseEstimator , TransformerMixin
    """
    def fit(self, X, y=None):
        return  self  

    def transform(self, X, y=None):
        # Count how many targets are 1 (meaning that the car is a kick)
        num_one_targets = X['IsBadBuy'].sum()
        # Set a counter for targets that are 0 
        zero_targets_counter = 0
        row_list2 =[]
        # We want to create a "balanced" dataset, so we will have to remove some input/target pairs

        for i in X.index.astype('int64'):
            if (X.loc[i,'IsBadBuy']) == 0:
                zero_targets_counter += 1
                if zero_targets_counter > num_one_targets:
                    row_list2.append(i)
        X = X.drop(index=row_list2,axis= 0)
        #resetting the index
        X.index = range(len(X.index))
        return  X
    
    
    
class Kickit_frequency_encoder (BaseEstimator, TransformerMixin):
    def fit(self, X, catCols = ["Model", "Make" ,"Transmission"],  y=None):
        return  self  
        
    def transform(self, X, catCols = ["Model", "Make" ,"Transmission"], y=None):  
        global freq_cat_dict 
        freq_cat_dict = {}
        for col in X[catCols]:
            df_frequency_map = X[col].value_counts().to_dict()
            X[col] = X[col].map(df_frequency_map)
            freq_cat_dict.update(df_frequency_map)
        return  X  
    
    
    
    
    
class Kickit_weight_of_evidence_encoder (BaseEstimator, TransformerMixin):
    """ 
    This class provide to instances capacity  to encode the categorical features
    in the kick dataset. It is compatible with sklearn Pipeline.
    
    parameters =             X (required)  : the dataframe
    
    mother class = Sklearn.Base.BaseEstimator , TransformerMixin
    """

    
    def fit(self, X, catCols = None, y=None):
        return  self  
    
    def transform(self, X, catCols = ["Model", "Make" ,"Transmission"] , y=None):
        global woe_cat_dict 
        woe_cat_dict = {}
        for col in X[catCols]:

            WOE = np.log((0.5 + X[col][X.IsBadBuy == 0 ].value_counts())/(0.5 + X[col][X.IsBadBuy == 1 ].value_counts()))
            df_frequency_map = WOE.to_dict()
            X[col] = X[col].map(df_frequency_map)
            X = X.dropna()
            woe_cat_dict.update(df_frequency_map)
        return  X 
    


In [440]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#from Kickit_app_transformers import Kickit_dataframe_preprocessor
#from Kickit_app_transformers import Kickit_frequency_encoder
#from Kickit_app_transformers import Kickit_weight_of_evidence_encoder
#from Kickit_app_transformers import  Kickit_dataframe_balancer
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [441]:
# Instantiating the pipeline object 
# this on will clean the data with respect to our strategy
preprocess = Kickit_dataframe_preprocessor()
# this one will use the frequency encoding technique for categorical variable 
freq_encoder = Kickit_frequency_encoder()
# # this one will use the weight of evidence encoding technique for categorical variable 
woe_encoder = Kickit_weight_of_evidence_encoder()
# this one will balance the dataset
balancer = Kickit_dataframe_balancer()
# this is the standard scaler from sklearn API
scaler = StandardScaler()

In [354]:
np.log(1)

0.0

In [355]:
# reading the dataset
df_raw = pd.read_csv('kick.csv')
# seleecting the data  that come from ADESA website
#df_adesa_raw = df_raw[ df_raw['Auction']=='ADESA']
#df_adesa_raw.index = range(len(df_adesa_raw.index))
#df_adesa_raw = df_adesa_raw.drop('Auction', axis =1)



In [356]:
choosen_axis = ["Model", "Make" ,"Transmission","VehOdo","MMRAcquisitionAuctionAveragePrice", "IsBadBuy"]
df_adesa_raw = df_raw[choosen_axis]

In [357]:
df_adesa_raw[pd.isnull(df_adesa_raw).any(axis=1)]

Unnamed: 0,Model,Make,Transmission,VehOdo,MMRAcquisitionAuctionAveragePrice,IsBadBuy


In [358]:
df_adesa_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72983 entries, 0 to 72982
Data columns (total 6 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Model                              72983 non-null  object
 1   Make                               72983 non-null  object
 2   Transmission                       72983 non-null  object
 3   VehOdo                             72983 non-null  int64 
 4   MMRAcquisitionAuctionAveragePrice  72983 non-null  object
 5   IsBadBuy                           72983 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 3.3+ MB


In [359]:
#df_adesa_raw['MMRAcquisitionAuctionAveragePrice'] = df_adesa_raw['MMRAcquisitionAuctionAveragePrice'].replace(['?'], '10')

In [360]:
#df_adesa_raw['MMRAcquisitionAuctionAveragePrice'].astype('str')

In [361]:
#df_adesa_raw['MMRAcquisitionAuctionAveragePrice'] = df_adesa_raw['MMRAcquisitionAuctionAveragePrice'].astype('float')

In [362]:
df_adesa_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72983 entries, 0 to 72982
Data columns (total 6 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Model                              72983 non-null  object
 1   Make                               72983 non-null  object
 2   Transmission                       72983 non-null  object
 3   VehOdo                             72983 non-null  int64 
 4   MMRAcquisitionAuctionAveragePrice  72983 non-null  object
 5   IsBadBuy                           72983 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 3.3+ MB


In [363]:
df_adesa_raw.isnull().sum()

Model                                0
Make                                 0
Transmission                         0
VehOdo                               0
MMRAcquisitionAuctionAveragePrice    0
IsBadBuy                             0
dtype: int64

In [442]:
# Creatine a pipeline for the first encoding technique : frequency encoder
df = df_adesa_raw.copy()
freq_pipeline = Pipeline([("preprocess",preprocess),("balancer", balancer),("fencoder",freq_encoder)])

In [443]:
data = df
data

Unnamed: 0,Model,Make,Transmission,VehOdo,MMRAcquisitionAuctionAveragePrice,IsBadBuy
0,MAZDA3,MAZDA,AUTO,89046,8155,0
1,1500 RAM PICKUP 2WD,DODGE,AUTO,93593,6854,0
2,STRATUS V6,DODGE,AUTO,73807,3202,0
3,NEON,DODGE,AUTO,65617,1893,0
4,FOCUS,FORD,MANUAL,69367,3913,0
...,...,...,...,...,...,...
72978,SABLE,MERCURY,AUTO,45234,1996,1
72979,MALIBU 4C,CHEVROLET,AUTO,71759,6418,0
72980,GRAND CHEROKEE 2WD V,JEEP,AUTO,88500,8545,0
72981,IMPALA,CHEVROLET,AUTO,79554,6420,0


In [444]:
freq_pipeline[2].fit_transform(data).isnull().sum()

Model                                0
Make                                 0
Transmission                         0
VehOdo                               0
MMRAcquisitionAuctionAveragePrice    0
IsBadBuy                             0
dtype: int64

In [445]:
adesa_freq_pipe = freq_pipeline.fit_transform(data)

In [446]:
adesa_freq_pipe[pd.isnull(adesa_freq_pipe).any(axis=1)]

Unnamed: 0,Model,Make,Transmission,VehOdo,MMRAcquisitionAuctionAveragePrice,IsBadBuy


In [449]:
woe_cat_dict


NameError: name 'woe_cat_dict' is not defined

In [370]:
columns = adesa_freq_pipe.columns
#data = adesa_freq_pipe["Model, Make, Transmission, VehOdo, MMRAcquisitionAuctionAveragePrice"]
scaled_feat_array = scaler.fit_transform(adesa_freq_pipe)

In [371]:
scaled_feat_df = pd.DataFrame(scaled_feat_array, columns=columns)
features = scaled_feat_df.drop('IsBadBuy',axis=1) 
target = adesa_freq_pipe['IsBadBuy']

In [372]:
features.head(5)

Unnamed: 0,Model,Make,Transmission,VehOdo,MMRAcquisitionAuctionAveragePrice
0,-0.406376,-1.481415,0.184665,1.101185,0.924245
1,0.607945,0.463286,0.184665,1.418752,0.417017
2,-0.10502,0.463286,0.184665,0.036878,-1.006807
3,-0.12707,0.463286,0.184665,-0.535119,-1.517154
4,0.681446,0.512666,-5.414855,-0.273215,-0.729606


In [373]:
adesa_freq_pipe.isnull().sum()

Model                                0
Make                                 0
Transmission                         0
VehOdo                               0
MMRAcquisitionAuctionAveragePrice    0
IsBadBuy                             0
dtype: int64

In [450]:
#  Creatine a pipeline for the Weight of evidence  encoding technique 
df = df_adesa_raw.copy()
woe_pipeline = Pipeline([("preprocess",preprocess),("balancer", balancer),("woe_encoder", woe_encoder)])
adesa_woe_pipe = woe_pipeline.fit_transform(df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].map(df_frequency_map)


In [451]:
import pickle
# open a file, where you ant to store the data
file = open('kidict.pkl', 'wb')

# dump information to that file
pickle.dump(cat_dict, file)

In [438]:
import numpy 

cat_dict.values = numpy.nan_to_num(cat_dict.values)


AttributeError: 'dict' object attribute 'values' is read-only

<function dict.values>

In [408]:
cat_dict

{'1500 RAM PICKUP 2WD': -0.16917873135476025,
 '1500 RAM PICKUP 4WD': -0.3677247801253174,
 '1500 SIERRA PICKUP 2': 0.41985384556026406,
 '1500 SIERRA PICKUP 4': nan,
 '1500 SILVERADO PICKU': 0.742374121325096,
 '1500HD SIERRA PICKUP': nan,
 '2500 RAM PICKUP 2WD': -0.5108256237659907,
 '2500 RAM PICKUP 4WD': nan,
 '2500HD SILVERADO PIC': -0.25131442828090605,
 '3.2 CL 3.2L V 6 NA S': nan,
 '3.2 TL': nan,
 '3.2 TL 3.2L V6 EFI': nan,
 '3.2 TL 3.2L V6 FI DO': nan,
 '300': 0.7810085363512795,
 '300 2.7L V6 MPI': 1.0986122886681098,
 '300 3.5L / 6.0L V12': nan,
 '300 3.5L V6 MPI': nan,
 '300 3.5L V6 MPI / SM': 0.0,
 '300C': 0.0,
 '300C 5.7L V8 SFI': nan,
 '300C 5.7L V8 SFI / S': nan,
 '300M': -1.0986122886681098,
 '300M 3.5L V6 MPI': -0.8472978603872037,
 '300M 3.5L V6 MPI HO': nan,
 '350Z': nan,
 '350Z MFI V6 3.5L DOH': nan,
 '4 RUNNER 2WD V6': nan,
 '4 RUNNER 2WD V8': nan,
 '4 RUNNER 2WD V8 4.7L': nan,
 '4 RUNNER 4WD V6': nan,
 '4 RUNNER 4WD V6 3.4L': nan,
 '626 4C': nan,
 '626 4C 2.0L I-

In [409]:
  v in v.items()} }

AttributeError: 'float' object has no attribute 'items'

In [411]:
pd.replace(np.nan:'50')

In [413]:
cat_dict.replace(np.nan ,0)

AttributeError: 'dict' object has no attribute 'replace'

In [376]:
adesa_freq_pipe['IsBadBuy'].sum()

8976

In [377]:
adesa_woe_pipe['IsBadBuy'].sum()

8579

In [378]:
adesa_woe_pipe.isnull().sum()

Model                                0
Make                                 0
Transmission                         0
VehOdo                               0
MMRAcquisitionAuctionAveragePrice    0
IsBadBuy                             0
dtype: int64

In [379]:
adesa_woe_pipe[pd.isnull(adesa_woe_pipe).any(axis=1)]

Unnamed: 0,Model,Make,Transmission,VehOdo,MMRAcquisitionAuctionAveragePrice,IsBadBuy


In [380]:
columns = adesa_woe_pipe.columns
scaled_feat_array = scaler.fit_transform(adesa_woe_pipe)
scaled_feat_df = pd.DataFrame(scaled_feat_array, columns=columns)
features = scaled_feat_df.drop('IsBadBuy',axis=1) 
target = adesa_woe_pipe['IsBadBuy']

In [381]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)

In [382]:
bench_cross = [Pipeline([('lr_classifier',LogisticRegression(random_state=0))]),
               Pipeline([('dt_classifier',DecisionTreeClassifier())]),
               Pipeline([('rf_classifier',RandomForestClassifier())])] 

In [383]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [384]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

# Fit the pipelines
for pipe in bench_cross:
    pipe.fit(x_train, y_train)

In [385]:
for i,model in enumerate(bench_cross):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(x_test,y_test)))

Logistic Regression Test Accuracy: 0.6415094339622641
Decision Tree Test Accuracy: 0.577542567878509
RandomForest Test Accuracy: 0.6056143580303728


In [386]:
# Create a pipeline
pipe = make_pipeline((RandomForestClassifier()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [10],
                 "randomforestclassifier__max_depth":[5,8,15,25,30,None],
                 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100],
                 "randomforestclassifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(x_train,y_train)

In [387]:
best_model.score(x_test,y_test)

0.6419696272434422

In [388]:
print(best_model.best_estimator_)

Pipeline(steps=[('randomforestclassifier',
                 RandomForestClassifier(max_depth=25, max_leaf_nodes=10,
                                        min_samples_leaf=100,
                                        n_estimators=10))])


In [452]:
import pickle
# open a file, where you ant to store the data
file = open('model.pkl', 'wb')

# dump information to that file
pickle.dump(best_model, file)