In [37]:
import pandas as pd
import numpy as np
np.random.seed(0)

from IPython.display import display
import ipywidgets as widgets

from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
import joblib
import pickle

import xgboost as xgb
import catboost as ctb
import lightgbm as lgb

import mlflow
import mlflow.sklearn
import mlflow.xgboost

import re
import sys

import scikitplot as skplt

import seaborn as sns
import matplotlib.pyplot as plt

# Show every column when a wide DataFrame is displayed (None removes the limit).
# Uses the documented top-level pd.set_option rather than the pd.pandas alias.
pd.set_option('display.max_columns', None)
%matplotlib inline

In [2]:
# Load the training data from a CSV in the working directory
# (presumably the output of an earlier feature-engineering step — confirm).
train_set = pd.read_csv('./train_set_feature_engineering.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
train_set.head()

Unnamed: 0,price_value,amenities,Oferta od,Typ,Stan,Engine capacity,No accident,Mileage,Door count,Fuel type,Gearbox,Transmission,Engine power,Color,Damaged,Region,Company,Price,Lacquer,Age,Norm Engine capacity,Norm Mileage,Norm Engine power,Norm Age,Norm amenities
0,63900,38,Firmy,Kompakt/Hatchback,Używane,1199,Tak,200,5,Benzyna,Manualna/Półautomatyczna,Na przednie koła,110,Czarny,Nie,warminsko-mazurskie,Citroën,11.065075,Metalik,6,7.089243,200,4.70048,1.791759,38
1,41900,24,Firmy,SUV,Używane,1598,Nie,118808,5,Benzyna,Automatyczna,Na przednie koła,117,Czarny,Nie,mazowieckie,Nissan,10.643041,Metalik,11,7.376508,118808,4.762174,2.397895,24
2,17950,28,Firmy,Coupe,Używane,1364,Tak,106000,3,Benzyna,Manualna/Półautomatyczna,Na przednie koła,90,Czarny,Nie,wielkopolskie,Opel,9.795345,Metalik,15,7.218177,106000,4.49981,2.70805,28
3,38000,35,Osoby prywatnej,Sedan,Używane,1798,Nie,172758,5,Benzyna,Manualna/Półautomatyczna,Na przednie koła,160,Czarny,Nie,opolskie,Audi,10.545341,Inny,15,7.49443,172758,5.075174,2.70805,35
4,21900,23,Osoby prywatnej,Auta miejskie/małe,Używane,1250,Tak,97128,5,Benzyna,Manualna/Półautomatyczna,Na przednie koła,82,Bordowy,Nie,opolskie,Ford,9.994242,Metalik,13,7.130899,97128,4.406719,2.564949,23


In [4]:
# Store the door count as text rather than a number; the column is one-hot
# encoded further down (it is listed in col_one_hot_encode).
train_set['Door count'] = train_set['Door count'].astype('str')

In [5]:
# Column groups, one per preprocessing step applied in the following cells.

# Categorical columns that are converted to integer codes with LabelEncoder.
col_label_encode = [
    'Typ',
    'Fuel type',
    'Color',
    'Region',
    'Company',
]

# Numeric columns that are rescaled with StandardScaler.
col_standard_scaler = [
    'Norm Engine capacity',
    'Norm Mileage',
    'Norm Engine power',
    'Norm Age',
]

# Categorical columns that are expanded into indicator columns by pd.get_dummies.
col_one_hot_encode = [
    'Oferta od',
    'Stan',
    'No accident',
    'Door count',
    'Gearbox',
    'Transmission',
    'Damaged',
    'Lacquer',
]

In [6]:
# Integer-encode every column listed in col_label_encode into a new
# 'norm_<column>' column and record the label -> code lookup per column.
# The single encoder is re-fitted on each iteration, so encoding_mapping is
# the only place where every column's correspondence is kept.
le = LabelEncoder()
encoding_mapping = {}
for source_col in col_label_encode:
    encoded_col = 'norm_' + source_col
    train_set[encoded_col] = le.fit_transform(train_set[source_col])
    labels, codes = train_set[source_col], train_set[encoded_col]
    encoding_mapping[source_col] = dict(zip(labels, codes))

In [7]:
# Persist the label -> code lookup as the text repr of the dict.
# The encoding is set explicitly because the labels contain non-ASCII
# characters (e.g. 'Citroën', 'Auta miejskie/małe'); the platform default
# encoding may fail on them or write them differently from machine to machine.
# Read the file back with encoding='utf-8' as well.
with open('encoding_mapping.txt', 'w', encoding='utf-8') as file:
    file.write(str(encoding_mapping))

In [8]:
# Standardize the numeric feature columns; the scaled values overwrite the
# original columns in train_set.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
# NOTE: because the columns are overwritten in place, re-running this cell
# re-fits the scaler on already-scaled values, and this scaler object is the
# one pickled at the end of the notebook. Run it exactly once per kernel.
scaler = StandardScaler()
train_set[col_standard_scaler] = scaler.fit_transform(train_set[col_standard_scaler])

In [9]:
def one_hot_encode_dataframe(df, columns=None):
    """One-hot encode categorical columns of ``df`` and return a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : list of str, optional
        Columns to expand into indicator columns. When omitted, the
        notebook-level ``col_one_hot_encode`` list is used, so existing
        one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``df`` with every listed column replaced by ``<column>_<value>``
        indicator columns holding 0/1 integers.
    """
    if columns is None:
        columns = col_one_hot_encode
    # Pin the indicator dtype: pandas >= 2.0 defaults to bool, older versions
    # to uint8. Fixing it keeps the 0/1 columns identical across versions.
    return pd.get_dummies(df, columns=columns, dtype='uint8')

In [10]:
# Expand the columns listed in col_one_hot_encode into indicator columns;
# train_set itself is left unchanged and the result is a new frame.
train_set_enc = one_hot_encode_dataframe(train_set)

In [11]:
# Preview the encoded frame to check the generated indicator columns.
train_set_enc.head()

Unnamed: 0,price_value,amenities,Typ,Engine capacity,Mileage,Fuel type,Engine power,Color,Region,Company,Price,Age,Norm Engine capacity,Norm Mileage,Norm Engine power,Norm Age,Norm amenities,norm_Typ,norm_Fuel type,norm_Color,norm_Region,norm_Company,Oferta od_Firmy,Oferta od_Osoby prywatnej,Stan_Nowe,Stan_Używane,No accident_Nie,No accident_Tak,Door count_3,Door count_5,Gearbox_Automatyczna,Gearbox_Manualna/Półautomatyczna,Transmission_4x4 stały/dołączany automatycznie,Transmission_Na przednie koła,Transmission_Na tylne koła/4x4 (dołączany ręcznie),Damaged_Nie,Damaged_Tak,Lacquer_Inny,Lacquer_Metalik
0,63900,38,Kompakt/Hatchback,1199,200,Benzyna,110,Czarny,warminsko-mazurskie,Citroën,11.065075,6,-0.950847,-1.56425,-0.438799,-1.662566,38,3,0,4,13,14,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1
1,41900,24,SUV,1598,118808,Benzyna,117,Czarny,mazowieckie,Nissan,10.643041,11,-0.279087,-0.262598,-0.285932,-0.314186,24,6,0,4,6,57,1,0,0,1,1,0,0,1,1,0,0,1,0,1,0,0,1
2,17950,28,Coupe,1364,106000,Benzyna,90,Czarny,wielkopolskie,Opel,9.795345,15,-0.649339,-0.403158,-0.936032,0.37577,28,1,0,4,14,59,1,0,0,1,0,1,1,0,0,1,0,1,0,1,0,0,1
3,38000,35,Sedan,1798,172758,Benzyna,160,Czarny,opolskie,Audi,10.545341,15,-0.00333,0.329471,0.489637,0.37577,35,7,0,4,7,4,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0
4,21900,23,Auta miejskie/małe,1250,97128,Benzyna,82,Bordowy,opolskie,Ford,9.994242,13,-0.853436,-0.500523,-1.166697,0.057435,23,0,0,2,7,21,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1


In [12]:
def get_feats(df, black_list):
    """Return the column names of ``df`` that do not appear in ``black_list``.

    The result is a plain list that keeps the column order of ``df``.
    """
    kept = []
    for column in df.columns:
        if column in black_list:
            continue
        kept.append(column)
    return kept

In [13]:
def get_X(df, feats):
    """Select the columns named in ``feats`` from ``df``, in the given order."""
    return df[feats]

def get_y(df, target_var):
    """Return the values of the ``target_var`` column of ``df`` (via ``.values``)."""
    target = df[target_var]
    return target.values

In [14]:
# Columns that must not be offered to the model as features.
black_list = [
    # Raw categorical columns (their label-encoded 'norm_<column>' versions are used instead).
    'Typ',
    'Fuel type',
    'Color',
    'Region',
    'Company',
    # Remaining excluded columns, including 'Price', which is the target.
    'price_value',
    'amenities',
    'Engine capacity',
    'Mileage',
    'Engine power',
    'Price',
    'Age',
    'Norm amenities',
]

In [15]:
# Candidate features: every column of the encoded frame that is not black-listed.
feats = get_feats(train_set_enc, black_list)

In [16]:
# Show the candidate feature names.
print(feats)

['Norm Engine capacity', 'Norm Mileage', 'Norm Engine power', 'Norm Age', 'norm_Typ', 'norm_Fuel type', 'norm_Color', 'norm_Region', 'norm_Company', 'Oferta od_Firmy', 'Oferta od_Osoby prywatnej', 'Stan_Nowe', 'Stan_Używane', 'No accident_Nie', 'No accident_Tak', 'Door count_3', 'Door count_5', 'Gearbox_Automatyczna', 'Gearbox_Manualna/Półautomatyczna', 'Transmission_4x4 stały/dołączany automatycznie', 'Transmission_Na przednie koła', 'Transmission_Na tylne koła/4x4 (dołączany ręcznie)', 'Damaged_Nie', 'Damaged_Tak', 'Lacquer_Inny', 'Lacquer_Metalik']


In [17]:
# Feature matrix: the encoded frame restricted to the columns in feats.
X = get_X(train_set_enc, feats)

In [18]:
# Target vector: the values of the 'Price' column.
# NOTE(review): in the previews 'Price' looks like the natural log of
# 'price_value' — confirm before interpreting model errors.
y = get_y(train_set_enc, target_var = 'Price')

In [19]:
# Shuffled 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=0,
)

In [20]:
# Recursive feature elimination driven by a random forest: one feature is
# dropped per round (step=1) until 10 remain, fitted on the training split only.
clf = RandomForestRegressor(
    n_estimators=500,
    max_depth=20,
    criterion='friedman_mse',
    random_state=0,
)
selector = RFE(estimator=clf, n_features_to_select=10, step=1)

selector.fit(X_train, y_train)

In [21]:
# Boolean mask over the input columns (same order as feats):
# True marks a feature kept by RFE.
print(selector.support_)

[ True  True  True  True  True False  True  True  True False False False
 False False False False False False False False  True False  True False
 False False]


In [22]:
# Print the feature names again so the mask above can be matched to them by position.
print(feats)

['Norm Engine capacity', 'Norm Mileage', 'Norm Engine power', 'Norm Age', 'norm_Typ', 'norm_Fuel type', 'norm_Color', 'norm_Region', 'norm_Company', 'Oferta od_Firmy', 'Oferta od_Osoby prywatnej', 'Stan_Nowe', 'Stan_Używane', 'No accident_Nie', 'No accident_Tak', 'Door count_3', 'Door count_5', 'Gearbox_Automatyczna', 'Gearbox_Manualna/Półautomatyczna', 'Transmission_4x4 stały/dołączany automatycznie', 'Transmission_Na przednie koła', 'Transmission_Na tylne koła/4x4 (dołączany ręcznie)', 'Damaged_Nie', 'Damaged_Tak', 'Lacquer_Inny', 'Lacquer_Metalik']


In [26]:
# Columns to exclude so that only the 10 RFE-selected features remain.
# The list is maintained by hand from the printed selector.support_ mask;
# update it if the selection is re-run and the mask changes.
# Each column is listed exactly once ('Region' used to appear twice).
new_black_list = [
    # Raw and target columns that were never model inputs.
    'price_value', 'amenities', 'Engine capacity', 'Mileage', 'Engine power',
    'Price', 'Age', 'Norm amenities',
    'Typ', 'Fuel type', 'Color', 'Region', 'Company',
    # Encoded features whose entry in selector.support_ is False.
    'norm_Fuel type',
    'Oferta od_Firmy', 'Oferta od_Osoby prywatnej',
    'Stan_Nowe', 'Stan_Używane',
    'No accident_Nie', 'No accident_Tak',
    'Door count_3', 'Door count_5',
    'Gearbox_Automatyczna', 'Gearbox_Manualna/Półautomatyczna',
    'Transmission_4x4 stały/dołączany automatycznie',
    'Transmission_Na tylne koła/4x4 (dołączany ręcznie)',
    'Damaged_Tak',
    'Lacquer_Inny', 'Lacquer_Metalik',
]

In [27]:
# Feature names that survive the extended black list.
new_feats = get_feats(train_set_enc, new_black_list)

In [28]:
# Show the final feature names; they should correspond to the True entries
# of selector.support_.
print(new_feats)

['Norm Engine capacity', 'Norm Mileage', 'Norm Engine power', 'Norm Age', 'norm_Typ', 'norm_Color', 'norm_Region', 'norm_Company', 'Transmission_Na przednie koła', 'Damaged_Nie']


In [29]:
# Keep only the selected feature columns. The explicit .copy() makes
# train_set_selected an independent DataFrame, so adding a column to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
train_set_selected = train_set_enc[new_feats].copy()

In [30]:
# Attach the target column, taken from train_set and aligned on the row index,
# so features and target are kept in one frame.
train_set_selected['Price'] = train_set['Price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_selected['Price'] = train_set['Price']


In [31]:
# Preview the selected features together with the target column.
train_set_selected.head()

Unnamed: 0,Norm Engine capacity,Norm Mileage,Norm Engine power,Norm Age,norm_Typ,norm_Color,norm_Region,norm_Company,Transmission_Na przednie koła,Damaged_Nie,Price
0,-0.950847,-1.56425,-0.438799,-1.662566,3,4,13,14,1,1,11.065075
1,-0.279087,-0.262598,-0.285932,-0.314186,6,4,6,57,1,1,10.643041
2,-0.649339,-0.403158,-0.936032,0.37577,1,4,14,59,1,1,9.795345
3,-0.00333,0.329471,0.489637,0.37577,7,4,7,4,1,1,10.545341
4,-0.853436,-0.500523,-1.166697,0.057435,0,2,7,21,1,1,9.994242


In [32]:
# Write the selected features plus target to disk.
# NOTE(review): to_csv writes the row index by default, so the file gets an
# unnamed first column that a plain pd.read_csv loads back as 'Unnamed: 0'.
# Confirm the downstream reader expects that (otherwise pass index=False).
train_set_selected.to_csv('./train_set_selected.csv')

In [38]:
# Serialize the StandardScaler object in its current kernel state
# (presumably so the same scaling can be re-applied outside this notebook — confirm).
with open('scaler.pkl','wb') as f:
    pickle.dump(scaler, f)