In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import joblib
import warnings
warnings.filterwarnings('ignore')

# Veriyi okuyalım

In [2]:
df=pd.read_csv('car_acceptability.txt')
df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,?,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,?,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good
1726,low,low,5more,more,big,high,vgood


# Sütun isimlerini anlamlı hale getirelim

In [3]:
new_columns = ["Buying_Cost","Maintenance_Cost","Door_Count","Capacity(person)","Lug_Boot_Size","Safety","Class"]
df.columns = new_columns

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Buying_Cost       1727 non-null   object
 1   Maintenance_Cost  1728 non-null   object
 2   Door_Count        1728 non-null   object
 3   Capacity(person)  1728 non-null   object
 4   Lug_Boot_Size     1728 non-null   object
 5   Safety            1727 non-null   object
 6   Class             1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


# Hatalı değer kontrolleri yapalım

In [5]:
df['Buying_Cost'].value_counts()

high     432
med      431
low      430
vhigh    429
?          3
-          1
düsük      1
Name: Buying_Cost, dtype: int64

In [6]:
df['Buying_Cost'] = df['Buying_Cost'].replace('düsük', 'low')
df['Buying_Cost'] = df['Buying_Cost'].replace('-', '?')
df['Buying_Cost'].value_counts()

high     432
med      431
low      431
vhigh    429
?          4
Name: Buying_Cost, dtype: int64

In [7]:
df['Maintenance_Cost'].value_counts()

high     432
med      432
low      432
vhigh    428
?          4
Name: Maintenance_Cost, dtype: int64

In [8]:
df['Door_Count'].value_counts()

3        432
4        430
5more    430
2        429
?          3
-          1
44         1
iki        1
5+         1
Name: Door_Count, dtype: int64

In [9]:
df['Door_Count'] = df['Door_Count'].replace('-', '?')
df['Door_Count'] = df['Door_Count'].replace('44', '4')
df['Door_Count'] = df['Door_Count'].replace('iki', '2')
df['Door_Count'] = df['Door_Count'].replace('5+', '5more')
df['Door_Count'].value_counts()

3        432
4        431
5more    431
2        430
?          4
Name: Door_Count, dtype: int64

In [10]:
df['Capacity(person)'].value_counts()

4       576
more    576
2       573
?         3
Name: Capacity(person), dtype: int64

In [11]:
df['Lug_Boot_Size'].value_counts()

big      576
small    575
med      575
?          2
Name: Lug_Boot_Size, dtype: int64

In [12]:
df['Safety'].value_counts()

high    576
med     574
low     572
?         2
-         2
*         1
Name: Safety, dtype: int64

In [13]:
df['Safety'] = df['Safety'].replace('-', '?')
df['Safety'] = df['Safety'].replace('*', '?')
df['Safety'].value_counts()

high    576
med     574
low     572
?         5
Name: Safety, dtype: int64

In [14]:
df['Class'].value_counts()

unacc    1208
acc       384
good       69
vgood      65
?           2
Name: Class, dtype: int64

# NaN değerleri soru işaretleriyle dolduralım. Ve bilinmeyen veri sınıflarını kaldıralım

In [15]:
df=df.fillna('?')
df=df[df['Class']!='?']

# Train Test Split İşlemi

In [16]:
X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=35)

# Catboost'u varsayılan ayarlarda eğitelim. Peki neden Catboost? Çünkü verilerimizin çoğu kategorik veriler ve bilinmeyen ? değerleri ile uğraşmaya gerek kalmıyor

In [17]:
categorical_columns_indices = list(X_train.select_dtypes(include='object').columns)
vanilla_model = CatBoostClassifier(cat_features=categorical_columns_indices,random_seed=35,verbose=0)
vanilla_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x13337bbb410>

# Başarı durumunu gözlemleyelim

In [18]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
y_pred=vanilla_model.predict(X_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         acc       1.00      0.91      0.95        78
        good       0.90      1.00      0.95        18
       unacc       0.97      1.00      0.99       233
       vgood       1.00      0.94      0.97        17

    accuracy                           0.98       346
   macro avg       0.97      0.96      0.96       346
weighted avg       0.98      0.98      0.98       346



## Acc ve Unacc arası hata en önemli hata tipi. Bu hatayı daha da azaltmak için RandomSearch ile optimize edebiliriz

In [19]:
confusion_matrix(y_pred, y_test)

array([[ 71,   1,   6,   0],
       [  0,  18,   0,   0],
       [  0,   0, 233,   0],
       [  0,   1,   0,  16]], dtype=int64)

# Parametre aralıklarını belirleyelim

In [20]:
param_dist = {
    'iterations': np.arange(100, 1000, 50),
    'learning_rate': np.logspace(-3, 0, 100),
    'depth': np.arange(3, 11, 2),
    'l2_leaf_reg': np.logspace(-20, 1, 100),
}

# Random search parametrelerini atayalım ve biraz uzun bir optimizasyona başlayalım

In [21]:
model = CatBoostClassifier(cat_features=categorical_columns_indices,random_seed=53,verbose=0)
random_search = RandomizedSearchCV(
    model, 
    param_distributions=param_dist,
    n_iter=50,
    cv=4,
    verbose=1,
    n_jobs=-1
)

In [22]:
random_search.fit(X_train, y_train)

Fitting 4 folds for each of 50 candidates, totalling 200 fits


# En iyi parametreleri öğrenelim ve başarı durumunu gözlemleyelim

In [23]:
best_params = random_search.best_params_
best_model = random_search.best_estimator_

y_pred = best_model.predict(X_test)
print(classification_report(y_pred,y_test))
print("Best Parameters:", best_params)

              precision    recall  f1-score   support

         acc       1.00      1.00      1.00        71
        good       0.90      1.00      0.95        18
       unacc       1.00      1.00      1.00       239
       vgood       1.00      0.89      0.94        18

    accuracy                           0.99       346
   macro avg       0.97      0.97      0.97       346
weighted avg       0.99      0.99      0.99       346

Best Parameters: {'learning_rate': 0.05722367659350217, 'l2_leaf_reg': 8.111308307896889e-12, 'iterations': 950, 'depth': 5}


# Yeni modelimiz kötü adayları ve kabul edilebilir adayları daha güzel ayırıyor. Çok iyi ve iyi adaylar arasında hata yapsa da kötü adaylar ve kabul edilebilir adaylar arası yapılan hata kadar kritik bir hata değil

In [24]:
confusion_matrix(y_pred, y_test)

array([[ 71,   0,   0,   0],
       [  0,  18,   0,   0],
       [  0,   0, 239,   0],
       [  0,   2,   0,  16]], dtype=int64)

In [25]:
filename='catboost_model.sav'
joblib.dump(best_model,filename)

['catboost_model.sav']