## Libs

In [15]:
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline 
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl

In [103]:
def print_metrics(true, pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Each score is computed by the matching ``sklearn.metrics`` function,
    called as ``scorer(true, pred)``, and written to stdout on its own line
    as ``<name>: <score>`` with two decimals.

    Parameters
    ----------
    true : labels passed as the first argument to every scorer.
    pred : labels passed as the second argument to every scorer.

    Returns
    -------
    None. The scores are only printed.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    # Order matters: it is the order in which the lines are printed.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(true, pred):.2f}")

## Read data

In [149]:
# The file holds both datasets merged into one (so fixes could be applied to
# both at once); they are split apart again further down.
df = pd.read_csv("interdata/df_fixed_01.csv")

# Parse the two date columns and store them as integer timestamps so they can
# be used as numeric features.
# "int64" is spelled out on purpose: the builtin `int` can resolve to a 32-bit
# dtype on some platforms (e.g. Windows with NumPy 1.x), and pandas does not
# support casting datetime data to a 32-bit integer.
df['repair_date'] = pd.to_datetime(df['repair_date']).astype("int64")
df['breakdown_date'] = pd.to_datetime(df['breakdown_date']).astype("int64")

# Keep only the columns used from here on, with 'Label' last.
df = df[['ID', 'Maker', 'Adv_year', 'Adv_month',
       'Color', 'Reg_year', 'Bodytype', 'Runned_Miles', 'Engin_size',
       'Gearbox', 'Fuel_type', 'Price', 'Seat_num', 'Door_num', 'issue',
       'issue_id', 'Adv_day', 'breakdown_date', 'repair_complexity',
       'repair_cost', 'repair_hours', 'repair_date', 'value', 'Label']]

In [240]:
# Remove a leftover `df_test` if there is one. An unconditional `del df_test`
# raises NameError on a fresh kernel when no earlier cell has defined it.
if "df_test" in globals():
    del df_test

In [241]:
# Rows without a Label go to df_val, rows with a Label go to df_train.
# .copy() makes both frames independent of `df`, so writing a column into
# either of them later does not raise SettingWithCopyWarning.
df_val = df[df['Label'].isna()].copy()
df_train = df[df['Label'].notna()].copy()

# NOTE: the first message says "df_test" but reports the shape of df_val.
# The text is left as is so it still matches the recorded output.
print("Dimensions of the df_test:", df_val.shape)
print("Dimensions of the df_train:", df_train.shape)

Dimensions of the df_test: (16129, 24)
Dimensions of the df_train: (37637, 24)
Dimensions of the df_test: (16129, 24)
Dimensions of the df_train: (37637, 24)


In [242]:
# Target and features from the labelled rows, then a hold-out split with
# 25% of the rows kept for testing (seeded for a repeatable split).
y = df_train["Label"]
X = df_train.drop(columns=["Label"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Pipelines

### baseline: logistic regression

In [152]:
# Column lists for the ColumnTransformer. Both are derived from the feature
# frame X (not from df), so they can only name columns that are actually
# passed to the pipeline.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Everything that is not an object column is treated as numerical.
numerical_features = [col for col in X.columns.tolist() if col not in categorical_features]

# Scale the numerical columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='infrequent_if_exist'), categorical_features)
    ])

In [153]:
# Baseline model: the shared preprocessing followed by a logistic regression
# with default settings.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

# fit() returns the fitted pipeline, so training and predicting on the
# hold-out split can be chained.
y_pred_lm = pipeline.fit(X_train, y_train).predict(X_test)

In [154]:
# Hold-out metrics for the logistic-regression baseline.
print_metrics(true=y_test, pred=y_pred_lm)
# Optional plot: ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred_lm)).plot()

accuracy: 0.88
precision: 0.61
recall: 0.43
f1: 0.51
accuracy: 0.88
precision: 0.61
recall: 0.43
f1: 0.51


### XGBClassifier

In [206]:
# Two-step pipeline: a TargetEncoder followed by a seeded XGBClassifier.
# The step names ('encoder', 'clf') are what the search space keys refer to.
estimators = [('encoder', TargetEncoder()), ('clf', XGBClassifier(random_state=42))]
pipe = Pipeline(estimators)
pipe  # last expression of the cell, so the notebook renders the pipeline

In [222]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Ranges explored for the 'clf' step of `pipe`; keys use the
# <step>__<parameter> form.
search_space = dict(
    clf__max_depth=Integer(2, 8),
    clf__learning_rate=Real(0.001, 0.3, prior='log-uniform'),
    clf__subsample=Real(0.5, 1.0),
    clf__colsample_bytree=Real(0.5, 1.0),
    clf__colsample_bylevel=Real(0.5, 1.0),
    clf__colsample_bynode=Real(0.5, 1.0),
    clf__reg_alpha=Real(0.0, 10.0),
    clf__reg_lambda=Real(0.0, 10.0),
    clf__gamma=Real(0.0, 10.0),
)

# Bayesian search over the space above: 20 iterations, 5-fold CV, scored by F1.
# cv and n_iter are kept small here; higher values may be worth it in practice.
opt = BayesSearchCV(
    estimator=pipe,
    search_spaces=search_space,
    n_iter=20,
    scoring='f1',
    cv=5,
    random_state=42,
)

In [223]:
# Run the hyper-parameter search on the training split.
opt.fit(X=X_train, y=y_train)

In [225]:
# Best score found by the search (the search was set up with scoring='f1', cv=5).
opt.best_score_

np.float64(0.7225006334205308)

In [230]:
# Evaluate the best pipeline found by the search on the hold-out split.
y_pred = opt.best_estimator_.predict(X_test)
print_metrics(true=y_test, pred=y_pred)

accuracy: 0.92
precision: 0.69
recall: 0.77
f1: 0.73
accuracy: 0.92
precision: 0.69
recall: 0.77
f1: 0.73


In [247]:
# Show the columns of df_val before predicting on it.
df_val.columns

Index(['ID', 'Maker', 'Adv_year', 'Adv_month', 'Color', 'Reg_year',
       'Bodytype', 'Runned_Miles', 'Engin_size', 'Gearbox', 'Fuel_type',
       'Price', 'Seat_num', 'Door_num', 'issue', 'issue_id', 'Adv_day',
       'breakdown_date', 'repair_complexity', 'repair_cost',
       'repair_hours', 'repair_date', 'value', 'Label'],
      dtype='object')

In [258]:
# Predict labels for df_val.
# errors='ignore' turns the drop into a no-op when 'Label' is already gone
# (e.g. when this cell is re-run). It replaces a bare try/except that would
# also have hidden any unrelated error.
df_val = df_val.drop(columns='Label', errors='ignore')
y_validation = opt.best_estimator_.predict(df_val)


In [266]:
import os

# Attach the predictions. assign() builds a new frame instead of writing into
# df_val in place, which avoids SettingWithCopyWarning when df_val is a slice
# of another frame.
df_val = df_val.assign(Label=y_validation)

# Only the ID and the predicted Label are exported.
cols_to_export = ['ID', 'Label']
df_export = df_val[cols_to_export]

# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs("outdata", exist_ok=True)
df_export.to_csv("outdata/predicted.csv", index=False)

### Random Forests

In [142]:
# Random forest on top of the same preprocessing as the logistic-regression
# baseline. random_state is fixed (42, like the train/test split and the
# XGBoost model) so the forest and its metrics are reproducible across runs.
pipe_rf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestClassifier(random_state=42))])

# Fit on the training split.
pipe_rf.fit(X_train, y_train)

# Predict on the hold-out split.
# NOTE: the result is stored in y_pred_lm, the name the logistic-regression
# cell also uses, so it overwrites those predictions. The name is kept
# unchanged so cells that read y_pred_lm after this point keep working.
y_pred_lm = pipe_rf.predict(X_test)

In [144]:
# Take the random-forest predictions straight from pipe_rf instead of copying
# y_pred_lm: that name is shared with the logistic-regression cell, so its
# content depends on which cell ran last.
y_pred_rf = pipe_rf.predict(X_test)

In [145]:
# Hold-out metrics for the random forest.
print_metrics(true=y_test, pred=y_pred_rf)


accuracy: 0.91
precision: 0.69
recall: 0.63
f1: 0.66
accuracy: 0.91
precision: 0.69
recall: 0.63
f1: 0.66
