In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
# from skmultilearn.model_selection import iterative_train_test_split
# from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline

from sklearn.multiclass import OneVsOneClassifier

# from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score

from sklearn.inspection import permutation_importance

In [3]:
rs = 42
file_path = "db\db-v1.4.2.csv"

In [4]:
df = pd.read_csv(file_path)

# df["Poles"] = np.where(df["Type"]=='OH6', 2, np.floor(3000 / df["Speed"] * 2))
df = df[df["Type"].isin(["OH2", "OH6", "BB1", "BB2", "BB3", "BB5"])][["Type", "Q", "H"]].dropna()

# poles_to_string = {2: "Two", 4: "Four", 6: "Six"}
# df["Poles"] = df["Poles"].map(poles_to_string)

In [5]:
X, y = df[["Q", "H"]].values, df["Type"].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=rs)

In [7]:
pipe_rf = make_pipeline(StandardScaler(), 
                        OneVsOneClassifier(RandomForestClassifier(max_depth=10), n_jobs=-1))
pipe_rf.fit(X_train, y_train)

In [8]:
pipe_rf.score(X_test, y_test)

0.9361702127659575

In [9]:
y_pred = pipe_rf.predict(X_test)
f1_score(y_pred, y_test, average="weighted")

0.937710233029382

In [10]:
feature_names = ["Q", "H"]
result = permutation_importance(pipe_rf, X_test, y_test, n_repeats=10, random_state=rs)
forest_importances = pd.Series(result.importances_mean, index=feature_names)

In [11]:
forest_importances

Q    0.137234
H    0.204255
dtype: float64

In [12]:
from joblib import dump, load

In [13]:
dump(pipe_rf, "model/type_model.joblib")

['model/type_model.joblib']