In [None]:
import pandas
import seaborn
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

: 

In [None]:
# Load the raw sensor readings and preview ten randomly sampled rows.
dataset = pandas.read_csv("datasets/sensor_readings_24.csv")
dataset.sample(10)

: 

In [None]:
# Summarise the raw frame: column dtypes and non-null counts.
dataset.info()

: 

In [None]:
# Discard rows that have no target label ("funcao_robo").
dataset = dataset.dropna(subset=["funcao_robo"])

: 

In [None]:
# Inspect the target column after removing the unlabelled rows.
dataset["funcao_robo"].info()

: 

In [None]:
# Show every row that still has at least one missing value.
has_missing = dataset.isnull().any(axis=1)
dataset.loc[has_missing]

: 

In [None]:
# The target has four classes (per the original authors' note).
dataset["funcao_robo"].unique()

: 

In [None]:
# Some rows have scattered missing sensor values; fill them with KNNImputer.
labels = dataset["funcao_robo"]

na_imputer = KNNImputer(n_neighbors=3, weights="uniform")
filled_dataset = na_imputer.fit_transform(dataset.drop(["funcao_robo"], axis=1))

# The imputer returns a bare array, so the rebuilt frame gets a fresh 0..n-1 index
# and positional integer column names.
dataset = pandas.DataFrame(filled_dataset)
# `labels` still carries the pre-imputation index (with gaps left by the earlier
# dropna). Assign by position so pandas does not realign on the index, which would
# put labels on the wrong rows and introduce NaN labels.
dataset["labels"] = labels.to_numpy()
dataset

: 

In [None]:
# After imputation, confirm that no missing values remain.
dataset.info()

: 

In [None]:
# Summary statistics for every column.
dataset.describe()

# Because the mean is very close to the standard deviation, we assumed there are
# no outliers, or that they have little impact.
# NOTE(review): comparing mean and standard deviation is a weak outlier check — confirm against the box plot / LOF cells.

: 

In [None]:
dataset_without_label = dataset.drop(columns=["labels"])

# The feature columns are positional integers (the frame was rebuilt from the
# imputer's array), so the column is selected with the int 6; the string "6"
# raises KeyError.
seaborn.boxplot(x=dataset_without_label[6]);

: 

In [None]:
# Flag outliers with Local Outlier Factor. The flags are only counted here;
# nothing downstream in this notebook removes or otherwise handles them.
lof_features = dataset.drop(columns=["labels"])
outliers_identifier = LocalOutlierFactor(n_neighbors=3)
outliers = outliers_identifier.fit_predict(lof_features)

pandas.Series(outliers).value_counts()

: 

In [None]:
# One histogram per feature column, each rendered as its own figure.
for column_name in dataset_without_label.columns:
    feature_values = dataset_without_label[column_name]
    feature_values.plot.hist(legend=True)
    pyplot.show()

: 

In [None]:
# Feature matrix: every column except the target.
X = dataset.drop(columns=["labels"])
X.sample(n=10)

: 

In [None]:
# Encode the class names as integer codes.
encoder = LabelEncoder()
labels = encoder.fit_transform(dataset["labels"])
# Keep the target one-dimensional: a single-column DataFrame is a column vector,
# which scikit-learn estimators warn about (DataConversionWarning) when fitting.
y = pandas.Series(labels, name="labels")
y.sample(10)

: 

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

: 

In [None]:
# Learn the min/max scaling on the training split only, then apply it to both splits.
minmax_scaler = MinMaxScaler().fit(X_train)
X_train = pandas.DataFrame(minmax_scaler.transform(X_train))
X_test = pandas.DataFrame(minmax_scaler.transform(X_test))
X_train.sample(n=10)

: 

In [None]:
# Per-model containers that later receive the best run of each model family.
knn, forest, hgb = dict(), dict(), dict()

: 

In [None]:
def calculate_metrics(y_test, y_pred):
  """Score predictions against the true labels.

  Returns a dict with accuracy under "ACC" and the weighted-average F1,
  recall and precision under "F1", "REC" and "PREC".
  """
  weighted = {"average": "weighted"}
  return {
      "ACC": metrics.accuracy_score(y_test, y_pred),
      "F1": metrics.f1_score(y_test, y_pred, **weighted),
      "REC": metrics.recall_score(y_test, y_pred, **weighted),
      "PREC": metrics.precision_score(y_test, y_pred, **weighted),
  }

def fit_and_predict(model, X_train, X_test, y_train, predict_probability=False):
  """Fit `model` on the training data and predict on `X_test`.

  Returns a `(y_pred, probability)` tuple. `probability` is the result of
  `model.predict_proba(X_test)` when `predict_probability` is truthy,
  otherwise None.
  """
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  probabilities = model.predict_proba(X_test) if predict_probability else None
  return predictions, probabilities

: 

In [None]:
def knn_train_and_fit(X_train, X_test, y_train, y_test, k=0):
    """Train and score a k-nearest-neighbours classifier.

    When `k` is 0, the neighbour count defaults to the truncated square root
    of the combined train + test sample count. The returned metrics dict also
    records the neighbour count that was used under "K".
    """
    neighbours = k if k != 0 else int((len(y_train) + len(y_test)) ** 0.5)

    classifier = KNeighborsClassifier(n_neighbors=neighbours)
    predictions, _ = fit_and_predict(classifier, X_train, X_test, y_train)

    scores = calculate_metrics(y_test, predictions)
    scores["K"] = neighbours
    return scores


def forest_train_and_fit(X_train, X_test, y_train, y_test, estimators, depth, impurity):
    """Train and score a class-balanced random forest.

    Args:
        X_train, X_test: feature matrices for fitting and evaluation.
        y_train, y_test: matching target values.
        estimators: number of trees (`n_estimators`).
        depth: maximum tree depth (`max_depth`).
        impurity: `min_impurity_decrease` threshold for splitting.

    Returns:
        The metrics dict from `calculate_metrics`, extended with the
        hyper-parameters used under "ESTIM", "DEPTH" and "IMPURITY".
    """
    model = RandomForestClassifier(
        n_estimators=estimators,
        max_depth=depth,
        min_impurity_decrease=impurity,
        class_weight="balanced",
        random_state=42,
        verbose=0,
    )

    # Only the hard predictions are scored, so class probabilities are not
    # requested (they were previously computed and then discarded).
    y_pred, _ = fit_and_predict(model, X_train, X_test, y_train)
    training_metrics = calculate_metrics(y_test, y_pred)

    training_metrics["ESTIM"] = estimators
    training_metrics["DEPTH"] = depth
    training_metrics["IMPURITY"] = impurity

    return training_metrics


def hg_boost_train_and_fit(X_train, X_test, y_train, y_test, depth):
    """Train and score a histogram-based gradient boosting classifier.

    Args:
        X_train, X_test: feature matrices for fitting and evaluation.
        y_train, y_test: matching target values.
        depth: maximum tree depth (`max_depth`).

    Returns:
        The metrics dict from `calculate_metrics`, extended with the depth
        used under "DEPTH".
    """
    model = HistGradientBoostingClassifier(max_depth=depth, random_state=77)

    # Only the hard predictions are scored, so class probabilities are not
    # requested (they were previously computed and then discarded).
    y_pred, _ = fit_and_predict(model, X_train, X_test, y_train)

    training_metrics = calculate_metrics(y_test, y_pred)
    training_metrics["DEPTH"] = depth

    return training_metrics

: 

In [None]:
# Sweep the neighbour count. The first value (0) makes knn_train_and_fit fall
# back to its square-root default; 1..19 are used as given.
knn_training = [
    knn_train_and_fit(X_train, X_test, y_train, y_test, neighbours)
    for neighbours in range(20)
]

: 

In [None]:
# Sweep the maximum tree depth from 1 to 19 for gradient boosting.
hgb_training = [
    hg_boost_train_and_fit(X_train, X_test, y_train, y_test, max_depth)
    for max_depth in range(1, 20)
]

: 

In [None]:
# Sweep the number of trees (1, 6, ..., 96) at a fixed depth of 5.
forest_training = [
    forest_train_and_fit(X_train, X_test, y_train, y_test, n_trees, 5, 0.00001)
    for n_trees in range(1, 100, 5)
]

: 

In [None]:
# Sweep the tree depth from 1 to 24 at a fixed 41 trees, adding to the same results list.
forest_training.extend(
    forest_train_and_fit(X_train, X_test, y_train, y_test, 41, max_depth, 0.00001)
    for max_depth in range(1, 25)
)

: 

In [None]:
def get_row_of_max_field(data: list, field: str):
    """Return the row of `data` whose `field` value is the largest.

    Args:
        data: non-empty list of dict-like rows.
        field: key whose value is compared across rows.

    Returns:
        The row with the highest `row[field]`; on ties, the earliest such row.

    Raises:
        IndexError: if `data` is empty.
    """
    if not data:
        raise IndexError("get_row_of_max_field() needs at least one row")

    # max() compares the real values directly, so a legitimate maximum of 0
    # (or all-negative values) is handled without a "not set yet" sentinel.
    return max(data, key=lambda row: row[field])

: 

In [None]:
# For each model family, keep the run with the highest F1 under the 'minmax' key.
for best_store, runs in ((hgb, hgb_training), (forest, forest_training), (knn, knn_training)):
    best_store['minmax'] = get_row_of_max_field(runs, 'F1')

: 

In [None]:
# Group the per-model result dicts by model name (insertion order: knn, forest, hgb).
all_trained = dict(knn=knn, forest=forest, hgb=hgb)

: 

In [None]:
# Display the best run recorded for each model family.
all_trained

: 

In [None]:
# One list per metric, ordered like `all_trained` (knn, forest, hgb).
best_rows = [entry['minmax'] for entry in all_trained.values()]

fOne = [row['F1'] for row in best_rows]
acc = [row['ACC'] for row in best_rows]
rec = [row['REC'] for row in best_rows]
prec = [row['PREC'] for row in best_rows]

: 

In [None]:
# Tick labels for the comparison plot, in the same order as `all_trained`.
modelNames = ['K-Nearest Neighbors', 'Random Forest', 'HGBoost']

: 

In [None]:
# Compare the best run of each model family across the four metrics.
fig, ax = pyplot.subplots(figsize=(15, 7))
ax.plot(modelNames, acc, label='Accuracy')
ax.plot(modelNames, fOne, label='F1 Score')
ax.plot(modelNames, rec, label='Recall')
ax.plot(modelNames, prec, label='Precision')
ax.set_xlabel('Models')
# The metric values are fractions in [0, 1], not percentages.
ax.set_ylabel('Score (0-1)')
ax.set_title("Metrics for Models")
ax.legend();

: 

In [None]:
# Hard-coded choice of the gradient-boosting entry. Note this is the wrapper
# dict whose 'minmax' key holds the metrics row, not the metrics row itself.
best_model = all_trained["hgb"]

: 

In [None]:
fig, ax = pyplot.subplots(figsize=(15, 7))

# `best_model` may be the wrapper dict that stores the winning run under
# 'minmax'; unwrap it so the metric keys resolve.
best_metrics = best_model.get('minmax', best_model)

# The metrics dict has no AUC entry, so only the four available metrics are
# charted; categories, values and colors must have the same length.
categories = ['Recall', 'Precision', 'F1 Score', 'Accuracy']
values = [best_metrics['REC'], best_metrics['PREC'], best_metrics['F1'], best_metrics['ACC']]
colors = ['#DCC48E', '#EAEFD3', '#B3C0A4', '#505168']

ax.bar(categories, values, color=colors)
ax.set_title("Metrics Best Model - HGBoost", fontsize=20)
# The metric values are fractions in [0, 1], not percentages.
ax.set_ylabel('Score (0-1)')
ax.set_xlabel('Metrics')

# Categorical bars sit at x = 0, 1, 2, ...; centre each value label above its bar.
for position, value in enumerate(values):
    ax.text(position, value + 0.01, str(round(value, 2)), ha='center')

: 