# Necessary imports

In [46]:
%pip install scikit-learn
%pip install seaborn
%pip install pandas

import pandas
import seaborn

# Tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import classification_report, confusion_matrix


# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Definindo constantes

In [47]:
RANDOM_SEED = 1337
N_JOBS = -1
FOLDS = 10

# Carregamento dos dados

In [48]:
def load_data(file_path: str, positive_label_multiplication: int = 2) -> pandas.DataFrame:
    """ Receives a file path for the dataset training, testing and validation datasets. """

    # Loading data from csv file
    df = pandas.read_csv(file_path)

    # Selecting useful features
    useful_features = [
        "Bidder_Tendency",
        "Bidding_Ratio",
        "Successive_Outbidding",
        "Last_Bidding",
        "Auction_Bids",
        "Starting_Price_Average",
        "Early_Bidding",
        "Winning_Ratio",
        "Auction_Duration",
        "Class"
    ]

    df = df[useful_features]

    # Augmenting positive label data
    positive_labels = df[df["Class"] == 1]

    dfs_to_concat = [df]
    for _ in range(positive_label_multiplication):
        dfs_to_concat.append(positive_labels)

    df = pandas.concat(dfs_to_concat)
    df = df.sample(frac=1)

    # Separating features and labels
    columns = list(df.columns)
    features = columns[:len(columns)-1]
    label = columns[len(columns)-1:]

    X = df[features]
    y = df[label]

    # Creating training, testing and validation datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = RANDOM_SEED)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.125, random_state = RANDOM_SEED)

    return X_train, y_train, X_test, y_test, X_valid, y_valid

In [49]:
file_path = "./dataset/Shill Bidding Dataset.csv"

X_train, y_train, X_test, y_test, X_valid, y_valid = load_data(file_path)

# Declarão de função para treinamento de modelos

A ferramenta gridsearch será usada para determinar a melhor combinação de parâmetros para cada modelo.

Uma função search é definida, que recebe uma instância de modelo a ser "variado" e um dicionário de parâmetros e suas variações. 

In [50]:
def search(model, model_parameters): 
    """ """
    model_pipeline = Pipeline([
        ('clf', model)
    ])

    grid_search = GridSearchCV(
        model_pipeline,
        model_parameters,
        n_jobs=N_JOBS,
        cv=FOLDS,
        verbose=1
    )

    grid_search.fit(X_train, y_train.values.ravel())

    print(f"Best score after optimization: {grid_search.best_score_}")
    print("Best params:")
    for key, value in grid_search.best_params_.items():
        print(f"{key}: {value}")

    return grid_search

# Variando K-NN

Default para leaf_size é 30, então decidimos um range de 20 a 40
p=0-> minkowski_distance,  p=1-> Manhattan distance, p=2-> Euclidean distance

In [None]:
knn_parameters = {

    "clf__n_neighbors":range (4,10),
    "clf__weights":["uniform", "distance"],
    "clf__algorithm":["auto", "ball_tree", "kd_tree", "brute"],
    "clf__leaf_size":range(20,40),
    "clf__p":range(1,3)
    
}

gs_knn = search(KNeighborsClassifier(), knn_parameters)

# Variando LVQ

# Variando Decision Tree

Max_depth começa com 3 porque é o minímo recomendado pelo Scikitlearn e vai até 50 porque raramente é necessário mais que 50. 
Min_samples_leaf foi recomendado ser entre 1 e 20

In [None]:
dt_parameters = {
    "clf__criterion": ["gini", "entropy", "log_loss"],
    "clf__splitter": ["best", "random"],
    "clf__max_features": [0.2,0.4,0.6,0.8, None, "sqrt", "log2"],
    "clf__max_depth": range(3, 50),
    #"clf__min_samples_split": range(2, 10),
    #"clf__min_samples_leaf": range (1,20)

}

gs_dt = search(DecisionTreeClassifier(), dt_parameters)

# Variando SVM

Como há muito mais exemplos da classe 0 que a classe 1, o peso dado aos exemplos da classe 1 foi maior para 1 melhor treinamento. 
O C foi entre 0.1 e 1.0 pois foram vistos exemplos de outros modelos sendo treinados e esses eram os valores vistos.
O gamma é o coeficiente do kernel e o C o parâmetro de regularização. 

In [None]:
svm_parameters = {
    "clf__kernel": ["linear", "poly", "rbf", "sigmoid"],
    "clf__gamma":["scale", "auto"],
    "clf__C": [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    "clf__degree":range (1,4), #degree of polinomial function
    #"clf__class_weight":[0: 0.2, 1: 0.8],
    #"clf__decision_function_shape":["ovr","ovo"] #one-vs-rest,one-vs-one
}

gs_svm = search(SVC(), svm_parameters)

# Variando Random Forest
Parâmetros e ranges escolhidos para a variação:
1. n_estimators [50:150]: Número de árvores de decisão presentes na floresta.
2. criterion ["gini", "entropy", "log_loss"]: Algoritmo para medição de qualidade de divisão de nós. Servem para determinar quais as melhores features para estarem mais perto do topo do árvore (ou seja, que apresentam maior ganho de informação).
3. max_depth [3:50]: Profundida máxima da árvore. O próprio SKLearn recomenda 3 como mínimo. O máximo depende muito das situações, mas via de regra, uma árvore mais profunda representa um classificador com risco de overfitting. Manter uma profundidade balanceada pode garantir um modelo mais genérico que consegue lidar com registros novos (diferentes dos que foram usado para o treino) sem muitos problemas.
4. max_features ["sqrt", "log2", None]: Número de atributos a serem considerados na hora de dividir.

In [51]:
rf_parameters = {
    "clf__n_estimators": range(75, 125),
    "clf__criterion": ["gini", "entropy", "log_loss"],
    "clf__max_depth": range(3, 20),
    "clf__max_features": [0.2, 0.4, 0.6, 0.8, None, "sqrt", "log2"]
}

gs_rf = search(RandomForestClassifier(random_state=RANDOM_SEED), rf_parameters)

Fitting 10 folds for each of 49350 candidates, totalling 493500 fits
Best score after optimization: 0.9986964618249534
Best params:
clf__criterion: entropy
clf__max_depth: 8
clf__max_features: 0.8
clf__n_estimators: 106


# Variando Rede Neural MLP
Parâmetros e ranges escolhidos para a variação:
1. hidden_layer_sizes [(100,), (50, 50,), (33, 33, 34,), (25, 25, 25, 25,)]: Quantidade de camadas internas do perceptron (e o número de neurons em cada uma)
2. Activation ["identity", "logistic", "tanh", "relu"]: Função de ativação para os neurons. 
3. Solver ["lbfgs", "sgd"]: De acordo com o sklearn, o solver "adam" funciona melhor com datasets maiores (na casa das dezenas de milhares). Como nosso dataset tem uma escala menor, seu uso não é recomendado, pois as alternativas convergem mais rapidamente e performam melhor.

In [52]:
mlp_parameters = {
    "clf__hidden_layer_sizes": [(100,), (50, 50,), (33, 33, 34,), (25, 25, 25, 25,)],
    "clf__activation": ["identity", "logistic", "tanh", "relu"],
    "clf__solver": ["lbfgs", "sgd", "adam"]
}

gs_mlp = search(MLPClassifier(random_state=RANDOM_SEED, verbose=0), mlp_parameters)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Best score after optimization: 0.9983240223463687
Best params:
clf__activation: logistic
clf__hidden_layer_sizes: (100,)
clf__solver: lbfgs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


# Variando Comitê de redes neurais Artificiais
Como o algoritmo Bagging é um ensemble e pode receber vários classificadores, decidimos instanciar o MLP com o resultado de sua otimização, para garantir maior qualidade do comitê.
Parâmetros e ranges escolhidos para a variação:
1. n_estimators [5:15]: Número de classificadores presentes no comitê.
2. max_samples [0.2:1]: Número de registros a serem extraídos de X na hora de treinar cada classificador do comitê.
3. max_features [0.2:1]: Número de atributos a serem considerados na hora de treinar cada classificador do comitê.

In [53]:
ensemble_mlp_parameters = {
    "clf__n_estimators": range(5, 15),
    "clf__max_samples": [0.2, 0.4, 0.6, 0.8, 1],
    "clf__max_features": [0.2, 0.4, 0.6, 0.8, 1]
}

ideal_mlp = MLPClassifier(
    hidden_layer_sizes=(33, 33, 34,),
    activation="tanh",
    solver="lbfgs",
    random_state=RANDOM_SEED,
    verbose=0
)

gs_ensemble_mlp = search(BaggingClassifier(ideal_mlp, random_state=RANDOM_SEED), ensemble_mlp_parameters)

Fitting 10 folds for each of 250 candidates, totalling 2500 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Best score after optimization: 0.9979515828677838
Best params:
clf__max_features: 0.8
clf__max_samples: 0.8
clf__n_estimators: 14


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


# Variando Comitê Heterogêneo
Como o algoritmo VotingClassifier é um ensemble e pode receber vários classificadores diferentes, decidimos instanciá-los já com os melhores parâmetros provenientes de suas otimizações. Os classificadores escolhidos para o VotingClassifier foram:
- MLP
- Random Forest
- SVM
- KNN

Parâmetros e ranges escolhidos para a variação:
1. voting ["hard", "soft"]: Regra para definir vencedor da votação (maioria simples com hard, probabilidade de classes com soft).
2. weights [None, [2.5, 2, 1.5, 1]]: pesos atribuídos para cada classificador. No caso de None, o peso é uniforme. Caso contrário, os pesos passados são atribuídos. Declaramos os classificadores em ordem do mais preciso para o menos preciso, garantindo que os melhores classificadores influenciem mais na decisão.

In [54]:
voting_ensemble_parameters = {
    "clf__voting": ["hard", "soft"],
    "clf__weights": [None, [2.5, 2, 1.5, 1]]
}

estimators = [
    ("mlp", MLPClassifier(
        hidden_layer_sizes=(33, 33, 34,),
        activation="tanh",
        solver="lbfgs",
        random_state=RANDOM_SEED
    )),
    ("rf", RandomForestClassifier(
        n_estimators=106,
        criterion="entropy",
        max_depth=8,
        max_features=0.8,
        random_state=RANDOM_SEED
    )),
    ("svm", SVC(

        random_state=RANDOM_SEED
    )),
    ("knn", KNeighborsClassifier(
        
        random_state=RANDOM_SEED
    ))
]

# TODO: add svc and knn best params

voting_ensemble = search(VotingClassifier(estimators), voting_ensemble_parameters)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/alps2/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/alps2/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/alps2/.local/lib/python3.10/site-packages/sklearn/ensemble/_voting.py", line 351, in fit
    return super().fit(X, transformed_y, sample_weight)
  File "/home/alps2/.local/lib/python3.10/site-packages/sklearn/ensemble/_voting.py", line 68, in fit
    names, clfs = self._validate_estimators()
  File "/home/alps2/.local/lib/python3.10/site-packages/sklearn/ensemble/_base.py", line 262, in _validate_estimators
    names, estimators = zip(*self.estimators)
TypeError: 'MLPClassifier' object is not iterable
