<a href="https://colab.research.google.com/github/arturrur/mc853/blob/main/fun%C3%A7%C3%B5es.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing necessary libraries

import pandas as pd                       # For data manipulation and analysis
import numpy as np                        # For numerical computing
import time                               # For tracking time
import math                               # For mathematical operations
import warnings                           # For managing warnings

import shap                               # For SHAP (SHapley Additive exPlanations) values

import imblearn                           # For dealing with imbalanced datasets
from imblearn.over_sampling import RandomOverSampler   # For oversampling
from imblearn.under_sampling import RandomUnderSampler # For undersampling


import seaborn as sns                     # For statistical data visualization
import matplotlib.pyplot as plt           # For creating visualizations
import matplotlib.patches as mpatches     # For drawing patches in plots
import matplotlib.colors as mcolors       # For defining custom colors in plots
import matplotlib.ticker as ticker        # For formatting tick marks on plots
from matplotlib.ticker import FuncFormatter         # For custom tick formatting
from matplotlib.ticker import MaxNLocator

from sklearn.preprocessing import StandardScaler      # For feature scaling
from sklearn.model_selection import (StratifiedKFold) # For splitting data into train and test sets

from sklearn.metrics import (roc_auc_score,           # For evaluating model performance
                             recall_score)

from sklearn.svm import SVC                           # For Support Vector Classifier
from sklearn.linear_model import LogisticRegression   # For Logistic Regression Classifier
from sklearn.ensemble import (RandomForestClassifier, # For ensemble classifiers
                              GradientBoostingClassifier,
                              BaggingClassifier)

from sklearn.neural_network import MLPClassifier      # For Multi-layer Perceptron Classifier

from sklearn.impute import KNNImputer

import re
from sklearn.calibration import CalibratedClassifierCV


In [2]:
# Cross-validation splitter used by evaluate_cv: 5 stratified folds, shuffled
# with a fixed seed so the fold assignment is the same on every run.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

# Feature scaler shared by the evaluation functions. They call fit_transform
# on the training data and transform on the test data, so it is re-fitted on
# every use.
preprocessing = StandardScaler()

# KNN imputer (3 neighbours) shared by the evaluation functions; used the same
# way as the scaler: fitted on training data, then applied to test data.
imputer = KNNImputer(n_neighbors=3)

In [3]:
# Classifiers compared by the evaluation functions, keyed by the short name
# that becomes the column label in the result tables. The evaluation functions
# call predict_proba on every entry, hence probability=True on the SVCs.
algorithms = {
    # Support Vector Classifier, linear kernel
    'svc_linear': SVC(kernel='linear', probability=True, random_state=0),

    # Support Vector Classifier, radial basis function (RBF) kernel
    'svc_rbf': SVC(kernel='rbf', probability=True, random_state=0),

    # Random Forest
    'random_forest': RandomForestClassifier(random_state=0),

    # Gradient Boosting
    'gradient_boosting': GradientBoostingClassifier(random_state=0),

    # Logistic Regression (library defaults)
    'logistic_regression': LogisticRegression(),

    # Bagging ensemble
    'bagging': BaggingClassifier(random_state=0),

    # Multi-layer Perceptron
    'mlp': MLPClassifier(random_state=0),
}

In [4]:
def data_sample(X, y):
    '''
    Rebalance a binary (0/1) dataset with random over- and under-sampling.

    The decision is driven by ``ratio = minority count / majority count``:

    * ``ratio > 0.6``: the data is considered balanced enough and is
      returned untouched (the very same objects that were passed in).
    * otherwise the majority class is randomly undersampled down to the size
      of the minority class. Before that, if the minority class is small
      enough that raising it to 20% of the majority requires at least one
      new row, it is first randomly oversampled up to that 20% level.

    Parameters:
        X : array-like, shape (n_samples, n_features)
            The feature matrix.
        y : array-like, shape (n_samples,)
            The target variable; classes are counted with ``y == 0`` and
            ``y == 1``.

    Returns:
        X_resampled : array-like, shape (n_samples_resampled, n_features)
            The resampled feature matrix.
        y_resampled : array-like, shape (n_samples_resampled,)
            The resampled target variable.
    '''
    # Thresholds, both expressed as minority/majority ratios
    no_resampling_above = 0.6   # above this ratio nothing is resampled
    oversample_target = 0.2     # ratio the minority class is raised to

    # Size of each class and which one is the minority
    count_1 = (y == 1).sum()
    count_0 = (y == 0).sum()
    count_min = min(count_0, count_1)
    count_max = max(count_0, count_1)

    # NaN when neither label occurs: every comparison below is then False and
    # the data goes straight to undersampling, without a divide-by-zero warning.
    ratio = count_min / count_max if count_max else float('nan')

    if ratio > no_resampling_above:
        return X, y

    X_resampled, y_resampled = X, y

    # Oversample only when it would actually create rows. Gating on
    # `ratio <= 0.2` alone also lets through ratios at (or a hair below) the
    # target, where zero rows would be generated and the over-sampler rejects
    # the request instead of being a no-op.
    n_to_generate = int(count_max * oversample_target - count_min)
    if n_to_generate > 0:
        oversample = RandomOverSampler(sampling_strategy=oversample_target, random_state=1)
        X_resampled, y_resampled = oversample.fit_resample(X_resampled, y_resampled)

    # Bring the majority class down to the (possibly oversampled) minority size
    undersample = RandomUnderSampler(sampling_strategy='majority', random_state=1)
    X_resampled, y_resampled = undersample.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled


In [5]:
def impute_missing(data, n_neighbors=3):
    """
    Fill the missing values of a DataFrame with K-nearest-neighbours imputation.

    Parameters:
        data (pd.DataFrame): Frame that may contain missing values.
        n_neighbors (int, optional): How many neighbours the imputer uses. Defaults to 3.

    Returns:
        pd.DataFrame: A new frame with the imputed values, carrying the same
        column labels and index as ``data``.
    """
    # A fresh imputer per call, fitted on the very data it fills in
    knn = KNNImputer(n_neighbors=n_neighbors)
    filled = knn.fit_transform(data)

    # fit_transform hands back a bare array; re-attach the original labels
    return pd.DataFrame(filled, index=data.index, columns=data.columns)


In [6]:
from sklearn.metrics import precision_score

def evaluate_cv(data):
    '''
    Cross-validate every classifier in ``algorithms`` and summarise five metrics.

    For each fold produced by the module-level ``kf`` splitter, the training
    part is rebalanced with ``data_sample``; both parts are then imputed and
    standardised with the module-level ``imputer`` and ``preprocessing``
    objects, which are fitted on the training part only. Each classifier is
    fitted on the training part and scored on the untouched test part.

    Side effects: the estimators stored in ``algorithms`` are fitted in place,
    the summary table is shown with ``display`` and the elapsed time is
    printed. Because the table is both displayed and returned, ending a
    notebook cell with a bare call shows it twice; append ``;`` to the call to
    avoid that.

    Parameters:
    data : DataFrame
        The dataset; the last column is taken as the target and every other
        column as a feature.

    Returns:
    styled : pandas Styler
        The captioned summary table (``DataFrame.style``), one column per
        algorithm and, for each metric, a ``(mean)`` and a ``(stdev)`` row
        computed across the folds. Metrics: ``auc`` (ROC AUC), ``rcl_1``
        (recall of class 1, sensitivity), ``rcl_0`` (recall of class 0,
        specificity), ``prc_1`` and ``prc_0`` (precision of class 1 / class 0).
    '''
    # Record the start time
    start_time = time.time()

    # The target is the last column; everything else is a feature
    target_feature = data.columns[-1]
    X = data.drop(columns=[target_feature])
    y = data[target_feature]

    # metric -> algorithm -> list of per-fold scores.
    # The order of metric_names is the row order of the final table.
    metric_names = ('auc', 'rcl_1', 'rcl_0', 'prc_1', 'prc_0')
    scores = {metric: {name: [] for name in algorithms} for metric in metric_names}

    # Iterate through each round of the cross-validation
    for train, test in kf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        # Rebalance the training part only; the test part keeps its distribution
        X_train, y_train = data_sample(X_train, y_train)

        # Impute, then standardise; both are fitted on the training part only
        X_train = imputer.fit_transform(X_train)
        X_test = imputer.transform(X_test)

        X_train = preprocessing.fit_transform(X_train)
        X_test = preprocessing.transform(X_test)

        for name, clf in algorithms.items():
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            # average=None gives one value per label, in the order of `labels`
            recall = recall_score(y_test, y_pred, labels=[0, 1], average=None)
            precision = precision_score(y_test, y_pred, labels=[0, 1], average=None)

            # Column 1 of predict_proba is used as the score of class 1
            positive_proba = clf.predict_proba(X_test)[:, 1]

            scores['auc'][name].append(roc_auc_score(y_test, positive_proba))
            scores['rcl_1'][name].append(recall[1])
            scores['rcl_0'][name].append(recall[0])
            scores['prc_1'][name].append(precision[1])
            scores['prc_0'][name].append(precision[0])

    # One column per algorithm, a mean row and a stdev row per metric. The rows
    # are built from `algorithms` itself, so adding or removing a classifier
    # needs no change here.
    summary = pd.DataFrame(columns=list(algorithms.keys()))
    for metric in metric_names:
        per_algorithm = scores[metric]
        summary.loc[f'{metric} (mean)'] = [np.mean(per_algorithm[name]) for name in algorithms]
        summary.loc[f'{metric} (stdev)'] = [np.std(per_algorithm[name]) for name in algorithms]

    # Attach the caption (this turns the DataFrame into a Styler)
    styled = summary.style.set_caption('Average performance and standard deviation among 5-fold cross-validation')

    # Calculate the time taken
    total_time = time.time() - start_time

    # Display the table
    display(styled)

    # Print the total time taken to run cross-validation
    print(f"Total time taken to run cross-validation: {total_time:.2f} seconds")

    return styled

In [7]:
from sklearn.metrics import precision_score

def evaluate_external(data, data_test):
    '''
    Train every classifier in ``algorithms`` on ``data`` and score it on ``data_test``.

    The training set is rebalanced with ``data_sample``; both sets are then
    imputed and standardised with the module-level ``imputer`` and
    ``preprocessing`` objects, which are fitted on the training set only.

    Side effects: the estimators stored in ``algorithms`` are fitted in place,
    the result table is shown with ``display`` and the elapsed time is
    printed. Because the table is both displayed and returned, ending a
    notebook cell with a bare call shows it twice; append ``;`` to the call to
    avoid that.

    Parameters:
    data : DataFrame
        The training dataset; its last column is taken as the target.
    data_test : DataFrame
        The test dataset. It must contain a column with the same name as the
        target column of ``data``.

    Returns:
    styled : pandas Styler
        The captioned result table (``DataFrame.style``), one column per
        algorithm and one row per metric: ``auc`` (ROC AUC), ``rcl_1``
        (recall of class 1, sensitivity), ``rcl_0`` (recall of class 0,
        specificity), ``prc_1`` and ``prc_0`` (precision of class 1 / class 0).
    '''
    # Record the start time
    start_time = time.time()

    # The target is the last column of the training data
    target_feature = data.columns[-1]

    # Separate features (X) and target (y) for training data
    X = data.drop(columns=[target_feature])
    y = data[target_feature]

    # Separate features (X) and target (y) for test data, using the same target name
    X_test = data_test.drop(columns=[target_feature])
    y_test = data_test[target_feature]

    # metric -> algorithm -> score.
    # The order of metric_names is the row order of the final table.
    metric_names = ('auc', 'rcl_1', 'rcl_0', 'prc_1', 'prc_0')
    scores = {metric: {} for metric in metric_names}

    # Rebalance the training data only
    X_train, y_train = data_sample(X, y)

    # Impute, then standardise; both are fitted on the training data only
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    X_train = preprocessing.fit_transform(X_train)
    X_test = preprocessing.transform(X_test)

    for name, clf in algorithms.items():
        # Train model and predict the test data
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # average=None gives one value per label, in the order of `labels`
        recall = recall_score(y_test, y_pred, labels=[0, 1], average=None)
        precision = precision_score(y_test, y_pred, labels=[0, 1], average=None)

        # Column 1 of predict_proba is used as the score of class 1
        positive_proba = clf.predict_proba(X_test)[:, 1]

        scores['auc'][name] = roc_auc_score(y_test, positive_proba)
        scores['rcl_1'][name] = recall[1]
        scores['rcl_0'][name] = recall[0]
        scores['prc_1'][name] = precision[1]
        scores['prc_0'][name] = precision[0]

    # One column per algorithm, one row per metric. The rows are built from
    # `algorithms` itself, so adding or removing a classifier needs no change here.
    results = pd.DataFrame(columns=list(algorithms.keys()))
    for metric in metric_names:
        results.loc[metric] = [scores[metric][name] for name in algorithms]

    # Attach the caption (this turns the DataFrame into a Styler)
    styled = results.style.set_caption('Performance for external validation')

    # Calculate the time taken
    total_time = time.time() - start_time

    # Display the table
    display(styled)

    # Print the total time taken to run external-validation
    print(f"Total time taken to run external-validation: {total_time:.2f} seconds")

    return styled

Exploração dos dados (sem tratamento)

In [8]:
# Load the raw training set; evaluate_cv treats its last column as the target.
df = pd.read_csv('https://raw.githubusercontent.com/arturrur/mc853/refs/heads/main/data/training/treino.csv')

def is_number(x):
    """
    Tell whether ``x`` can be converted with ``float()``.

    Parameters:
        x: Any value (number, string, None, ...).

    Returns:
        bool: True if ``float(x)`` succeeds, False if it raises TypeError,
        ValueError or OverflowError. Strings such as ``"1e3"``, ``"nan"`` and
        ``"inf"`` count as numbers because ``float`` accepts them.
    """
    try:
        float(x)
    # Only conversion failures mean "not a number"; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Flag every cell whose value cannot be parsed as a number.
mask = ~df.map(is_number)

# Stack first and filter afterwards, so that only the flagged (row, column)
# pairs remain. This does not depend on whether stack() drops missing entries.
stacked = mask.stack()
non_numeric_positions = stacked[stacked]

# Replace the first non-numeric cell with 1. The guard keeps the cell runnable
# when the file contains no non-numeric value.
if not non_numeric_positions.empty:
    row, col = non_numeric_positions.index[0]
    df.at[row, col] = 1

evaluate_cv(df)



Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc (mean),0.840261,0.834347,0.843195,0.847594,0.841821,0.803941,0.78757
auc (stdev),0.010062,0.008164,0.007984,0.008779,0.01082,0.008631,0.011529
rcl_1 (mean),0.759374,0.740111,0.691983,0.759369,0.755528,0.620778,0.636195
rcl_1 (stdev),0.027116,0.040162,0.03637,0.028164,0.033271,0.031057,0.029335
rcl_0 (mean),0.771024,0.782343,0.823995,0.776231,0.775552,0.81777,0.77725
rcl_0 (stdev),0.010032,0.017033,0.014692,0.012774,0.012479,0.014517,0.012587
prc_1 (mean),0.280706,0.28601,0.316726,0.28547,0.283769,0.286486,0.251548
prc_1 (stdev),0.007665,0.007819,0.012821,0.006564,0.007253,0.009785,0.004794
prc_0 (mean),0.964637,0.962521,0.957964,0.964889,0.964326,0.948343,0.947881
prc_0 (stdev),0.003602,0.004971,0.004317,0.003549,0.004313,0.003399,0.003308


Total time taken to run cross-validation: 77.23 seconds


Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc (mean),0.840261,0.834347,0.843195,0.847594,0.841821,0.803941,0.78757
auc (stdev),0.010062,0.008164,0.007984,0.008779,0.01082,0.008631,0.011529
rcl_1 (mean),0.759374,0.740111,0.691983,0.759369,0.755528,0.620778,0.636195
rcl_1 (stdev),0.027116,0.040162,0.03637,0.028164,0.033271,0.031057,0.029335
rcl_0 (mean),0.771024,0.782343,0.823995,0.776231,0.775552,0.81777,0.77725
rcl_0 (stdev),0.010032,0.017033,0.014692,0.012774,0.012479,0.014517,0.012587
prc_1 (mean),0.280706,0.28601,0.316726,0.28547,0.283769,0.286486,0.251548
prc_1 (stdev),0.007665,0.007819,0.012821,0.006564,0.007253,0.009785,0.004794
prc_0 (mean),0.964637,0.962521,0.957964,0.964889,0.964326,0.948343,0.947881
prc_0 (stdev),0.003602,0.004971,0.004317,0.003549,0.004313,0.003399,0.003308


Aplicando data_sample

In [9]:
# Identify the target column (the last one) and split it from the features
target_feature = df.columns[-1]
features = df.drop(columns=[target_feature])
target = df[target_feature]

# Rebalance the classes with data_sample and join the result into a new DataFrame.
# NOTE(review): the whole dataset is resampled here, before evaluate_cv splits it
# into folds. If data_sample oversamples at this point, copies of the same
# minority row can end up in both the training and the test part of a fold,
# which would inflate the cross-validation scores — confirm this is intended.
normal_features, normal_target = data_sample(features, target)
normal_df = pd.concat([normal_features, normal_target], axis=1)

evaluate_cv(normal_df)



Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc (mean),0.85865,0.887924,0.932799,0.877508,0.859037,0.904242,0.872586
auc (stdev),0.006274,0.01,0.005629,0.008848,0.006251,0.007186,0.011315
rcl_1 (mean),0.779883,0.82967,0.873238,0.809862,0.771385,0.842109,0.851734
rcl_1 (stdev),0.028908,0.018754,0.014425,0.021459,0.024158,0.004748,0.020878
rcl_0 (mean),0.781564,0.798534,0.81778,0.784387,0.790618,0.814942,0.786085
rcl_0 (stdev),0.014711,0.010544,0.01449,0.011104,0.016051,0.015239,0.013107
prc_1 (mean),0.781368,0.804648,0.827482,0.789819,0.786815,0.820029,0.799346
prc_1 (stdev),0.008264,0.008653,0.011825,0.007306,0.00895,0.011755,0.009622
prc_0 (mean),0.780993,0.824495,0.865985,0.805346,0.776244,0.837708,0.841776
prc_0 (stdev),0.020335,0.01618,0.013252,0.016878,0.015461,0.003419,0.018622


Total time taken to run cross-validation: 62.77 seconds


Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc (mean),0.85865,0.887924,0.932799,0.877508,0.859037,0.904242,0.872586
auc (stdev),0.006274,0.01,0.005629,0.008848,0.006251,0.007186,0.011315
rcl_1 (mean),0.779883,0.82967,0.873238,0.809862,0.771385,0.842109,0.851734
rcl_1 (stdev),0.028908,0.018754,0.014425,0.021459,0.024158,0.004748,0.020878
rcl_0 (mean),0.781564,0.798534,0.81778,0.784387,0.790618,0.814942,0.786085
rcl_0 (stdev),0.014711,0.010544,0.01449,0.011104,0.016051,0.015239,0.013107
prc_1 (mean),0.781368,0.804648,0.827482,0.789819,0.786815,0.820029,0.799346
prc_1 (stdev),0.008264,0.008653,0.011825,0.007306,0.00895,0.011755,0.009622
prc_0 (mean),0.780993,0.824495,0.865985,0.805346,0.776244,0.837708,0.841776
prc_0 (stdev),0.020335,0.01618,0.013252,0.016878,0.015461,0.003419,0.018622


In [10]:
# Load the external test set
df_test = pd.read_csv('https://raw.githubusercontent.com/arturrur/mc853/refs/heads/main/data/test/teste.csv')
# Identify the target column (the last one) and split it from the features
target_feature_test = df_test.columns[-1]
features_test = df_test.drop(columns=[target_feature_test])
target_test = df_test[target_feature_test]

# Rebalance the classes of the test set with data_sample and join the result into a new DataFrame.
# NOTE(review): resampling the test set changes its class distribution, so the
# reported precision values describe the resampled set rather than the original
# one — confirm this is intended.
normal_features_test, normal_target_test = data_sample(features_test, target_test)
normal_df_test = pd.concat([normal_features_test, normal_target_test], axis=1)
# Train on the rebalanced training set and score on the rebalanced test set
evaluate_external(normal_df, normal_df_test)



Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc,0.813979,0.806345,0.814482,0.814288,0.814007,0.771707,0.753625
rcl_1,0.769231,0.72679,0.67374,0.763926,0.763926,0.591512,0.625995
rcl_0,0.721485,0.737401,0.774536,0.740053,0.724138,0.779841,0.740053
prc_1,0.734177,0.734584,0.749263,0.746114,0.734694,0.728758,0.706587
prc_0,0.75766,0.729659,0.703614,0.758152,0.754144,0.65625,0.664286


Total time taken to run external-validation: 22.31 seconds


Unnamed: 0,svc_linear,svc_rbf,random_forest,gradient_boosting,logistic_regression,bagging,mlp
auc,0.813979,0.806345,0.814482,0.814288,0.814007,0.771707,0.753625
rcl_1,0.769231,0.72679,0.67374,0.763926,0.763926,0.591512,0.625995
rcl_0,0.721485,0.737401,0.774536,0.740053,0.724138,0.779841,0.740053
prc_1,0.734177,0.734584,0.749263,0.746114,0.734694,0.728758,0.706587
prc_0,0.75766,0.729659,0.703614,0.758152,0.754144,0.65625,0.664286
