In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from deap import base, creator, tools, algorithms
import random
import pickle
import matplotlib.pyplot as plt
import sys
import datetime

  from pandas.core import (


In [None]:
# Redirect output to a file named by the current date and time
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file_name = f"{current_time}.txt"
sys.stdout = open(output_file_name, "w")

# Define the binarization windows
binarization_windows = [
    ([0, 25], [26, 50]),
    ([0, 21], [32, 50]),
    ([0, 16], [37, 50]),
    ([0, 11], [42, 50])
]

# Load the dataset
file_path = "ResidualProducts/ID-Features-Vote.csv"
dataset = pd.read_csv(file_path)

# Genetic Algorithm configuration
num_generations = 100
population_size = 50

def evaluate_individual(individual, X, y):
    selected_features = [bool(gene) for gene in individual]
    if len(selected_features) != X.shape[1]:
        raise ValueError(f"Dimension mismatch: {len(selected_features)} != {X.shape[1]}")
    if sum(selected_features) == 0:
        return 0.0,
    X_selected = X[:, selected_features]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    clf = RandomForestClassifier(random_state=42)
    scores = cross_val_score(clf, X_selected, y, cv=skf, scoring="accuracy")
    return np.mean(scores),

for idx, (class_0_range, class_1_range) in enumerate(binarization_windows):
    print(f"Processing binarization window {idx + 1}: 0->{class_0_range}, 1->{class_1_range}")

    def binarize_vote(vote):
        if class_0_range[0] <= vote <= class_0_range[1]:
            return 0
        elif class_1_range[0] <= vote <= class_1_range[1]:
            return 1
        else:
            return None

    dataset[f"binary_vote_{idx}"] = dataset["vote"].apply(binarize_vote)
    filtered_dataset = dataset.dropna(subset=[f"binary_vote_{idx}"])
    filtered_dataset[f"binary_vote_{idx}"] = filtered_dataset[f"binary_vote_{idx}"].astype(int)

    X = filtered_dataset.drop(columns=["ID", "PAINTING", "vote", f"binary_vote_{idx}"]).values
    y = filtered_dataset[f"binary_vote_{idx}"].values

    class_counts = filtered_dataset[f"binary_vote_{idx}"].value_counts()
    minority_class = class_counts.idxmin()
    majority_class = class_counts.idxmax()

    minority_data = filtered_dataset[filtered_dataset[f"binary_vote_{idx}"] == minority_class]
    majority_data = filtered_dataset[filtered_dataset[f"binary_vote_{idx}"] == majority_class]
    majority_downsampled = majority_data.sample(len(minority_data), random_state=42)
    balanced_dataset = pd.concat([minority_data, majority_downsampled]).sample(frac=1, random_state=42)

    X_balanced = balanced_dataset.drop(columns=["ID", "PAINTING", "vote", f"binary_vote_{idx}"]).values
    y_balanced = balanced_dataset[f"binary_vote_{idx}"].values

    num_features = X_balanced.shape[1]
    print(f"Number of features: {num_features}")

    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", evaluate_individual, X=X_balanced, y=y_balanced)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    population = toolbox.population(n=population_size)
    hall_of_fame = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    population, log = algorithms.eaSimple(
        population, toolbox, cxpb=0.5, mutpb=0.2, ngen=num_generations,
        stats=stats, halloffame=hall_of_fame, verbose=True
    )

    best_individual = hall_of_fame[0]
    selected_features = [i for i, gene in enumerate(best_individual) if gene == 1]

    X_selected = X_balanced[:, selected_features]
    final_model = RandomForestClassifier(random_state=42)
    final_model.fit(X_selected, y_balanced)

    y_pred = final_model.predict(X_selected)
    accuracy = accuracy_score(y_balanced, y_pred)
    sensitivity = recall_score(y_balanced, y_pred, pos_label=1)
    specificity = recall_score(y_balanced, y_pred, pos_label=0)
    precision = precision_score(y_balanced, y_pred, pos_label=1)
    confusion = confusion_matrix(y_balanced, y_pred)
    npv = confusion[0, 0] / (confusion[0, 0] + confusion[0, 1]) if (confusion[0, 0] + confusion[0, 1]) > 0 else 0

    model_file = f"ResidualProducts/final_model_window_{idx + 1}.pkl"
    with open(model_file, "wb") as f:
        pickle.dump(final_model, f)

    print(f"Binarization Window {idx + 1} Results:")
    print(f"Accuracy: {accuracy}")
    print(f"Sensitivity: {sensitivity}")
    print(f"Specificity: {specificity}")
    print(f"Precision: {precision}")
    print(f"NPV: {npv}")
    print(f"Selected Features: {selected_features}")

    gen = log.select("gen")
    avg = log.select("avg")
    max_ = log.select("max")

    plt.figure()
    plt.plot(gen, avg, label="Average Fitness")
    plt.plot(gen, max_, label="Max Fitness")
    plt.xlabel("Generation")
    plt.ylabel("Fitness")
    plt.title(f"Genetic Algorithm Performance for Window {idx + 1}")
    plt.legend()
    plt.savefig(f"ResidualProducts/genetic_algorithm_performance_window_{idx + 1}.png")
    plt.show()

# Close the redirected output
sys.stdout.close()
# Reset stdout to default
sys.stdout = sys.__stdout__

In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
import pickle
import datetime
import sys

# Create an output file with a timestamp
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file_name = f"{current_time}.txt"

with open(output_file_name, "w") as output_file:
    sys.stdout = output_file

    # Define binarization windows for the 'vote' column
    binarization_windows = [
        ([0, 25], [26, 50]),
        ([0, 21], [32, 50]),
        ([0, 16], [37, 50]),
        ([0, 11], [42, 50])
    ]

    # Load the dataset
    file_path = "ResidualProducts/ID-Features-Vote.csv"
    dataset = pd.read_csv(file_path)

    # Extract feature names excluding non-relevant columns
    feature_names = dataset.columns.difference(["ID", "PAINTING", "vote"])

    for idx, (class_0_range, class_1_range) in enumerate(binarization_windows):
        print(f"Processing binarization window {idx + 1}: 0->{class_0_range}, 1->{class_1_range}")

        # Function to binarize the 'vote' column
        def binarize_vote(vote):
            if class_0_range[0] <= vote <= class_0_range[1]:
                return 0
            elif class_1_range[0] <= vote <= class_1_range[1]:
                return 1
            else:
                return None

        # Apply binarization and filter invalid rows
        dataset[f"binary_vote_{idx}"] = dataset["vote"].apply(binarize_vote)
        filtered_dataset = dataset.dropna(subset=[f"binary_vote_{idx}"])
        filtered_dataset[f"binary_vote_{idx}"] = filtered_dataset[f"binary_vote_{idx}"].astype(int)

        # Prepare feature matrix and target vector
        X = filtered_dataset[feature_names].values
        y = filtered_dataset[f"binary_vote_{idx}"].values

        # 5-Fold Cross-Validation evaluation
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        model = RandomForestClassifier(n_estimators=100, random_state=42)

        accuracy_scores = []
        sensitivity_scores = []
        specificity_scores = []
        precision_scores = []
        npv_scores = []

        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Compute evaluation metrics
            accuracy = accuracy_score(y_test, y_pred)
            sensitivity = recall_score(y_test, y_pred, pos_label=1)
            specificity = recall_score(y_test, y_pred, pos_label=0)
            precision = precision_score(y_test, y_pred, pos_label=1)
            confusion = confusion_matrix(y_test, y_pred)
            npv = confusion[0, 0] / (confusion[0, 0] + confusion[0, 1]) if (confusion[0, 0] + confusion[0, 1]) > 0 else 0

            # Store scores for averaging
            accuracy_scores.append(accuracy)
            sensitivity_scores.append(sensitivity)
            specificity_scores.append(specificity)
            precision_scores.append(precision)
            npv_scores.append(npv)

        # Calculate mean metrics across the 5 folds
        avg_accuracy = np.mean(accuracy_scores)
        avg_sensitivity = np.mean(sensitivity_scores)
        avg_specificity = np.mean(specificity_scores)
        avg_precision = np.mean(precision_scores)
        avg_npv = np.mean(npv_scores)

        # Save the trained model for this window
        model_filename = f"ResidualProducts/final_model_window_{idx + 1}.pkl"
        with open(model_filename, "wb") as f:
            pickle.dump(model, f)

        # Extract feature importances
        feature_importances = model.feature_importances_
        sorted_indices = np.argsort(feature_importances)[::-1]
        top_features = [(feature_names[i], feature_importances[i]) for i in sorted_indices[:10]]

        # Print the evaluation results and top features
        print(f"Results for Binarization Window {idx + 1}:")
        print(f"Accuracy: {avg_accuracy:.4f}")
        print(f"Sensitivity: {avg_sensitivity:.4f}")
        print(f"Specificity: {avg_specificity:.4f}")
        print(f"Precision (PPV): {avg_precision:.4f}")
        print(f"Negative Predictive Value (NPV): {avg_npv:.4f}")
        print("\nTop 10 Important Features:")
        for feature, importance in top_features:
            print(f"{feature}: {importance:.4f}")

        sys.stdout.flush()
        print('\n\n')

    print("Processing completed successfully.")

# Restore standard output to console
sys.stdout = sys.__stdout__
print(f"Results saved to {output_file_name}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset[f"binary_vote_{idx}"] = filtered_dataset[f"binary_vote_{idx}"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset[f"binary_vote_{idx}"] = filtered_dataset[f"binary_vote_{idx}"].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset[f"bi