In [20]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
import json 
# Directory holding the hold-out / predicted-fault CSV files.
# Forward slashes work on Windows, Linux and macOS alike and match the other paths used
# in this notebook. The trailing slash is required because load_data() concatenates
# base_dir and the file name directly.
base_dir = 'results/hold_out_set_and_predicted_faults/'
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [28]:
def load_data(file_name):
    """
    Reads a CSV file stored under ``base_dir`` into a pandas DataFrame.

    Parameters:
    - file_name: Name of the CSV file; it is appended directly to ``base_dir``

    Returns:
    - DataFrame with the parsed contents of the file
    """
    csv_path = base_dir + file_name
    return pd.read_csv(csv_path)

def _make_cluster_classifier(model_name, random_state):
    """Return an unfitted estimator for a model name already validated by the caller.

    RandomForestClassifier and SVC are imported here because the notebook's import
    cell only brings LogisticRegression into scope.
    """
    if model_name == 'random_forest':
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(random_state=random_state)
    if model_name == 'svm':
        from sklearn.svm import SVC
        return SVC(random_state=random_state)
    return LogisticRegression(random_state=random_state)


def train_binary_classifier_for_clusters(cluster_labels, data, model_name='logistic_regression', test_size=0.2, random_state=42):
    """
    Trains one "this cluster vs. the other clusters" binary classifier per cluster.

    For every cluster, its own rows are labelled 0 and an equally sized sample drawn
    (with replacement) from the remaining clusters is labelled 1. The combined set is
    split into train/test parts, a classifier is fitted and scored on the test part.
    Because the positive class (1) is "other clusters", precision/recall/f1 describe
    how well rows that do NOT belong to the cluster are recognised.

    Parameters:
    - cluster_labels: Cluster label of every row (1-D array-like, e.g. KMeans labels)
    - data: Input data, one row per label (numpy array, pandas DataFrame or nested list)
    - model_name: Name of the binary classifier ('logistic_regression', 'random_forest', 'svm')
    - test_size: Proportion of the dataset to include in the test split (default is 0.2)
    - random_state: Seed (int or None) used for the negative sampling, the train/test
      split and the classifier (default is 42)

    Returns:
    - classifiers: Dictionary mapping str(cluster label) to the fitted classifier
    - results: Dictionary mapping str(cluster label) to a dict with the keys
      'accuracy', 'precision', 'recall' and 'f1'

    Raises:
    - ValueError: if model_name is unknown, if cluster_labels and data differ in
      length, or if fewer than two distinct clusters are present
    """
    # Validate cheap preconditions first so bad arguments fail before any work is done.
    if model_name not in ('logistic_regression', 'random_forest', 'svm'):
        raise ValueError("Invalid model_name. Choose from 'logistic_regression', 'random_forest', or 'svm'.")

    # Normalise inputs so lists and DataFrames are indexed row-wise like plain arrays.
    labels = np.asarray(cluster_labels).ravel()
    features = np.asarray(data)
    if features.ndim == 0 or labels.shape[0] != features.shape[0]:
        raise ValueError("cluster_labels and data must contain the same number of rows.")

    unique_clusters = np.unique(labels)  # sorted, so the iteration order is deterministic
    if unique_clusters.size < 2:
        raise ValueError("At least two distinct clusters are required to train binary classifiers.")

    # Dedicated generator: the negative sampling is reproducible for a given seed and
    # does not depend on (or disturb) NumPy's global random state.
    rng = np.random.default_rng(random_state)

    classifiers = {}
    results = {}
    for cluster in unique_clusters:
        print(f"Cluster Number: {cluster}")

        # Rows of the current cluster form the 0 class.
        in_cluster = labels == cluster
        n_cluster = int(in_cluster.sum())
        X_cluster = features[in_cluster]
        y_cluster = np.zeros(n_cluster)

        # Draw n_cluster rows in total from the other clusters, spread as evenly as
        # possible; the first `extra` clusters absorb the remainder of the division.
        other_clusters = unique_clusters[unique_clusters != cluster]
        base, extra = divmod(n_cluster, len(other_clusters))
        sampled = []
        for position, other_cluster in enumerate(other_clusters):
            pool = np.flatnonzero(labels == other_cluster)
            quota = base + (1 if position < extra else 0)
            # With replacement, so a small cluster can still fill its quota.
            sampled.append(rng.choice(pool, size=quota, replace=True))
        other_indices = np.concatenate(sampled)

        # Size the 1 labels from the rows actually sampled so X and y always agree.
        X_other = features[other_indices]
        y_other = np.ones(len(other_indices))

        X_combined = np.vstack([X_cluster, X_other])
        y_combined = np.hstack([y_cluster, y_other])

        X_train, X_test, y_train, y_test = train_test_split(
            X_combined, y_combined, test_size=test_size, random_state=random_state
        )

        classifier = _make_cluster_classifier(model_name, random_state)
        classifier.fit(X_train, y_train)

        # Evaluate the classifier on the held-out part.
        y_pred = classifier.predict(X_test)

        key = str(cluster)
        classifiers[key] = classifier
        results[key] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
        }

    return classifiers, results

In [8]:
# Load the predicted-fault values from base_dir.
# NOTE(review): "LOF" / "5T" in the file name presumably mean Local Outlier Factor
# predictions at a 5-minute sampling frequency — confirm against the producing notebook.
file_name = 'LOF_predicted_faults_vals_sampling_freq_5T.csv'
df_anomaly_data = load_data(file_name)

In [13]:
# Drop exact duplicate rows; the original frame is kept untouched in df_anomaly_data.
# NOTE(review): the cluster labels used for training must line up row-for-row with this
# deduplicated frame — confirm they were produced from the same deduplicated data.
df_anomaly_deduplicated = df_anomaly_data.drop_duplicates()

In [10]:
# Load previously saved cluster assignments from disk.
# NOTE(review): presumably one label per deduplicated row, written by an earlier
# clustering step — TODO confirm.
loaded_labels = np.load('models/cluster_labels.npy')

In [21]:
# Train one classifier per cluster with the function's defaults (logistic regression,
# test_size=0.2, random_state=42). `.values` passes every column of the frame as a
# plain NumPy array.
classifiers, results = train_binary_classifier_for_clusters(loaded_labels, df_anomaly_deduplicated.values)

Cluster Number: 0
Cluster Number: 1
Cluster Number: 2


In [22]:
# Show the fitted estimators; the dictionary keys are the cluster labels as strings.
classifiers

{'0': LogisticRegression(random_state=42),
 '1': LogisticRegression(random_state=42),
 '2': LogisticRegression(random_state=42)}

In [24]:
# Store the per-cluster evaluation metrics as indented JSON.
with open('results/logit_regression_cluster_models_results.json', "w") as outfile:
    outfile.write(json.dumps(results, indent=4))

In [27]:
# Persist every fitted classifier to its own joblib file, named after its cluster key.
for cluster in classifiers:
    fitted_model = classifiers[cluster]
    print(fitted_model)
    joblib.dump(fitted_model, f'models/logit_reg_cluster_{cluster}.joblib')

LogisticRegression(random_state=42)
LogisticRegression(random_state=42)
LogisticRegression(random_state=42)
