In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
import json 
# Directory prefix for input CSVs; load_data() builds paths by plain string
# concatenation (base_dir + file_name), so the trailing slash is required.
base_dir = 'results/'
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
def load_data(file_name):
    """
    Read a CSV located under `base_dir` and return it with a datetime index.

    Parameters:
    - file_name: Name of the CSV file, appended verbatim to `base_dir`

    Returns:
    - DataFrame indexed by the file's 'Datetime' column, parsed with the
      '%Y-%m-%d %H:%M:%S' format
    """
    csv_path = base_dir + file_name
    frame = pd.read_csv(csv_path, index_col='Datetime')
    # The index is read as plain strings; convert it using the explicit timestamp format.
    parsed_index = pd.to_datetime(frame.index, format='%Y-%m-%d %H:%M:%S')
    frame.index = parsed_index
    return frame

def _build_classifier(model_name, random_state):
    """Return a fresh, unfitted classifier for `model_name`; raise ValueError if the name is unknown."""
    # Lambdas keep construction lazy: only the requested estimator is instantiated.
    factories = {
        'logistic_regression': lambda: LogisticRegression(random_state=random_state),
        'random_forest': lambda: RandomForestClassifier(random_state=random_state),
        'svm': lambda: SVC(random_state=random_state),
        'naive_bayes': lambda: GaussianNB(),
    }
    if model_name not in factories:
        raise ValueError("Invalid model_name. Choose from 'logistic_regression', 'random_forest', 'svm', or 'naive_bayes'.")
    return factories[model_name]()


def train_binary_classifier_for_clusters(cluster_labels, data, model_name='logistic_regression', test_size=0.2, random_state=42, cv=True):
    """
    Trains and evaluates a one-vs-rest binary classifier for each cluster.

    For every cluster label except -1, the rows of that cluster get target 0 and
    a random sample of rows carrying any other label (including -1, if present)
    gets target 1. The sample has the same size as the cluster whenever enough
    other rows exist; otherwise all other rows are used and a note is printed.

    Parameters:
    - cluster_labels: One cluster label per row of `data` (array-like, 1-D)
    - data: Input data (numpy array or pandas DataFrame)
    - model_name: Name of the binary classifier
      ('logistic_regression', 'random_forest', 'svm', 'naive_bayes')
    - test_size: Proportion of the combined data held out for testing when cv is False (default is 0.2)
    - random_state: Seed used for the estimator, the negative-class sampling and
      the hold-out split (default is 42)
    - cv: If True (default), evaluate with perform_cv_binary_classifier;
      otherwise fit once on a train/test split and keep the fitted model

    Returns:
    - results: Dictionary keyed by str(cluster). With cv=True each entry maps
      'Accuracy', 'F1', 'Precision', 'Recall' to [mean, std]; with cv=False the
      same keys map to the single hold-out score.
    - classifiers: Dictionary keyed by str(cluster) with the fitted classifiers
      (only populated when cv is False)

    Raises:
    - ValueError: Unknown model_name, labels that do not match the rows of
      `data`, or no rows outside the cluster to sample negatives from.
    """
    # Fail fast on an unsupported model name, before any data is touched.
    _build_classifier(model_name, random_state)

    # Normalise inputs so lists / Series labels and plain arrays behave like the DataFrame case.
    labels = np.asarray(cluster_labels)
    frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    if labels.ndim != 1 or labels.shape[0] != len(frame):
        raise ValueError("cluster_labels must be one-dimensional with one label per row of data.")

    classifiers = {}
    results = {}

    # Identify unique clusters
    unique_clusters = set(labels)

    for cluster in unique_clusters:
        # -1 is never used as a target class, but its rows stay in the negative pool below.
        if cluster == -1:
            continue
        print(f"Cluster Number: {cluster}")
        results[str(cluster)] = {}

        # Rows of the current cluster form class 0.
        cluster_mask = labels == cluster
        X_cluster = frame.loc[cluster_mask].values
        n_cluster = X_cluster.shape[0]
        y_cluster = np.zeros(n_cluster)

        # Pool the rows of every other label; a single concat avoids re-copying the frame per cluster.
        other_clusters = list(set(unique_clusters) - {cluster})
        other_frames = [frame.iloc[np.where(labels == other_cluster)[0]] for other_cluster in other_clusters]
        if not other_frames:
            raise ValueError("Need rows from at least one other cluster to build the negative class.")
        other_pool = pd.concat(other_frames)

        # Sampling without replacement cannot exceed the pool size, so cap the request.
        n_other = min(n_cluster, len(other_pool))
        if n_other < n_cluster:
            print(f"Only {n_other} samples available outside cluster {cluster}; classes will be imbalanced.")
        other_sample = other_pool.sample(n=n_other, random_state=random_state)
        X_other, y_other = other_sample.values, np.ones(n_other)

        # Combine data for the current cluster and other clusters
        X_combined = np.vstack([X_cluster, X_other])
        y_combined = np.hstack([y_cluster, y_other])

        # A fresh estimator per cluster, so stored models never share state.
        classifier = _build_classifier(model_name, random_state)

        if cv:
            mean_accuracy, std_accuracy, mean_f1, std_f1, mean_precision, std_precision, mean_recall, std_recall = perform_cv_binary_classifier(X_combined, y_combined, classifier)
            print(f'Mean Accuracy: {mean_accuracy:.4f} Std: {std_accuracy}')
            print(f'Mean Precision: {mean_precision:.4f} Std: {std_precision}')
            print(f'Mean Recall: {mean_recall:.4f} Std: {std_recall}')
            print(f'Mean F1 Score: {mean_f1:.4f} Std: {std_f1}')

            results[str(cluster)]['Accuracy'] = [mean_accuracy, std_accuracy]
            results[str(cluster)]['F1'] = [mean_f1, std_f1]
            results[str(cluster)]['Precision'] = [mean_precision, std_precision]
            results[str(cluster)]['Recall'] = [mean_recall, std_recall]
        else:
            # Honour the caller's test_size / random_state instead of fixed values.
            X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=test_size, random_state=random_state)
            classifier.fit(X_train, y_train)

            # Make predictions on the test data
            y_pred = classifier.predict(X_test)

            # Evaluate performance
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            print(f'Mean Accuracy: {accuracy:.4f}')
            print(f'Mean Precision: {precision:.4f}')
            print(f'Mean Recall: {recall:.4f}')
            print(f'Mean F1 Score: {f1:.4f}')

            # Record the hold-out scores too, so `results` is not left empty in this mode.
            results[str(cluster)]['Accuracy'] = accuracy
            results[str(cluster)]['F1'] = f1
            results[str(cluster)]['Precision'] = precision
            results[str(cluster)]['Recall'] = recall
            classifiers[str(cluster)] = classifier

    return results, classifiers

def perform_cv_binary_classifier(X, y, classifier, num_folds=10, random_state=42):
    """
    Evaluate a binary classifier with shuffled stratified k-fold cross-validation.

    The classifier is re-fitted on the training part of every fold and scored on
    the held-out part with accuracy, precision, recall and F1.

    Parameters:
    - X: Feature matrix (array-like; converted with np.asarray so DataFrames and lists work)
    - y: Binary target vector (array-like, one entry per row of X)
    - classifier: Estimator exposing fit(X, y) and predict(X); it is fitted in place
    - num_folds: Number of stratified folds (default is 10)
    - random_state: Seed for the fold shuffling (default is 42)

    Returns:
    - Tuple (mean_accuracy, std_accuracy, mean_f1, std_f1,
      mean_precision, std_precision, mean_recall, std_recall)
      computed across the folds. Note that F1 comes before precision and recall.
    """
    # Positional fancy indexing below needs ndarrays; X[train_index] on a DataFrame would look up columns.
    X = np.asarray(X)
    y = np.asarray(y)

    stratified_kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    # Per-fold scores, one list per metric.
    fold_scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}

    for train_index, test_index in stratified_kfold.split(X, y):
        # Fit on the training part of the fold, predict on the held-out part.
        classifier.fit(X[train_index], y[train_index])
        y_true = y[test_index]
        y_pred = classifier.predict(X[test_index])

        fold_scores['accuracy'].append(accuracy_score(y_true, y_pred))
        fold_scores['precision'].append(precision_score(y_true, y_pred))
        fold_scores['recall'].append(recall_score(y_true, y_pred))
        fold_scores['f1'].append(f1_score(y_true, y_pred))

    # Aggregate across folds.
    mean_accuracy, std_accuracy = np.mean(fold_scores['accuracy']), np.std(fold_scores['accuracy'])
    mean_precision, std_precision = np.mean(fold_scores['precision']), np.std(fold_scores['precision'])
    mean_recall, std_recall = np.mean(fold_scores['recall']), np.std(fold_scores['recall'])
    mean_f1, std_f1 = np.mean(fold_scores['f1']), np.std(fold_scores['f1'])

    return mean_accuracy, std_accuracy, mean_f1, std_f1, mean_precision, std_precision, mean_recall, std_recall

In [None]:
# Load the data set to classify; load_data() resolves the name against base_dir
# and returns a frame indexed by the parsed 'Datetime' column.
file_name = 'faulty_data_predicted_elliptic_last_two_months.csv'
df_anomaly_data = load_data(file_name)

In [None]:
# Cluster label array loaded from disk. It is passed below as `cluster_labels`
# together with df_anomaly_data, so it is expected to hold one label per row of
# that frame, in the same order -- TODO confirm against the notebook that wrote it.
loaded_labels = np.load('results/cluster_labels.npy')

In [None]:
# Cross-validated evaluation of the per-cluster logistic-regression classifiers.
# The function returns (results, classifiers); keep only the metrics dict so the
# name holds what it says and serialises as a JSON object rather than a list.
logit_results = train_binary_classifier_for_clusters(loaded_labels, df_anomaly_data)[0]

In [None]:
# Write the logistic-regression evaluation output to disk as indented JSON.
with open('results/logit_regression_cluster_models_eval_10fold_results.json', "w") as outfile:
    json.dump(logit_results, outfile, indent=4)

In [None]:
# Cross-validated evaluation of the per-cluster random-forest classifiers.
# The function returns (results, classifiers); keep only the metrics dict so the
# name holds what it says and serialises as a JSON object rather than a list.
random_forest_results = train_binary_classifier_for_clusters(loaded_labels, df_anomaly_data, 'random_forest')[0]

In [None]:
# Write the random-forest evaluation output to disk as indented JSON.
with open('results/rf_cluster_models_eval_10fold_results.json', "w") as outfile:
    json.dump(random_forest_results, outfile, indent=4)

In [None]:
# Cross-validated evaluation of the per-cluster SVM classifiers.
# The function returns (results, classifiers); keep only the metrics dict so the
# name holds what it says and serialises as a JSON object rather than a list.
svm_results = train_binary_classifier_for_clusters(loaded_labels, df_anomaly_data, 'svm')[0]

In [None]:
# Write the SVM evaluation output to disk as indented JSON.
with open('results/svm_cluster_models_eval_10fold_results.json', "w") as outfile:
    json.dump(svm_results, outfile, indent=4)

In [None]:
# Cross-validated evaluation of the per-cluster Gaussian naive Bayes classifiers.
# The function returns (results, classifiers); keep only the metrics dict so the
# name holds what it says and serialises as a JSON object rather than a list.
naive_bayes_results = train_binary_classifier_for_clusters(loaded_labels, df_anomaly_data, 'naive_bayes')[0]

In [None]:
# Write the naive Bayes evaluation output to disk as indented JSON.
with open('results/nb_cluster_models_eval_10fold_results.json', "w") as outfile:
    json.dump(naive_bayes_results, outfile, indent=4)

In [None]:
# cv=False switches to a single train/test split and returns the fitted
# per-cluster models (keyed by the cluster label as a string) as the second
# element. This also rebinds random_forest_results, replacing the value
# assigned by the earlier random-forest run.
random_forest_results, rf_classifiers = train_binary_classifier_for_clusters(loaded_labels, df_anomaly_data, 'random_forest', cv=False)

In [None]:
# Save the dictionary containing trained models
# The target directory is not created anywhere else in this notebook, so make
# sure it exists before writing.
os.makedirs('models', exist_ok=True)
joblib.dump(rf_classifiers, 'models/rf_cluster_models.joblib')