# In [None]:
import json
import itertools
import random
import matplotlib.pyplot as plt
import numpy as np


def parse_rules_from_file(file_path, max_per_label=4):
    """Parse rule lines of the form ``<label> <-- <cond>, <cond>, ...``.

    Labels are lower-cased. Only rules labelled ``positive`` or ``negative``
    are kept, in file order, and at most ``max_per_label`` of each. Reading
    stops as soon as both quotas are filled.

    Each condition becomes a key of the rule dict. Its expected value is 0
    when the condition name contains ``inv_`` or ``task_``, otherwise 1.

    Args:
        file_path: Path of the text file holding one rule per line.
        max_per_label: Maximum number of rules kept per label (default 4).

    Returns:
        list[dict]: One dict per kept rule, with a ``'label'`` entry plus one
        entry per condition.
    """
    rules = []
    kept_per_label = {'positive': 0, 'negative': 0}

    with open(file_path, 'r') as file:
        for line in file:
            if all(kept >= max_per_label for kept in kept_per_label.values()):
                break

            parts = line.split('<--')
            if len(parts) < 2:
                # Blank or malformed line (no '<--' separator): skip it
                # instead of failing on the missing condition part.
                continue

            label = parts[0].strip().lower()
            if label not in kept_per_label or kept_per_label[label] >= max_per_label:
                continue

            rule_dict = {'label': label}
            for condition in parts[1].strip().split(', '):
                # Tolerate stray whitespace around a condition and ignore
                # empty tokens (e.g. a rule line with nothing after '<--').
                condition = condition.strip()
                if not condition:
                    continue
                rule_dict[condition] = 0 if 'inv_' in condition or 'task_' in condition else 1

            rules.append(rule_dict)
            kept_per_label[label] += 1

    return rules


def load_data(file_path):
    """Read the JSON document stored at ``file_path`` and return the decoded object."""
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def split_data(data, train_frac=0.5, validate_frac=0.25):
    """Randomly split ``data`` into train, validation and test lists.

    NOTE: a later ``split_data`` definition in this file rebinds the name, so
    this version is only in effect until that definition executes.

    Args:
        data: Sequence of samples to split. It is not modified.
        train_frac: Fraction of the samples placed in the training split.
        validate_frac: Fraction of the samples placed in the validation split.

    Returns:
        tuple: ``(train, validate, test)`` lists; the test split receives
        whatever remains after the first two.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(data)
    random.shuffle(shuffled)
    n = len(shuffled)
    train_end = int(n * train_frac)
    validate_end = train_end + int(n * validate_frac)
    return shuffled[:train_end], shuffled[train_end:validate_end], shuffled[validate_end:]
import random

def split_data(data, train_frac=0.8, validate_frac=0.1):
    """
    Randomly partition the data into train, validation and test subsets.

    The positions ``0..len(data)-1`` are shuffled rather than the data itself,
    so ``data`` is left untouched and the original positions of the test
    samples can be reported back to the caller.

    Args:
    data (list): The dataset to be split.
    train_frac (float): Fraction of the dataset used for the training set.
    validate_frac (float): Fraction of the dataset used for the validation set.

    Returns:
    tuple: ``(train_data, validate_data, test_data, test_indices)`` where the
    first three are lists of samples and ``test_indices[k]`` is the position
    in ``data`` of ``test_data[k]``.
    """
    total = len(data)
    order = list(range(total))
    random.shuffle(order)

    first_cut = int(total * train_frac)
    second_cut = first_cut + int(total * validate_frac)
    test_indices = order[second_cut:]

    def pick(positions):
        # Materialise the samples found at the given original positions.
        return [data[position] for position in positions]

    return (
        pick(order[:first_cut]),
        pick(order[first_cut:second_cut]),
        pick(test_indices),
        test_indices,
    )


# Samples to evaluate the rules against.
processed_data = load_data('/content/processed_transformed_data_json.json')

# Rules come from two separate files; both sets are pooled into one list,
# high-confidence rules first.
high_confidence_rules = parse_rules_from_file("/content/Corrected_High_Confidence_Rules.txt")
converted_rrl_sarp_rules = parse_rules_from_file("/content/corrected_converted_rrl_sarp.txt")
all_rule_conditions = [*high_confidence_rules, *converted_rrl_sarp_rules]

def generate_combinations(rule_conditions, min_components):
    """Return every combination of ``rule_conditions`` with at least ``min_components`` items.

    Combinations are listed by increasing size, from ``min_components`` up to
    ``len(rule_conditions)``, each as a tuple in ``itertools.combinations``
    order. The list is empty when ``min_components`` exceeds the number of
    rules.
    """
    sizes = range(min_components, len(rule_conditions) + 1)
    return [
        subset
        for size in sizes
        for subset in itertools.combinations(rule_conditions, size)
    ]



def apply_rule(data, conditions, operation='AND', default_label='negative'):
    """Evaluate one (possibly merged) rule against one data sample.

    Args:
        data: Sample dict. Its ``'label'`` entry is compared with the rule's
            label, its ``'actual_label'`` entry is echoed back, and every
            other key named by the rule is looked up with a default of 0.
        conditions: Rule dict mapping condition names to expected values,
            optionally with a ``'label'`` entry. It is NOT modified, so the
            same dict can be reused for several calls.
        operation: ``'AND'`` requires every condition to match; any other
            value means at least one condition must match.
        default_label: Label assumed when the rule or the sample has no
            ``'label'`` entry.

    Returns:
        tuple: ``(predicted_label, actual_label, confidence)``.
        ``predicted_label`` is the rule label or ``'not ' + rule label``.
        ``confidence`` is the percentage of matched conditions, or 0 when the
        sample's label differs from the rule's or the rule has no conditions.
    """
    # Read the label without popping it: removing it from the caller's dict
    # made a second call on the same dict fall back to ``default_label``.
    rule_label = conditions.get('label', default_label)
    actual_label = data.get('actual_label', 'negative')
    if data.get('label', default_label) != rule_label:
        return 'not ' + rule_label, actual_label, 0

    # One match flag per real condition; the 'label' entry is not a condition.
    matches = [
        data.get(key, 0) == value
        for key, value in conditions.items()
        if key != 'label'
    ]
    matched_all = all(matches) if operation == 'AND' else any(matches)

    predicted_label = rule_label if matched_all else 'not ' + rule_label
    # A rule without conditions has nothing to match; avoid dividing by zero.
    confidence = (sum(matches) / len(matches)) * 100 if matches else 0
    return predicted_label, actual_label, confidence


def evaluate_combinations_by_range(data_samples, rule_conditions, min_components, top_fraction=0.0001):
    """Apply every rule combination to every sample and score the most confident results.

    For each sample, and for each ``min_comp`` in ``1..min_components``, all
    combinations returned by ``generate_combinations(rule_conditions, min_comp)``
    are merged into a single condition dict (later rules override earlier ones
    on shared keys, including ``'label'``) and evaluated with both ``'AND'``
    and ``'OR'``. Metrics are then computed over the ``top_fraction`` of all
    results with the highest confidence.

    Args:
        data_samples: Iterable of sample dicts passed to ``apply_rule``.
        rule_conditions: Sequence of rule dicts to combine.
        min_components: Upper bound of the minimum-combination-size sweep.
        top_fraction: Fraction of the confidence-sorted results used for the
            metrics (default 0.0001; the count is truncated to an int, so a
            small result set can yield no top results and all-zero metrics).

    Returns:
        tuple: ``(overall_confidence, precision, recall, f1_score, accuracy,
        interpretability, index_label_pairs)`` where ``index_label_pairs``
        maps each sample position to the ``(label, confidence)`` of its first
        highest-confidence result.
    """
    rule_results = []
    index_label_pairs = {}
    for index, data_sample in enumerate(data_samples):
        for min_comp in range(1, min_components + 1):
            for combination in generate_combinations(rule_conditions, min_comp):
                flattened_conditions = {k: v for d in combination for k, v in d.items()}
                for operation in ('AND', 'OR'):
                    # Hand each call its own copy: if apply_rule removes the
                    # 'label' entry, the 'OR' pass must still see the rule's
                    # label instead of silently using the default one.
                    label, actual_label, confidence = apply_rule(
                        data_sample, dict(flattened_conditions), operation
                    )
                    rule_results.append((label, actual_label, confidence))
                    if index not in index_label_pairs or confidence > index_label_pairs[index][1]:
                        index_label_pairs[index] = (label, confidence)

    # Stable sort: results with equal confidence keep their evaluation order.
    rule_results.sort(key=lambda x: x[2], reverse=True)
    top_rules = rule_results[:int(len(rule_results) * top_fraction)]

    # NOTE(review): a 'not <label>' prediction only counts as a true negative
    # when actual_label is that identical string; confirm that the data's
    # actual labels really use the 'not ...' form.
    tp, fp, fn, tn = 0, 0, 0, 0
    total_confidence = 0
    for label, actual_label, confidence in top_rules:
        total_confidence += confidence
        predicted_negative = label.startswith('not ')
        if label == actual_label:
            if predicted_negative:
                tn += 1
            else:
                tp += 1
        elif predicted_negative:
            fn += 1
        else:
            fp += 1

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
    # Fewer required components -> higher interpretability; guard the 0 case.
    interpretability = 1 / min_components if min_components else 0
    overall_confidence = total_confidence / len(top_rules) if top_rules else 0

    return overall_confidence, precision, recall, f1_score, accuracy, interpretability, index_label_pairs


def find_best_min_component(validate_data, rule_conditions, min_range, max_range, weight_interpretability=0.0):
    """Pick the ``min_components`` value whose evaluation has the highest overall confidence.

    Every value in ``min_range..max_range`` (inclusive) is evaluated with
    ``evaluate_combinations_by_range``. The first value with the strictly
    highest overall confidence wins. ``min_range`` is returned when no value
    reaches a confidence above 0 or when the range is empty.

    Args:
        validate_data: Samples used to score each candidate value.
        rule_conditions: Rule dicts forwarded to the evaluation.
        min_range: Smallest candidate value.
        max_range: Largest candidate value.
        weight_interpretability: Currently has no effect. The previous body
            computed an accuracy/interpretability blend from it but never
            used the result; the parameter is kept for compatibility.
            NOTE(review): confirm whether selection was meant to use that
            blended score rather than overall confidence.

    Returns:
        int: The selected ``min_components`` value.
    """
    best_confidence = 0
    best_min_comp = min_range
    for min_comp in range(min_range, max_range + 1):
        overall_confidence = evaluate_combinations_by_range(validate_data, rule_conditions, min_comp)[0]
        if overall_confidence > best_confidence:
            best_confidence = overall_confidence
            best_min_comp = min_comp
    return best_min_comp




# Repeat the split / tune / evaluate experiment 10 times on fresh random splits.
# `min_comp` is only a run counter used to number the prediction files.
min_comp = 0
results = []
for _ in range(10):
    # validate_data is not used below: the min-component search runs on train_data.
    train_data, validate_data, test_data, test_indices = split_data(processed_data, 0.5, 0.25)
    best_min_components = find_best_min_component(train_data, all_rule_conditions, 1, 10)

    # Evaluate the test split once. The evaluation has no random component,
    # so a second call with the same arguments only repeated the same
    # expensive work to obtain the same metrics.
    (overall_confidence, precision, recall, f1_score, accuracy,
     interpretability, indexed_label_pairs) = evaluate_combinations_by_range(
        test_data, all_rule_conditions, best_min_components)

    # Map positions inside test_data back to positions in processed_data.
    original_index_label_pairs = [(test_indices[i], label) for i, (label, _) in indexed_label_pairs.items()]

    file_name = f'/content/test_predictions_{min_comp}.json'
    with open(file_name, 'w') as f:
        json.dump(original_index_label_pairs, f, indent=4)
    print(f'Results saved in {file_name}')
    results.append([overall_confidence, precision, recall, f1_score, accuracy, interpretability])

    # Running statistics over all runs so far; the last column
    # (interpretability) is left out of the array.
    results_array = np.array([result[:-1] for result in results])
    mean_metrics = np.mean(results_array, axis=0)
    std_metrics = np.std(results_array, axis=0)
    min_comp += 1
    print(results_array)
    print(std_metrics)


