#RULE PREPROCESSING

In [None]:
import pandas as pd
import re

# Input rule listing (tab-separated) and the destination for the
# formatted rules.
file_path = '/content/rrl.txt'
output_file_path = '/content/corrected_converted_rrl.txt'

# The first line of the file is skipped and no header row is inferred,
# so every remaining line is read as data.
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    skiprows=1,
    engine='python',
)

# Name the five columns and force the rule text to str so that regex
# substitution can be applied to it.
df.columns = ['RID', 'class_negative', 'class_positive', 'Support', 'Rule']
df['Rule'] = df['Rule'].astype(str)
def format_rule(row):
    """Render one rule row as ``"<label> <-- <conditions>"``.

    ``row`` must support lookup by the keys ``'Rule'``, ``'class_negative'``
    and ``'class_positive'``. Every run of digits followed by an underscore
    that starts at a word boundary is removed from the rule text. The label
    is ``negative`` only when ``class_negative`` is strictly greater than
    ``class_positive``; ties resolve to ``positive``.
    """
    stripped = re.sub(r'\b\d+_', '', row['Rule'])
    label = "negative" if row['class_negative'] > row['class_positive'] else "positive"
    return f"{label} <-- {stripped}"

# Build the textual form of every rule, one row at a time.
df['Formatted Rule'] = df.apply(format_rule, axis=1)

# Emit the formatted rules, one per line.
with open(output_file_path, 'w') as f:
    f.writelines(rule + '\n' for rule in df['Formatted Rule'])

print(f'Formatted rules have been saved to {output_file_path}')


In [None]:
import re

# Source listing and destination for the cleaned copy.
input_path = '/content/CARL.txt'
output_path = '/content/Corrected_High_Confidence_Rules.txt'

# Matches "<decimal> (<decimal>)" together with the whitespace after it.
_numeric_prefix = re.compile(r'\d+\.\d+ \(\d+\.\d+\)\s+')

# Pull every line into memory (line endings are preserved).
with open(input_path, 'r') as file:
    lines = list(file)

# Drop each matched group; the rest of every line is kept untouched.
processed_lines = [_numeric_prefix.sub('', text) for text in lines]

with open(output_path, 'w') as file:
    file.write(''.join(processed_lines))

print(f'Processed file saved to {output_path}')


#DATA PREPROCESSING

In [None]:
import re
import json
def _normalise_inv_token(token):
    """Return *token* with a leading ``inv`` word folded into an ``inv_`` prefix."""
    parts = token.split()
    # Empty tokens and tokens that do not start with the word "inv" are
    # passed through verbatim.
    if not parts or parts[0] != "inv":
        return token
    # "inv X on Y" / "inv X at Y": only the first four words are kept.
    if len(parts) >= 4 and parts[2] in ("on", "at"):
        return f"inv_{parts[1]} {parts[2]} {parts[3]}"
    # NOTE(review): this branch keeps the original "inv" word and prepends
    # "inv_" to the whole token (e.g. "inv a b c" -> "inv_inv a b c"). That
    # is the pre-existing behaviour and is preserved here; confirm it is
    # what downstream consumers expect.
    return "inv_" + token


def read_and_correctly_transform_data(file_path):
    """Read a data file and rewrite leading ``inv`` words as ``inv_`` prefixes.

    Each non-blank line is stripped and split on ``", "`` wherever the text
    after the separator looks like ``<word> on `` or ``<word> at ``. Tokens
    whose first whitespace-separated word is ``inv`` are rewritten:

    * ``inv X on Y`` / ``inv X at Y`` becomes ``inv_X on Y`` / ``inv_X at Y``;
    * any other ``inv ...`` token gets ``inv_`` prepended to the whole token.

    All other tokens are kept verbatim, and the tokens of a line are
    re-joined with ``", "``.

    Blank lines are skipped, empty tokens are passed through, and ``inv``
    tokens with fewer than four words use the generic ``inv_`` branch;
    previously each of these inputs raised ``IndexError``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one transformed string per non-blank input line.
    """
    # Split only on separators that are followed by "<word> on " / "<word> at ".
    separator = re.compile(r', (?=\w+ (?:on|at) )')

    transformed_lines = []
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # A blank line carries no tokens; skipping it avoids
                # indexing into an empty word list.
                continue
            tokens = [_normalise_inv_token(tok) for tok in separator.split(line)]
            transformed_lines.append(', '.join(tokens))

    return transformed_lines

# Run the transformation on the data file; the resulting list of
# transformed strings is kept in memory for the following cells.
input_file_path = '/content/transformed_data_no_index.data'
correct_transformed_data = read_and_correctly_transform_data(input_file_path)


In [None]:
import json

def create_json_from_all_data_corrected(data_lines):
    """Turn comma-separated feature lines into JSON-ready dictionaries.

    For every line, each comma-separated feature (whitespace-trimmed)
    becomes a key whose value is ``0`` when the name starts with ``inv_``
    and ``1`` otherwise. Features containing the substring ``negative`` are
    not stored as keys; instead their presence sets ``'label'`` to
    ``'negative'``.

    Returns a list with one dictionary per input line.
    """
    records = []
    for line in data_lines:
        record = {}
        saw_negative = False
        for raw_feature in line.split(','):
            name = raw_feature.strip()
            if 'negative' in name:
                # Label marker: remembered, but never stored as a feature.
                saw_negative = True
                continue
            record[name] = 0 if name.startswith('inv_') else 1
        if saw_negative:
            record['label'] = 'negative'
        records.append(record)

    return records
# Convert every transformed line into its dictionary form.
all_data_json_corrected = create_json_from_all_data_corrected(correct_transformed_data)

# Persist the records as JSON indented by four spaces.
json_file_path_corrected = '/content/transformed_data_json_corrected.json'
with open(json_file_path_corrected, 'w') as json_file:
    json.dump(all_data_json_corrected, json_file, indent=4)

print(f"JSON file saved at: {json_file_path_corrected}")


In [None]:
# Read the saved JSON records back from disk.
# Note: `json` is not imported in this cell; it must already be in scope.
json_file_path_corrected = '/content/transformed_data_json_corrected.json'
with open(json_file_path_corrected, 'r') as json_file:
    all_data_json_corrected = json.load(json_file)

def process_data(data):
    """Return a copy of *data* with complementary keys and a ``label`` added.

    Keys ending in ``"negative"`` or ``"positive"`` are treated as label
    markers: they are left out of the result and the text after the key's
    last underscore becomes the value of ``'label'`` (if several markers are
    present, the last one wins).

    For every remaining key whose value equals ``1``, the complementary key
    is added with value ``0`` unless it is already present: ``inv_X`` is
    complemented by ``X`` and ``X`` by ``inv_X``.

    The input dictionary is not modified; the previous implementation
    deleted the label-marker keys from the caller's dictionary.

    Args:
        data: Mapping of feature names to values.

    Returns:
        A new dictionary containing the non-marker entries, the added
        complementary keys, and ``'label'`` when a marker key was found.
    """
    label_value = None
    features = {}
    for key, value in data.items():
        if key.endswith(("negative", "positive")):
            label_value = key.split('_')[-1]
        else:
            features[key] = value

    new_data = dict(features)
    for key, value in features.items():
        if value != 1:
            continue
        counterpart = key[4:] if key.startswith("inv_") else "inv_" + key
        # Presence is checked against the original entries only, so a
        # counterpart added earlier in this loop does not block a later one.
        if counterpart not in features:
            new_data[counterpart] = 0

    if label_value:
        new_data['label'] = label_value

    return new_data

# Apply the per-record processing to every loaded record.
processed_data = list(map(process_data, all_data_json_corrected))

# Save the processed records as JSON indented by four spaces.
processed_json_file_path = '/content/processed_transformed_data_json.json'
with open(processed_json_file_path, 'w') as json_file:
    json.dump(processed_data, json_file, indent=4)

# Bare expression; presumably left so a notebook cell displays the path.
processed_json_file_path
