# EXERCISE 1

* Implement the Apriori algorithm tailored for sequence data.
* It calculates the frequent sequences within a Transaction Dataset, 'SepsisCases.xes'.
* it extracts association rules from the frequent sequences to identify significant patterns in the data.

<br>
The 'SepsisCases.xes' is a real-life event log contains events of sepsis cases from a hospital. There are about 1000 cases with in total 15,000 events that were recorded for 16 different activities.

## Dataset

In [3]:
# Parse the 'SepsisCases.xes' file using the ElementTree module to extract trace information.

import xml.etree.ElementTree as ET
import pandas as pd
from functools import reduce
import numpy as np
import matplotlib.pyplot as plt

tree = ET.parse('SepsisCases.xes')
t = tree.findall('.//trace') # trace is a single case in the dataset

# Extract transactions from the parsed XML and store them in 'TD_AS_LIST'.
# 1 Iterates through each <trace> (i.e., each case).
# 2 Extracts the sequence of events (<event>) within that trace.
# 3 For each <event>, it retrieves the value of the attribute where key="concept:name" (i.e., the event's name).
# 4 Stores the event names as a list of sequences, where each list represents a patient's event history.
sequences =[[e.find('.//string[@key =\'concept:name\']').attrib['value']  for e in list(t.findall('.//event'))] for t in tree.findall('.//trace')]
sequences #A list of lists, where each inner list represents a sequence of events for a single patient.

[['ER Registration',
  'Leucocytes',
  'CRP',
  'LacticAcid',
  'ER Triage',
  'ER Sepsis Triage',
  'IV Liquid',
  'IV Antibiotics',
  'Admission NC',
  'CRP',
  'Leucocytes',
  'Leucocytes',
  'CRP',
  'Leucocytes',
  'CRP',
  'CRP',
  'Leucocytes',
  'Leucocytes',
  'CRP',
  'CRP',
  'Leucocytes',
  'Release A'],
 ['ER Registration',
  'ER Triage',
  'CRP',
  'LacticAcid',
  'Leucocytes',
  'ER Sepsis Triage',
  'IV Liquid',
  'IV Antibiotics',
  'Admission NC',
  'CRP',
  'CRP',
  'Release A'],
 ['ER Registration',
  'ER Triage',
  'ER Sepsis Triage',
  'Leucocytes',
  'CRP',
  'IV Liquid',
  'IV Antibiotics',
  'Admission NC',
  'Admission NC',
  'Leucocytes',
  'CRP',
  'Leucocytes',
  'CRP',
  'Release A'],
 ['ER Registration',
  'ER Triage',
  'ER Sepsis Triage',
  'CRP',
  'LacticAcid',
  'Leucocytes',
  'IV Liquid',
  'IV Antibiotics',
  'Admission NC',
  'Leucocytes',
  'CRP',
  'Release A',
  'Return ER'],
 ['ER Registration',
  'ER Triage',
  'ER Sepsis Triage',
  'IV Liqu

In [4]:
seq_sets = [set(s) for s in sequences] #removes duplicate events within each sequence. Ensures that each patient's event list is represented as a set of unique events.
seq_items = reduce(lambda x, y: x.union(y), seq_sets) #It combines two sets by taking their union
seq_items #This represents all possible events recorded across all patient cases.

{'Admission IC',
 'Admission NC',
 'CRP',
 'ER Registration',
 'ER Sepsis Triage',
 'ER Triage',
 'IV Antibiotics',
 'IV Liquid',
 'LacticAcid',
 'Leucocytes',
 'Release A',
 'Release B',
 'Release C',
 'Release D',
 'Release E',
 'Return ER'}

## Apriori Algorithm 

In [5]:
#Calculate support (frequency of a sequence in the dataset).
#Support is the proportion of transactions (patient event logs) that contain sequence.
def support(transaction_dataset, sequence):
        count = sum(1 for transaction in transaction_dataset if is_subsequence(sequence, transaction))
        return count / len(transaction_dataset)

def is_subsequence(sequence, transaction):
        it = iter(transaction)
        return all(any(item == x for x in it) for item in sequence)

def get_prefix_subsequences(sequence):
        return [sequence[:i] for i in range(len(sequence))]

In [6]:

def apriori_sequential_patterns(transaction_dataset, min_support): #Uses min_support to determine whether a sequence is frequent.

    frequent_sequences = []  # List to hold all frequent sequences

    # Get unique items in the dataset and generate candidate items
    unique_items = set(item for sublist in transaction_dataset for item in sublist)
    candidate_items = [[item] for item in unique_items]
    candidates = candidate_items

    while candidates:
        current_frequent_sequences = [seq for seq in candidates if support(transaction_dataset, seq) >= min_support] #Keeps only those sequences that occur at least min_support times in the dataset.
        if not current_frequent_sequences:
            break
        frequent_sequences.extend(current_frequent_sequences)


        new_candidates = set()
        for sequence in current_frequent_sequences: #Takes each frequent sequence from current_frequent_sequences.
            for item in candidate_items: #Appends each possible item from candidate_items to form a longer sequence.
                extended_sequence = sequence + item
                if len(extended_sequence) > 4:
                    continue
                if support(transaction_dataset, extended_sequence) >= min_support:
                    new_candidates.add(tuple(extended_sequence))

        candidates = [list(seq) for seq in new_candidates] #remove duplicates

    # Stops when no more patterns are found
    return frequent_sequences


In [7]:
EPSILON = 0.05

In [8]:
freq_seq = apriori_sequential_patterns(sequences, EPSILON)
print("The frequent sequences in the Transaction Dataset are:", freq_seq)

The frequent sequences in the Transaction Dataset are: [['Admission NC'], ['Admission IC'], ['Release B'], ['IV Liquid'], ['ER Triage'], ['Leucocytes'], ['Release A'], ['ER Sepsis Triage'], ['ER Registration'], ['IV Antibiotics'], ['LacticAcid'], ['Return ER'], ['CRP'], ['Admission NC', 'CRP'], ['ER Registration', 'ER Triage'], ['ER Sepsis Triage', 'Admission IC'], ['Leucocytes', 'IV Liquid'], ['ER Triage', 'Admission IC'], ['ER Sepsis Triage', 'Release B'], ['IV Antibiotics', 'Return ER'], ['LacticAcid', 'ER Sepsis Triage'], ['CRP', 'IV Liquid'], ['ER Triage', 'Release B'], ['LacticAcid', 'Admission IC'], ['ER Sepsis Triage', 'Admission NC'], ['ER Sepsis Triage', 'LacticAcid'], ['Admission IC', 'Release A'], ['Leucocytes', 'IV Antibiotics'], ['Leucocytes', 'CRP'], ['IV Liquid', 'Leucocytes'], ['ER Triage', 'Admission NC'], ['ER Triage', 'LacticAcid'], ['CRP', 'CRP'], ['ER Registration', 'Leucocytes'], ['Admission IC', 'Leucocytes'], ['ER Sepsis Triage', 'IV Liquid'], ['ER Registration

## Support 

In [9]:
# Plot the support distribution histogram of the frequent sequences.

support_values = [support(sequences, seq) for seq in freq_seq]
plt.hist(support_values, bins=20)
plt.xlabel('Support')
plt.ylabel('Frequency')
plt.title('Support Distribution of Frequent Sequences')
plt.show()



## Association Rules
Association rules highlight patterns where the presence of certain events (antecedent) leads to the occurrence of others (consequent), so reveal correlations between items



In [8]:
def extract_association_rules(frequent_sequences, transaction_dataset, confidence_threshold):
    association_rules = []
    for sequence in frequent_sequences:
        sequence_support = support(transaction_dataset, sequence) #calculates the support of the sequence
        for i in range(1, len(sequence)):
            #This process generates all possible splits of the sequence into antecedent and consequent.
            antecedent = sequence[:i]
            consequent = sequence[i:]
            antecedent_support = support(transaction_dataset, antecedent)
            #Confidence measures how often the consequent happens when the antecedent occurs.
            confidence = sequence_support / antecedent_support
            if confidence >= confidence_threshold:
                association_rules.append((antecedent, consequent, confidence))
    return association_rules

confidence_threshold = 0.8

association_rules = extract_association_rules(freq_seq, sequences, confidence_threshold)

print("Association Rules:")
for antecedent, consequent, confidence in association_rules:
    print(f"{antecedent} -> {consequent} (confidence: {confidence})")

Association Rules:
['CRP'] -> ['Leucocytes'] (confidence: 0.8152929493545183)
['ER Sepsis Triage'] -> ['Leucocytes'] (confidence: 0.9008579599618685)
['IV Liquid'] -> ['IV Antibiotics'] (confidence: 0.8791500664010624)
['Admission IC'] -> ['LacticAcid'] (confidence: 0.9636363636363635)
['ER Triage'] -> ['ER Sepsis Triage'] (confidence: 0.9828571428571429)
['Admission NC'] -> ['Release A'] (confidence: 0.8375)
['ER Registration'] -> ['CRP'] (confidence: 0.9561904761904761)
['IV Antibiotics'] -> ['Admission NC'] (confidence: 0.8396111786148237)
['Leucocytes'] -> ['CRP'] (confidence: 0.8557312252964427)
['IV Liquid'] -> ['Admission NC'] (confidence: 0.8154050464807437)
['ER Triage'] -> ['Leucocytes'] (confidence: 0.9438095238095238)
['LacticAcid'] -> ['CRP'] (confidence: 0.8430232558139535)
['IV Liquid'] -> ['Leucocytes'] (confidence: 0.8100929614873839)
['ER Registration'] -> ['LacticAcid'] (confidence: 0.8171428571428572)
['ER Registration'] -> ['ER Sepsis Triage'] (confidence: 0.992380

## Extracting significant pattern in the data
Prints out each association rule in a readable format, showing the antecedent, consequent, and the confidence of the rule

In [12]:
for rule in association_rules:
    print(f"Rule: {rule[0]} => {rule[1]} (Confidence: {round(rule[2], 2)})")

Rule: ['CRP'] => ['Leucocytes'] (Confidence: 0.82)
Rule: ['ER Sepsis Triage'] => ['Leucocytes'] (Confidence: 0.9)
Rule: ['IV Liquid'] => ['IV Antibiotics'] (Confidence: 0.88)
Rule: ['Admission IC'] => ['LacticAcid'] (Confidence: 0.96)
Rule: ['ER Triage'] => ['ER Sepsis Triage'] (Confidence: 0.98)
Rule: ['Admission NC'] => ['Release A'] (Confidence: 0.84)
Rule: ['ER Registration'] => ['CRP'] (Confidence: 0.96)
Rule: ['IV Antibiotics'] => ['Admission NC'] (Confidence: 0.84)
Rule: ['Leucocytes'] => ['CRP'] (Confidence: 0.86)
Rule: ['IV Liquid'] => ['Admission NC'] (Confidence: 0.82)
Rule: ['ER Triage'] => ['Leucocytes'] (Confidence: 0.94)
Rule: ['LacticAcid'] => ['CRP'] (Confidence: 0.84)
Rule: ['IV Liquid'] => ['Leucocytes'] (Confidence: 0.81)
Rule: ['ER Registration'] => ['LacticAcid'] (Confidence: 0.82)
Rule: ['ER Registration'] => ['ER Sepsis Triage'] (Confidence: 0.99)
Rule: ['Admission IC'] => ['CRP'] (Confidence: 0.98)
Rule: ['LacticAcid'] => ['Admission NC'] (Confidence: 0.84)
Rul