<a href="https://colab.research.google.com/github/ammaryasser21/project_OS/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [262]:
from collections import defaultdict
from itertools import combinations
import pandas as pd

def read_data(file_path):
    """Load the spreadsheet at ``file_path`` into a DataFrame.

    The file is parsed with ``pandas.read_excel`` using its default options
    and the resulting frame is returned as-is.
    """
    return pd.read_excel(file_path)

In [263]:
def format_data(df, min_support):
    """Return the data in vertical form, keeping only sufficiently frequent items.

    Two input layouts are handled:

    * Horizontal: ``df`` has a ``'TiD'`` column and an ``'items'`` column
      whose cells are comma-separated item strings. Each item is mapped to
      the set of transaction ids it occurs in, so an item repeated inside one
      transaction is counted once.
    * Anything else is treated as already vertical: a copy of ``df`` is taken
      and its columns are relabelled ``['itemset', 'TiD_set']`` (pandas
      raises ``ValueError`` if ``df`` does not have exactly two columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Input transactions. It is never modified.
    min_support : int
        Minimum number of comma-separated ids a ``TiD_set`` cell must hold
        for its row to be kept.

    Returns
    -------
    pandas.DataFrame
        An independent frame with columns ``'itemset'`` and ``'TiD_set'``
        (ids joined by commas). Rows that survive the filter keep their
        original index labels.
    """
    if 'TiD' in df.columns:
        item_tid_map = defaultdict(set)
        for tid, items in zip(df['TiD'], df['items']):
            for raw_item in items.split(','):
                # Tolerate "A, B" style cells and skip empty tokens ("A,,B").
                item = raw_item.strip()
                if item:
                    item_tid_map[item].add(tid)

        # Built outside the loops so an input without rows yields an empty
        # mapping instead of an undefined name.
        formatted_output = {
            item: ','.join(map(str, sorted(tid_set)))
            for item, tid_set in item_tid_map.items()
        }

        for item, tid_set in sorted(formatted_output.items()):
            print(f"{item: <5} {tid_set}")

        formatted_df = pd.DataFrame({
            'itemset': list(formatted_output.keys()),
            'TiD_set': list(formatted_output.values()),
        })
        print("The data is converted to vertical format.")
    else:
        # Work on a copy so relabelling the columns cannot leak into ``df``.
        formatted_df = df.copy()
        formatted_df.columns = ['itemset', 'TiD_set']
        print("The data is already structured vertically.")

    keep = [len(tids.split(',')) >= min_support for tids in formatted_df['TiD_set']]
    # An explicit boolean Series keeps the selection well-defined even when
    # the frame has no rows.
    mask = pd.Series(keep, index=formatted_df.index, dtype=bool)
    # Return a copy so callers can assign columns without pandas warning
    # about writing into a slice.
    return formatted_df.loc[mask].copy()

In [264]:
def remove_T(row):
    """Drop the leading ``t``/``T`` from each comma-separated token of ``row``.

    Tokens that do not start with that letter (compared case-insensitively)
    are kept unchanged. Tokens are re-joined with commas in their original
    order.
    """
    cleaned = []
    for token in row.split(','):
        has_marker = token.lower().startswith('t')
        cleaned.append(token[1:] if has_marker else token)
    return ','.join(cleaned)

In [265]:
def generate_dict_from_df(formatted_df):
    """Build a mapping from each ``'itemset'`` value to its ids as integers.

    Every row's ``'TiD_set'`` cell is split on commas and each piece is
    converted with ``int``. If the same itemset appears in several rows, the
    last row wins.
    """
    formatted_dict = {}
    for _, record in formatted_df.iterrows():
        tid_tokens = record['TiD_set'].split(',')
        formatted_dict[record['itemset']] = list(map(int, tid_tokens))
    return formatted_dict


In [266]:
def combine_items(filtered_items):
    """Join overlapping itemsets into candidates that are one element longer.

    Two entries at different positions are joined when the first one without
    its leading element equals the second one without its trailing element,
    and the first's leading element differs from the second's trailing one.
    The candidate is the first's leading element followed by the whole second
    entry. Candidates are returned in order of first discovery, without
    duplicates.
    """
    # A dict is used as an insertion-ordered set of candidates.
    candidates = {}
    for left_pos, left in enumerate(filtered_items):
        for right_pos, right in enumerate(filtered_items):
            if left_pos == right_pos:
                continue
            if left[1:] != right[:-1]:
                continue
            if left[0] == right[-1]:
                continue
            candidates.setdefault(left[0] + right, None)
    return list(candidates)

In [267]:
def filter_items(result, formatted_dict, min_support):
    """Keep the candidates whose characters share at least ``min_support`` ids.

    Each candidate in ``result`` is read character by character; the id lists
    stored in ``formatted_dict`` for those characters are intersected and the
    candidate is kept when the intersection has ``min_support`` or more ids.

    A character missing from ``formatted_dict`` contributes an empty id set,
    so the candidate's support becomes zero rather than being computed from
    the remaining characters only. Empty candidates are skipped.

    Parameters
    ----------
    result : iterable of str
        Candidate itemsets, one character per item.
    formatted_dict : dict
        Maps a single item to an iterable of transaction ids.
    min_support : int
        Minimum size of the shared id set.

    Returns
    -------
    list
        The surviving candidates, in their input order.
    """
    filtered_items = []
    for item in result:
        if not item:
            # Nothing to intersect; the original code could not handle this.
            continue
        common_tids = None
        for char in item:
            tids = set(formatted_dict.get(char, ()))
            common_tids = tids if common_tids is None else common_tids & tids
            if not common_tids:
                # The intersection can only shrink, so stop early.
                break
        if len(common_tids) >= min_support:
            filtered_items.append(item)
    return filtered_items


In [268]:
def generate_frequent_list(formatted_dict,min_support,formatted_df):
    """Collect the frequent itemsets level by level.

    The result starts with the values of ``formatted_df['itemset']``. Every
    2-combination of those values is concatenated and passed through
    ``filter_items``; the survivors are appended. Each following level is
    produced by ``combine_items`` from the previous survivors and filtered the
    same way, until a level has no survivors.
    """
    frequent_list = list(formatted_df['itemset'])
    pair_candidates = ["".join(pair) for pair in combinations(frequent_list, 2)]

    current_level = filter_items(pair_candidates, formatted_dict, min_support)
    while current_level:
        frequent_list.extend(current_level)
        next_candidates = combine_items(current_level)
        current_level = filter_items(next_candidates, formatted_dict, min_support)
    return frequent_list

In [269]:
def generate_association_rules(frequent_list):
    """List a ``"X -> Y"`` string for every ordered pair of unequal entries.

    Rules are grouped by antecedent in the order of ``frequent_list``; within
    a group the consequents also follow the order of ``frequent_list``.
    """
    return [
        f"{antecedent} -> {consequent}"
        for antecedent in frequent_list
        for consequent in frequent_list
        if consequent != antecedent
    ]

In [270]:
def generate_strong_association_rules(frequent_list, min_confidence, formatted_dict=None):
    """Return the ``"X -> Y"`` rules whose confidence reaches ``min_confidence``.

    Parameters
    ----------
    frequent_list : list of str
        Frequent itemsets, one character per item.
    min_confidence : float
        Threshold a rule's confidence must reach (``>=``).
    formatted_dict : dict, optional
        Maps a single item to an iterable of transaction ids. When supplied,
        confidence is computed from transaction ids as
        ``|tids(X) & tids(Y)| / |tids(X)|``, and pairs whose antecedent and
        consequent share an item are skipped.

    Returns
    -------
    list of str
        Qualifying rules, ordered by antecedent and then consequent as they
        appear in ``frequent_list``.

    Notes
    -----
    Without ``formatted_dict`` the function keeps its previous behaviour for
    existing callers: "support" is the share of ``frequent_list`` entries
    equal to the itemset, so confidence only reflects whether the
    concatenation ``X + Y`` is itself in ``frequent_list``. That is not a
    transaction-based confidence and it depends on concatenation order; pass
    ``formatted_dict`` to get real confidence values.
    """
    strong_association_rules = []

    if formatted_dict is None:
        total = len(frequent_list)
        # Count every entry once instead of rescanning the list per pair.
        occurrences = {}
        for itemset in frequent_list:
            occurrences[itemset] = occurrences.get(itemset, 0) + 1

        for item in frequent_list:
            support_X = occurrences[item] / total
            for other_item in frequent_list:
                if other_item == item:
                    continue
                support_XY = occurrences.get(item + other_item, 0) / total
                confidence = support_XY / support_X if support_X > 0 else 0
                if confidence >= min_confidence:
                    strong_association_rules.append(f"{item} -> {other_item}")
        return strong_association_rules

    def _tid_set(itemset):
        # Ids shared by every item of the itemset; unknown items share none.
        common = None
        for char in itemset:
            tids = set(formatted_dict.get(char, ()))
            common = tids if common is None else common & tids
        return set() if common is None else common

    tid_sets = {itemset: _tid_set(itemset) for itemset in frequent_list}

    for item in frequent_list:
        antecedent_tids = tid_sets[item]
        if not antecedent_tids:
            # Confidence is undefined for an antecedent that never occurs.
            continue
        for other_item in frequent_list:
            if not other_item or set(item) & set(other_item):
                # Skip empty consequents and overlapping sides (this also
                # covers ``other_item == item``).
                continue
            joint = antecedent_tids & tid_sets[other_item]
            confidence = len(joint) / len(antecedent_tids)
            if confidence >= min_confidence:
                strong_association_rules.append(f"{item} -> {other_item}")
    return strong_association_rules

In [271]:
def calculate_support(combinations_list, appearances_dict):
    """Count, for each itemset, the transaction ids shared by all its items.

    Each entry of ``combinations_list`` is read character by character and
    the id collections stored in ``appearances_dict`` for those characters
    are intersected. The previous implementation intersected with
    ``range(1, count + 1)`` instead of the running id set, which miscounted
    whenever the shared ids were not exactly ``1..count``.

    Parameters
    ----------
    combinations_list : iterable of str
        Itemsets, one character per item.
    appearances_dict : dict
        Maps a single item to an iterable of transaction ids.

    Returns
    -------
    dict
        Maps every itemset to its support count. The count is 0 for an empty
        itemset and for an itemset containing an item that is missing from
        ``appearances_dict``.
    """
    lens_of_appearance = {}

    for comb in combinations_list:
        common_tids = None
        for char in comb:
            if char not in appearances_dict:
                common_tids = set()
                break
            tids = set(appearances_dict[char])
            common_tids = tids if common_tids is None else common_tids & tids
            if not common_tids:
                # Once empty, the intersection stays empty.
                break
        lens_of_appearance[comb] = len(common_tids) if common_tids else 0

    return lens_of_appearance

In [272]:
def calculate_lift(rule, support_values, total_transactions=None):
    """Compute the lift of a ``"X -> Y"`` rule from stored support values.

    Parameters
    ----------
    rule : str
        Rule text in the form ``"X -> Y"``.
    support_values : dict
        Maps an itemset string to its support.
    total_transactions : int, optional
        Number of transactions the supports were counted over. Supply it when
        ``support_values`` holds absolute counts; the result is then
        ``support(XY) * total_transactions / (support(X) * support(Y))``.
        When omitted, the previous formula
        ``support(XY) / (support(X) * support(Y))`` is used, which is only a
        true lift if the supports are already relative frequencies.

    Returns
    -------
    float or int
        The lift, or ``0`` when any of the three supports is missing or zero.

    Raises
    ------
    ValueError
        If ``rule`` does not split into exactly two parts on ``' -> '``.
    """
    lhs, rhs = rule.split(' -> ')
    support_lhs = support_values.get(lhs, 0)
    support_rhs = support_values.get(rhs, 0)

    joined = lhs + rhs
    combined_support = support_values.get(joined, 0)
    if combined_support == 0:
        # The joint itemset may be stored with its items in another order
        # (e.g. "MK" for the rule "K -> M"); match on content instead.
        wanted_items = frozenset(joined)
        for itemset, support in support_values.items():
            if len(itemset) == len(joined) and frozenset(itemset) == wanted_items:
                combined_support = support
                break

    if support_lhs == 0 or support_rhs == 0 or combined_support == 0:
        return 0

    if total_transactions is None:
        return combined_support / (support_lhs * support_rhs)
    return combined_support * total_transactions / (support_lhs * support_rhs)

In [273]:
def ECLAT(file_path, min_support, min_confidence):
    """Run the whole pipeline on one spreadsheet, printing every stage.

    Steps, each followed by a ``print`` of its result:

    1. read the file with ``read_data``;
    2. convert and prune it with ``format_data``;
    3. clean the ``'TiD_set'`` column with ``remove_T``;
    4. build the item -> id-list dictionary;
    5. generate the frequent itemsets;
    6. generate all rules, then the rules passing ``min_confidence``;
    7. compute supports and print the lift of every strong rule.

    Parameters
    ----------
    file_path : str
        Path of the spreadsheet to analyse.
    min_support : int
        Minimum support passed to ``format_data`` and
        ``generate_frequent_list``.
    min_confidence : float
        Threshold passed to ``generate_strong_association_rules``.

    Returns
    -------
    None
        Results are only printed.
    """
    df = read_data(file_path)
    print(df)
    formatted_df = format_data(df, min_support)
    print(formatted_df)
    # Build a new frame rather than assigning a column into the filtered
    # frame returned by format_data, which pandas may flag as writing to a
    # slice (SettingWithCopyWarning).
    formatted_df = formatted_df.assign(TiD_set=formatted_df['TiD_set'].apply(remove_T))
    print(formatted_df)
    formatted_dict = generate_dict_from_df(formatted_df)
    print(formatted_dict)
    frequent_list=generate_frequent_list(formatted_dict,min_support,formatted_df)
    print(frequent_list)
    rules = generate_association_rules(frequent_list)
    print(rules)
    strong_rules = generate_strong_association_rules(frequent_list, min_confidence)
    print(strong_rules)
    support = calculate_support(frequent_list, formatted_dict)
    print(support)
    for rule in strong_rules:
        lift = calculate_lift(rule, support)
        print(f"Lift for rule '{rule}': {lift}")


In [274]:
file_path_1 = '/content/Horizontal_Format.xlsx'
file_path_2 = '/content/Book1.xlsx'
# Run the pipeline on both workbooks with the same thresholds:
# minimum support 2 and minimum confidence 0.3.
for workbook_path in (file_path_1, file_path_2):
    ECLAT(workbook_path, 2, 0.3)

   TiD        items
0    1  M,O,N,K,E,Y
1    2  D,O,N,K,E,Y
2    3      M,A,K,E
3    4    M,U,C,K,Y
4    5  C,O,O,K,I,E
A     3
C     4,5
D     2
E     1,2,3,5
I     5
K     1,2,3,4,5
M     1,3,4
N     1,2
O     1,2,5
U     4
Y     1,2,4
The data is converted to vertical format.
  itemset    TiD_set
0       M      1,3,4
1       O      1,2,5
2       N        1,2
3       K  1,2,3,4,5
4       E    1,2,3,5
5       Y      1,2,4
9       C        4,5
  itemset    TiD_set
0       M      1,3,4
1       O      1,2,5
2       N        1,2
3       K  1,2,3,4,5
4       E    1,2,3,5
5       Y      1,2,4
9       C        4,5
{'M': [1, 3, 4], 'O': [1, 2, 5], 'N': [1, 2], 'K': [1, 2, 3, 4, 5], 'E': [1, 2, 3, 5], 'Y': [1, 2, 4], 'C': [4, 5]}
['M', 'O', 'N', 'K', 'E', 'Y', 'C', 'MK', 'ME', 'MY', 'ON', 'OK', 'OE', 'OY', 'NK', 'NE', 'NY', 'KE', 'KY', 'KC', 'EY', 'MKE', 'MKY', 'ONK', 'ONE', 'ONY', 'OKE', 'OKY', 'OEY', 'NKE', 'NKY', 'NEY', 'KEY', 'ONKE', 'ONKY', 'ONEY', 'OKEY', 'NKEY', 'ONKEY']
['M -> O', 'M -