In [1]:
import tkinter as tk
from tkinter import filedialog
import pandas as pd
import numpy as np
import time

def edit_distance(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the smallest number of single-element insertions,
    deletions and substitutions that turns ``s1`` into ``s2``.

    Args:
        s1: First sequence (indexable, supports ``len``).
        s2: Second sequence (indexable, supports ``len``).

    Returns:
        The edit distance as a non-negative integer.
    """
    rows = len(s1) + 1
    cols = len(s2) + 1

    # table[r][c] holds the distance between the prefixes s1[:r] and s2[:c].
    table = [[0] * cols for _ in range(rows)]

    # Borders: turning a prefix into the empty sequence costs its length.
    for r in range(1, rows):
        table[r][0] = r
    for c in range(1, cols):
        table[0][c] = c

    for r in range(1, rows):
        above = table[r - 1]
        current = table[r]
        for c in range(1, cols):
            if s1[r - 1] == s2[c - 1]:
                # Matching elements cost nothing extra.
                current[c] = above[c - 1]
            else:
                # Cheapest of deletion, insertion and substitution.
                current[c] = 1 + min(above[c], current[c - 1], above[c - 1])

    return table[rows - 1][cols - 1]

def count_duplicates(records):
    """Group records that are identical apart from their ID and describe the groups.

    Args:
        records: Iterable of sequences whose first element is the record ID
            and whose remaining elements are the values to compare.

    Returns:
        A list of description strings, one for every group that contains
        more than one record, in the order the groups were first seen.

    Side effects:
        Packs a new label into the module-level ``window`` showing how many
        records belong to a duplicate group.
    """
    groups = {}

    for row in records:
        record = tuple(row[1:])  # everything except the ID column

        if record in groups:
            groups[record]['count'] += 1
            groups[record]['ids'].append(row[0])
        else:
            groups[record] = {'count': 1, 'ids': [row[0]]}

    duplicates = []
    total = 0
    # A distinct loop name is used on purpose: the dictionary being iterated
    # must not be rebound by its own loop target.
    for record, details in groups.items():
        if details['count'] > 1:
            id_text = ', '.join(str(record_id) for record_id in details['ids'])
            duplicates.append(f"IDs: {id_text}, {record}: duplicate count: {details['count']}")
            total = total + details['count']

    lbl_total = tk.Label(window, text="Number of Duplicates: {}".format(total))
    lbl_total.pack()

    return duplicates

def show_column_selection_dialog(columns):
    """Show a modal checkbox dialog and record the ticked columns.

    One checkbox is created per entry of ``columns``. When the user presses
    "Submit", the ticked entries are stored in the module-level
    ``selected_columns`` list and the dialog is destroyed. The call blocks
    until the dialog window no longer exists.

    Args:
        columns: Iterable of column labels to offer.
    """
    dialog = tk.Toplevel(window)
    dialog.title("Select Columns")

    # Scrollable area: a canvas hosting a frame, driven by a vertical scrollbar.
    canvas = tk.Canvas(dialog)
    scrollbar = tk.Scrollbar(dialog, orient="vertical", command=canvas.yview)
    inner = tk.Frame(canvas)

    def refresh_scroll_region(_event):
        # Keep the scrollable region in sync with the frame's current size.
        canvas.configure(scrollregion=canvas.bbox("all"))

    inner.bind("<Configure>", refresh_scroll_region)

    canvas.create_window((0, 0), window=inner, anchor="nw")
    canvas.pack(side="left", fill="both", expand=True)
    scrollbar.pack(side="right", fill="y")
    canvas.config(yscrollcommand=scrollbar.set)

    # Maps each column label to the variable backing its checkbox.
    checked = {}
    for name in columns:
        flag = tk.BooleanVar()
        box = tk.Checkbutton(inner, text=name, variable=flag)
        box.pack(anchor='w')
        checked[name] = flag

    def submit_selection():
        global selected_columns
        selected_columns = [name for name, flag in checked.items() if flag.get()]
        dialog.destroy()

    submit_button = tk.Button(dialog, text="Submit", command=submit_selection)
    submit_button.pack(pady=10)

    # Make the dialog modal and wait for it to be closed.
    dialog.transient(window)
    dialog.grab_set()
    window.wait_window(dialog)

def upload_file():
    """Load a CSV chosen by the user and report exact and probable duplicates.

    Steps:
      1. Ask for a CSV file and read it with ISO-8859-1 encoding.
      2. Let the user pick the columns to compare; rows with a missing value
         in any picked column are dropped.
      3. Concatenate the picked columns into a ``std_att`` key per row.
      4. Compare keys pairwise inside consecutive chunks of 50 rows using
         ``edit_distance``.
      5. Show pairs at distance 1 or 2 as probable duplicates, and groups of
         identical keys (among rows within the distance threshold) as exact
         duplicates, in two text widgets added to ``window``.

    The record identifier is read from the ``id`` column when the file has
    one, otherwise from the first column of the file.

    Side effects:
        Resets and fills the module-level ``selected_columns``, updates
        ``file_name``, adds widgets to ``window`` and prints progress
        information to stdout.
    """
    global selected_columns
    selected_columns = []  # reset so a dismissed dialog is not mistaken for a selection

    file = filedialog.askopenfilename(title="Select file", filetypes=[("CSV files", '.csv')])
    if not file:
        return

    threshold = 5
    chunk_size = 50

    # `infer_datetime_format` is deprecated in pandas and is not needed
    # because no date parsing is requested here.
    df = pd.read_csv(file, encoding='ISO-8859-1')

    # Only announce success once the file has actually been read.
    lbl_my_file = tk.Label(window, text="File read!", font=("arial", 12, "bold"))
    lbl_my_file.pack()

    file_name.set(file)

    # Show column selection dialog (blocks until it is closed)
    show_column_selection_dialog(df.columns)

    if not selected_columns:
        lbl_my_file.config(text="No columns selected. Please try again.")
        return

    # Fall back to the first column for files that have no 'id' column.
    id_column = 'id' if 'id' in df.columns else df.columns[0]

    # Clean up the data
    df.dropna(subset=selected_columns, inplace=True)

    # Create the `std_att` column dynamically based on selected columns
    df["std_att"] = df[selected_columns].astype(str).agg(''.join, axis=1)
    std_att = df["std_att"]

    print("............Dataset..........")
    for _, row in df.iterrows():
        print(f"ID: {row[id_column]}, std_att: {row['std_att']}")

    mylist = tk.Text(window, height=15, width=150)
    mylist.pack(side=tk.TOP, padx=10, pady=10, expand=True)

    mylist1 = tk.Text(window, height=15, width=150)
    mylist1.pack(side=tk.BOTTOM, padx=10, pady=10, expand=True)

    # Plain lists avoid materialising a row Series for every comparison.
    all_ids = df[id_column].tolist()
    all_keys = std_att.tolist()

    comparison = 0
    probable = []
    probable_dup = []
    seen_ids = set()
    unique_std_att = []

    print("...........Edit Distance without sorting: ...........")
    for start in range(0, len(all_keys), chunk_size):
        ids = all_ids[start:start + chunk_size]
        keys = all_keys[start:start + chunk_size]
        size = len(keys)
        for j in range(size):
            for k in range(j + 1, size):
                dist = edit_distance(keys[j], keys[k])
                comparison += 1

                id_j = str(ids[j])
                id_k = str(ids[k])
                message = f"Edit distance between ({id_j}){keys[j]} and ({id_k}){keys[k]} : {dist}"

                if dist == 2 or dist == 1:
                    probable_dup.append((dist, message))

                if dist <= threshold:
                    probable.append((dist, message))

                    # Remember each record once, in first-seen order.
                    if id_j not in seen_ids:
                        seen_ids.add(id_j)
                        unique_std_att.append((ids[j], keys[j]))

                    if id_k not in seen_ids:
                        seen_ids.add(id_k)
                        unique_std_att.append((ids[k], keys[k]))

    print("...........Edit Distance in Sorted order.............")
    probable.sort(key=lambda item: item[0])
    for _, message in probable:
        print(message)

    probable_dup.sort(key=lambda item: item[0])
    mylist1.insert(tk.END, "Probable Duplicates:\n")
    for _, message in probable_dup:
        mylist1.insert(tk.END, f"{message}\n")
    mylist1.config(state='disabled')

    print("..........Dataset within the edit distance with unique id.............")
    print(unique_std_att)
    print("Length of unique attribute(edit distance): ", len(unique_std_att))

    duplicates = count_duplicates(unique_std_att)
    mylist.insert(tk.END, "Exact Duplicates:\n")
    for number, description in enumerate(duplicates, start=1):
        mylist.insert(tk.END, f"{number}: {description}\n")
    mylist.config(state='disabled')

    lbl_data = tk.Label(window, text="Number of data in dataset: {}".format(len(std_att)))
    lbl_data.pack()

    lbl_comparison = tk.Label(window, text="Number of Comparison: {}".format(comparison))
    lbl_comparison.pack()

# Columns chosen in the selection dialog; reset by upload_file() and filled
# by show_column_selection_dialog().
selected_columns = []

# Create the main window
window = tk.Tk()
window.geometry("1000x800")
window.title("Data Redundancy Detection System")
title_lbl = tk.Label(window, text="Upload .csv file", font=("arial", 30, "italic bold"), bd=7)
title_lbl.pack()

# The button starts the whole load / select / compare workflow.
button = tk.Button(window, text="Upload File", relief=tk.RAISED, font=("arial", 15, "bold"), width=15, bg="light blue", command=upload_file)
button.pack()

# file_name is set by upload_file() to the chosen path and displayed below.
# NOTE(review): file_data and my_file are not referenced anywhere else in the
# visible code - confirm whether they are still needed.
file_name = tk.StringVar()
file_data = tk.StringVar()
my_file = tk.StringVar()

lbl_name = tk.Label(window, textvariable=file_name)
lbl_name.pack()

# Hand control to Tk; this call returns only when the main window is closed.
window.mainloop()


  df = pd.read_csv(file, infer_datetime_format=True, encoding='ISO-8859-1')


............Dataset..........
ID: 121, std_att: Tek Bahadur Bista280
ID: 122, std_att: Tek Bahadur Bista280
ID: 125, std_att: Pratima Neupane190
ID: 142, std_att: Bal Bd. Sarki330
ID: 143, std_att: Dipak Singh Sarki440
ID: 144, std_att: Shree Koli320
ID: 146, std_att: Gaurab Bahadur Bist240
ID: 147, std_att: Krisha Sharki20
ID: 148, std_att: Narayan Prasad Timilsena330
ID: 151, std_att: suman Chapagai280
ID: 180, std_att: Bishnu Prasad Timilsena330
ID: 181, std_att: Puran Karki300
ID: 182, std_att: Parash Bista200
ID: 183, std_att: Jivan Raj Upadhyay350
ID: 184, std_att: Tapendra Upadhyay320
ID: 185, std_att: Dhurba Bista180
ID: 186, std_att: Parash Bista190
ID: 187, std_att: Khem Bista190
ID: 188, std_att: Laxman Bista180
ID: 189, std_att: Hikmat Bista170
ID: 190, std_att: Rabi Prasad Jaisi200
ID: 191, std_att: Nar Bahadur Bista620
ID: 192, std_att: Janak Bista540
ID: 193, std_att: Topendra Kumar Shahi430
ID: 194, std_att: Khem Raj Regmi330
ID: 195, std_att: Sher Bahadur Shahi280
ID: 

In [3]:
import tkinter as tk
from tkinter import filedialog, Toplevel, Checkbutton, IntVar, Button, Label, Text, StringVar, Scrollbar, Frame
import pandas as pd
import numpy as np
import time

def edit_distance(s1, s2):
    """Compute the Levenshtein distance between two sequences.

    Counts the minimum number of single-element insertions, deletions and
    substitutions required to transform ``s1`` into ``s2``.

    Args:
        s1: Source sequence.
        s2: Target sequence.

    Returns:
        The distance as a non-negative integer.
    """
    # Distances from the empty prefix of s1 to every prefix of s2.
    previous = list(range(len(s2) + 1))

    for i, left in enumerate(s1, start=1):
        # First cell: removing all i elements of s1[:i] yields the empty prefix.
        current = [i]
        for j, right in enumerate(s2, start=1):
            if left == right:
                current.append(previous[j - 1])
            else:
                cheapest = min(previous[j], current[j - 1], previous[j - 1])
                current.append(cheapest + 1)
        previous = current

    return previous[-1]

def count_duplicates(records, id_column_name):
    """Describe every group of records that share the same non-ID values.

    Args:
        records: Iterable of sequences; element 0 is the record ID and the
            remaining elements are the values that are compared.
        id_column_name: Name of the ID column. Accepted for the caller's
            convenience; the body does not use it.

    Returns:
        A list of description strings, one per group with more than one
        record, in first-seen order.

    Side effects:
        Packs a label with the total number of grouped records into the
        module-level ``window``.
    """
    # Maps the compared values of a record to the IDs that carry them.
    ids_by_values = {}
    for row in records:
        ids_by_values.setdefault(tuple(row[1:]), []).append(row[0])

    duplicates = []
    total = 0
    for values, ids in ids_by_values.items():
        size = len(ids)
        if size <= 1:
            continue
        joined_ids = ', '.join(str(record_id) for record_id in ids)
        duplicates.append(f"IDs: {joined_ids}, {values}: duplicate count: {size}")
        total += size

    lbl_total = Label(window, text=f"Number of Duplicates: {total}")
    lbl_total.pack()

    return duplicates

def upload_file():
    """Ask for a CSV file, load it and open the column selection popup.

    Does nothing when the file dialog is dismissed. On success the chosen
    path is stored in ``file_name``, a status label is added to ``window``
    and ``show_column_selection_popup`` is called with the loaded frame.
    """
    file = filedialog.askopenfilename(title="Select file", filetypes=[("CSV files", '*.csv')])
    if not file:
        return

    # Load the CSV file. `infer_datetime_format` is deprecated in pandas and
    # is not needed because no date parsing is requested here.
    df = pd.read_csv(file, encoding='ISO-8859-1')

    # Only announce success once the file has actually been read.
    lbl_my_file = Label(window, text="File read!", font=("arial", 12, "bold"))
    lbl_my_file.pack()

    file_name.set(file)

    # Show column selection popup
    show_column_selection_popup(df)

def show_column_selection_popup(df):
    """Open a popup that lets the user tick the columns used for ``std_att``.

    One checkbox is shown per column of ``df``. Pressing "Submit" closes the
    popup and passes the ticked columns to ``create_std_att``. If nothing is
    ticked, a hint is shown in the main window and no processing is started.

    Args:
        df: The loaded data frame whose columns are offered.
    """
    # Create a new window for column selection
    popup = Toplevel(window)
    popup.title("Select Columns for std_att")
    popup.geometry("400x600")

    # Create a frame for the checkboxes and scrollbars
    frame = Frame(popup)
    frame.pack(fill=tk.BOTH, expand=True)

    # Add a scrollbar to the frame
    scrollbar_y = Scrollbar(frame, orient=tk.VERTICAL)
    scrollbar_y.pack(side=tk.RIGHT, fill=tk.Y)

    canvas = tk.Canvas(frame, yscrollcommand=scrollbar_y.set)
    canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

    scrollbar_y.config(command=canvas.yview)

    # Create a frame inside the canvas to hold the checkboxes
    checkbox_frame = Frame(canvas)
    canvas.create_window((0, 0), window=checkbox_frame, anchor=tk.NW)

    def on_frame_configure(event):
        # Keep the scrollable region in sync with the checkbox frame's size.
        canvas.configure(scrollregion=canvas.bbox("all"))

    checkbox_frame.bind("<Configure>", on_frame_configure)

    # (variable, column name) pairs, one per checkbox
    column_vars = []
    for col in df.columns:
        var = IntVar()
        chk = Checkbutton(checkbox_frame, text=col, variable=var)
        chk.pack(anchor=tk.W)
        column_vars.append((var, col))

    def update_selection():
        chosen = [col_name for var, col_name in column_vars if var.get()]
        popup.destroy()

        if not chosen:
            # Without any column there is nothing meaningful to compare.
            Label(window, text="No columns selected. Please try again.").pack()
            return

        # Proceed with using selected columns
        create_std_att(df, chosen)

    # Create a submit button
    submit_btn = Button(popup, text="Submit", command=update_selection)
    submit_btn.pack(pady=10)

def create_std_att(df, selected_columns):
    """Build the comparison key for each row and report duplicates in the GUI.

    The first column of ``df`` is treated as the record identifier. Rows with
    a missing value in any selected column are dropped (``df`` is modified in
    place) and the selected columns are concatenated into a ``std_att`` key.
    Keys are compared pairwise inside consecutive chunks of 50 rows using
    ``edit_distance``. Pairs at distance 1 or 2 are listed as probable
    duplicates; records within distance 5 of another record are grouped by
    identical key and listed as exact duplicates.

    Args:
        df: Data frame to analyse; gains a ``std_att`` column.
        selected_columns: Column labels that make up the comparison key.

    Side effects:
        Adds two text widgets and summary labels to the module-level
        ``window`` and prints progress information to stdout.
    """
    # Dynamically get the first column name
    id_column_name = df.columns[0]

    # Clean up the data
    df.dropna(subset=selected_columns, inplace=True)

    # Create std_att attribute based on selected columns
    df['std_att'] = df[selected_columns].astype(str).agg(''.join, axis=1)
    std_att = df['std_att']

    print("............Dataset..........")
    for _, row in df.iterrows():
        print(f"ID: {row[id_column_name]}, std_att: {row['std_att']}")

    mylist = Text(window, height=15, width=150)
    mylist.pack(side=tk.TOP, padx=10, pady=10, expand=True)

    mylist1 = Text(window, height=15, width=150)
    mylist1.pack(side=tk.BOTTOM, padx=10, pady=10, expand=True)

    threshold = 5
    chunk_size = 50

    # Plain lists avoid materialising a row Series for every comparison.
    all_ids = df[id_column_name].tolist()
    all_keys = std_att.tolist()

    comparison = 0
    probable = []
    probable_dup = []
    seen_ids = set()
    unique_std_att = []

    print("...........Edit Distance without sorting: ...........")
    for start in range(0, len(all_keys), chunk_size):
        ids = all_ids[start:start + chunk_size]
        keys = all_keys[start:start + chunk_size]
        size = len(keys)
        for j in range(size):
            for k in range(j + 1, size):
                dist = edit_distance(keys[j], keys[k])
                comparison += 1  # size * (size - 1) / 2 per chunk in total

                id_j = str(ids[j])
                id_k = str(ids[k])
                message = f"Edit distance between ({id_j}){keys[j]} and ({id_k}){keys[k]} : {dist}"

                if dist == 2 or dist == 1:
                    probable_dup.append((dist, message))

                if dist <= threshold:
                    probable.append((dist, message))

                    # Remember each record once, in first-seen order.
                    if id_j not in seen_ids:
                        seen_ids.add(id_j)
                        unique_std_att.append((ids[j], keys[j]))

                    if id_k not in seen_ids:
                        seen_ids.add(id_k)
                        unique_std_att.append((ids[k], keys[k]))

    print("...........Edit Distance in Sorted order.............")
    probable.sort(key=lambda item: item[0])  # sort by distance
    for _, message in probable:
        print(message)

    probable_dup.sort(key=lambda item: item[0])  # sort by distance
    mylist1.insert(tk.END, "Probable Duplicates:\n")
    for number, (_, message) in enumerate(probable_dup, start=1):
        mylist1.insert(tk.END, f"{number}: {message}\n")
    mylist1.config(state='disabled')

    print("..........Dataset within the edit distance with unique id.............")
    print(unique_std_att)
    print("Length of unique attribute(edit distance): ", len(unique_std_att))

    duplicates = count_duplicates(unique_std_att, id_column_name)
    mylist.insert(tk.END, "Exact Duplicates:\n")
    for number, description in enumerate(duplicates, start=1):
        mylist.insert(tk.END, f"{number}: {description}\n")
    mylist.config(state='disabled')

    lbl_data = Label(window, text=f"Number of data in dataset: {len(std_att)}")
    lbl_data.pack()

    lbl_comparison = Label(window, text=f"Number of Comparison: {comparison}")
    lbl_comparison.pack()

# NOTE(review): the timer starts before the window is created and stops only
# after mainloop() returns, so the reported value covers the whole GUI
# session (including time the user spends idle), not just the processing.
start_time = time.time()

# Create the main window with its title label and upload button.
window = tk.Tk()
window.geometry("1000x800")
window.title("Data Redundancy Detection System")
title_lbl = Label(window, text="Upload .csv file", font=("arial", 30, "italic bold"), bd=7)
title_lbl.pack()
# The button starts the load / select / compare workflow.
button = Button(window, text="Upload File", relief=tk.RAISED, font=("arial", 15, "bold"), width=15, bg="light blue", command=upload_file)
button.pack()

# file_name is set by upload_file() to the chosen path and displayed below.
# NOTE(review): file_data and my_file are not referenced anywhere else in the
# visible code - confirm whether they are still needed.
file_name = StringVar()
file_data = StringVar()
my_file = StringVar()

lbl_name = Label(window, textvariable=file_name)
lbl_name.pack()

# Hand control to Tk; this call returns only when the main window is closed.
window.mainloop()

end_time = time.time()
processing_time = end_time - start_time
print(f"\n \n \n Processing time: {processing_time} seconds")


  df = pd.read_csv(file, infer_datetime_format=True, encoding='ISO-8859-1')
  df = pd.read_csv(file, infer_datetime_format=True, encoding='ISO-8859-1')


............Dataset..........
ID: 1, std_att: 55MaleBlouse
ID: 2, std_att: 19MaleSweater
ID: 3, std_att: 50MaleJeans
ID: 4, std_att: 21MaleSandals
ID: 5, std_att: 45MaleBlouse
ID: 6, std_att: 46MaleSneakers
ID: 7, std_att: 63MaleShirt
ID: 8, std_att: 27MaleShorts
ID: 9, std_att: 26MaleCoat
ID: 10, std_att: 57MaleHandbag
ID: 11, std_att: 53MaleShoes
ID: 12, std_att: 30MaleShorts
ID: 13, std_att: 61MaleCoat
ID: 14, std_att: 65MaleDress
ID: 15, std_att: 64MaleCoat
ID: 16, std_att: 64MaleSkirt
ID: 17, std_att: 25MaleSunglasses
ID: 18, std_att: 53MaleDress
ID: 19, std_att: 52MaleSweater
ID: 20, std_att: 66MalePants
ID: 21, std_att: 21MalePants
ID: 22, std_att: 31MalePants
ID: 23, std_att: 56MalePants
ID: 24, std_att: 31MalePants
ID: 25, std_att: 18MaleJacket
ID: 26, std_att: 18MaleHoodie
ID: 27, std_att: 38MaleJewelry
ID: 28, std_att: 56MaleShorts
ID: 29, std_att: 54MaleHandbag
ID: 30, std_att: 31MaleDress
ID: 31, std_att: 57MaleJewelry
ID: 32, std_att: 33MaleDress
ID: 33, std_att: 36MaleJa