## Install Dependencies

In [1]:
# !pip install docx2txt
# !pip install sentence_transformers
# !pip install PyPDF2
# hashlib is part of the Python standard library; no pip install is needed

## Import packages

In [2]:
#Importing the required packages

import os
import hashlib
from shutil import move

from sentence_transformers import SentenceTransformer
import PyPDF2
import docx2txt

import pickle

  from .autonotebook import tqdm as notebook_tqdm


## SET PATH

In [3]:
# Define directory paths
# document_folder is the folder scanned by sha_algo() and LLM_algo();
# duplicates_folder is where detected duplicates are moved.
# NOTE(review): these are absolute, machine-specific paths — edit them before running elsewhere.
# The duplicates folder sits inside the document folder; the scans below only
# process entries that pass os.path.isfile(), so the sub-folder itself is skipped.
document_folder = r"C:\Users\ankit\Downloads\Duplicate files\Files"
duplicates_folder = r"C:\Users\ankit\Downloads\Duplicate files\Files\Duplicate_files"

# Sentence-embedding model used to encode extracted document text.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

#Define similarity score
# Threshold for the dot product of two embeddings (encoded with
# normalize_embeddings=True); a score above it marks a file as a duplicate.
cosine_score=0.9

### Defining the hashing, file-moving, and text-extraction helpers

In [4]:
# Defining the SHA Algorithm
def create_hash_database(folder_path):
    """Build a ``filename -> (md5, sha256)`` map for the regular files in a folder.

    Only direct children of ``folder_path`` that pass ``os.path.isfile`` are
    hashed; sub-directories are ignored.

    Args:
        folder_path: Directory whose files should be hashed.

    Returns:
        dict mapping each file name (not the full path) to a tuple of the
        hexadecimal MD5 digest and the hexadecimal SHA-256 digest of its bytes.

    A file that disappears between listing and opening is reported with a
    printed message and skipped.
    """
    hash_database = {}
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            try:
                with open(file_path, 'rb') as f:
                    # Read the content once and hash the same bytes twice. Calling
                    # f.read() a second time returns b"" because the handle is
                    # already at end-of-file, which would make every SHA-256
                    # the digest of empty input.
                    data = f.read()
                md5_hash = hashlib.md5(data).hexdigest()
                sha256_hash = hashlib.sha256(data).hexdigest()
                hash_database[filename] = (md5_hash, sha256_hash)
            except FileNotFoundError:
                print(f"Error: File '{filename}' not found. Hence skipping...")
    return hash_database



def check_duplicate(filename, hash_database):
    """Tell whether ``filename`` is already a key of ``hash_database``.

    Args:
        filename: Name to look up.
        hash_database: Mapping keyed by file name.

    Returns:
        True when the name is present, False otherwise.
    """
    return filename in hash_database

def move_duplicate_llm(filename,duplicate_file,similarity_score, source_folder, destination_folder):
    """Move a file flagged as a near-duplicate and report what it matched.

    Args:
        filename: Name of the file to move (relative to ``source_folder``).
        duplicate_file: Name of the file it was found similar to (only reported).
        similarity_score: Similarity value that triggered the move (only reported).
        source_folder: Folder currently holding ``filename``.
        destination_folder: Folder that receives the file; created if missing.

    Any failure is printed rather than raised, so a single bad file does not
    stop the surrounding scan.
    """
    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(destination_folder, filename)
    try:
        # Without the destination folder every move fails; create it up front.
        os.makedirs(destination_folder, exist_ok=True)
        move(source_path, destination_path)
        print(f"Duplicate '{filename}' moved to duplicate folder with similarity score of {similarity_score} with file {duplicate_file}")
    except Exception as e:
        print(f"Error moving '{filename}': {e}")
        

def move_duplicate(filename, source_folder, destination_folder):
    """Move a file flagged as an exact duplicate into the duplicates folder.

    Args:
        filename: Name of the file to move (relative to ``source_folder``).
        source_folder: Folder currently holding ``filename``.
        destination_folder: Folder that receives the file; created if missing.

    Any failure is printed rather than raised, so a single bad file does not
    stop the surrounding scan.
    """
    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(destination_folder, filename)
    try:
        # Without the destination folder every move fails; create it up front.
        os.makedirs(destination_folder, exist_ok=True)
        move(source_path, destination_path)
        print(f"Duplicate '{filename}' moved to duplicate folder.")
    except Exception as e:
        print(f"Error moving '{filename}': {e}")

def extract_text(path):
    """Return the text of a ``.docx`` or ``.pdf`` file, or ``""`` for anything else.

    The file type is decided from whatever follows the last dot in ``path``
    (case-sensitive). DOCX files are read with ``docx2txt``; PDF files are
    read page by page with ``PyPDF2`` and the page texts are concatenated.

    Args:
        path: Path of the document to read.

    Returns:
        The extracted text, or an empty string for unsupported extensions.
    """
    # Everything after the last dot; the whole path when it contains no dot.
    suffix = path.rsplit('.', 1)[-1]

    if suffix == 'docx':
        return docx2txt.process(path)

    if suffix != 'pdf':
        return ""

    with open(path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        return "".join(page.extract_text() for page in reader.pages)

In [5]:
def create_embd_database(folder_path, destination_folder):
    """Embed every PDF/DOCX in a folder, moving near-duplicates out as it goes.

    Each ``.pdf``/``.docx`` file in ``folder_path`` is converted to text with
    ``extract_text`` and encoded with the module-level ``model``. The embedding
    is compared (dot product) with every embedding accepted so far. On the
    first score above the module-level ``cosine_score`` the file is moved to
    ``destination_folder`` and is NOT added to the database; otherwise its
    embedding is stored.

    Args:
        folder_path: Folder to scan.
        destination_folder: Folder that receives files detected as duplicates.

    Returns:
        dict mapping the name of each retained file to its embedding.

    Failures for an individual file are printed and that file is skipped.
    """
    embd_database = {}
    for filename in os.listdir(folder_path):
        if not filename.endswith((".pdf", ".docx")):
            continue
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            text = extract_text(file_path)
            embedding = model.encode(text, normalize_embeddings=True)

            for key, value in embd_database.items():
                similarity_score = value @ embedding.T
                if similarity_score > cosine_score:
                    move_duplicate_llm(filename, key, similarity_score, folder_path, destination_folder)
                    # Stop at the first match: the file is gone from the folder,
                    # so it must not be moved again or stored in the database.
                    break
            else:
                # for/else: reached only when no stored embedding matched.
                embd_database[filename] = embedding
        except FileNotFoundError:
            print(f"Error: File '{filename}' not found. Hence skipping...")
        except Exception as e:
            # Covers text extraction and encoding failures, not just moves.
            print(f"Error processing '{filename}': {e}")
    return embd_database

In [6]:
def sha_algo():
    """Detect byte-identical files in ``document_folder`` and move the duplicates.

    Steps:
      1. Load ``hash_database.txt`` from the working directory, or build it
         with ``create_hash_database`` and write it when it does not exist.
      2. Hash every regular file in ``document_folder`` (MD5 and SHA-256) and
         record it under its file name.
      3. If another recorded file that is still present in the folder has the
         same MD5, move that other file to ``duplicates_folder`` and drop its
         entry, so the database keeps describing files that remain in place.
         At most one file is moved per scanned file.
      4. Write the updated database back to ``hash_database.txt``.

    Reads the module-level ``document_folder`` and ``duplicates_folder``.
    Returns nothing; progress and errors are printed.
    """
    import ast  # local import: only needed to parse the persisted database

    db_path = "hash_database.txt"

    # Load the existing hash database or create a new one.
    if os.path.exists(db_path):
        with open(db_path, "r") as f:
            # The file holds the str() of a dict of string tuples. literal_eval
            # parses exactly that and, unlike eval(), cannot run code placed
            # in a tampered file.
            hash_database = ast.literal_eval(f.read())
    else:
        hash_database = create_hash_database(document_folder)
        with open(db_path, "w") as f:
            f.write(str(hash_database))

    for filename in os.listdir(document_folder):
        file_path = os.path.join(document_folder, filename)
        if not os.path.isfile(file_path):
            # Sub-folders, and files already moved earlier in this loop.
            continue
        try:
            with open(file_path, "rb") as f:
                # Read once: a second f.read() would return b"" and produce
                # the SHA-256 of empty input for every file.
                data = f.read()
        except FileNotFoundError:
            print(f"Error: File'{filename}' not found. Skipping...")
            continue

        md5_hash = hashlib.md5(data).hexdigest()
        sha256_hash = hashlib.sha256(data).hexdigest()
        hash_database[filename] = (md5_hash, sha256_hash)

        # First other entry with the same MD5 that still exists on disk.
        # Entries left over from files moved in a previous run are ignored,
        # otherwise the move would fail and the real duplicate would stay put.
        duplicate_name = next(
            (
                name
                for name, hashes in hash_database.items()
                if name != filename
                and hashes[0] == md5_hash
                and os.path.isfile(os.path.join(document_folder, name))
            ),
            None,
        )
        if duplicate_name is not None:
            move_duplicate(duplicate_name, document_folder, duplicates_folder)
            del hash_database[duplicate_name]

    with open(db_path, "w") as f:
        f.write(str(hash_database))
    print("Hash database updated.")
    


In [7]:
def LLM_algo():
    """Detect near-duplicate PDF/DOCX files by embedding similarity.

    Steps:
      1. Load ``embd_database.pkl`` from the working directory, or build it
         with ``create_embd_database`` and save it when it does not exist.
      2. For every ``.pdf``/``.docx`` in ``document_folder`` that is not yet in
         the database, extract its text, encode it with the module-level
         ``model`` and compare it (dot product) with every known embedding,
         including files accepted earlier in this same run.
      3. On the first score above ``cosine_score`` the file is moved to
         ``duplicates_folder`` and not recorded; otherwise it is recorded.
      4. Save the merged database back to ``embd_database.pkl``.

    Reads the module-level ``document_folder``, ``duplicates_folder``,
    ``model`` and ``cosine_score``. Returns nothing; progress and per-file
    errors are printed.
    """
    print('LLM model is running....')

    new_files = {}
    if os.path.exists("embd_database.pkl"):
        with open("embd_database.pkl", "rb") as file:
            # NOTE(review): pickle.load can execute arbitrary code; this is
            # only safe while embd_database.pkl is written by this notebook.
            embd_database = pickle.load(file)
    else:
        print("Creating Database")
        # The duplicates folder is passed because the initial build already
        # moves any duplicates it finds.
        embd_database = create_embd_database(document_folder, duplicates_folder)
        with open("embd_database.pkl", "wb") as file:
            pickle.dump(embd_database, file)

    for filename in os.listdir(document_folder):
        if not filename.endswith((".pdf", ".docx")):
            continue
        file_path = os.path.join(document_folder, filename)
        if not os.path.isfile(file_path):
            continue
        if filename in embd_database:
            # Already indexed: it is never treated as a duplicate and its
            # stored embedding wins on merge, so re-encoding would be wasted.
            continue
        try:
            text = extract_text(file_path)
            embedding = model.encode(text, normalize_embeddings=True)

            # Compare with stored files and with files accepted in this run.
            known = {**embd_database, **new_files}
            for existing_filename, existing_embd in known.items():
                similarity_score = existing_embd @ embedding.T
                if similarity_score > cosine_score:
                    move_duplicate_llm(filename, existing_filename, similarity_score, document_folder, duplicates_folder)
                    break  # only move one duplicate per new document
            else:
                # for/else: no match at all (also covers an empty database),
                # so the file is kept and indexed.
                new_files[filename] = embedding
        except FileNotFoundError:
            print(f"Error: File'{filename}' not found. Skipping...")
        except Exception as e:
            # One unreadable document must not abort the scan before the
            # database is saved.
            print(f"Error processing '{filename}': {e}")

    embd_database = {**new_files, **embd_database}

    # Save the embedding dictionary as a pickle file.
    with open("embd_database.pkl", "wb") as file:
        pickle.dump(embd_database, file)
    print("Completed!")

In [8]:
if __name__ == "__main__":
    # Run the exact-match (hash) pass first, then the embedding-similarity pass.
    sha_algo()
    LLM_algo()

Duplicate 'Kupdf_net_irctc_ticket_format - Copy.pdf' moved to duplicate folder.
Duplicate 'Machine learning - Wikipedia_1page.pdf' moved to duplicate folder.
Hash database updated.
LLM model is running....
Creating Database
Duplicate 'Hugging Face - Copy (2).docx' moved to duplicate folder with similarity score of 0.9999999403953552 with file Hugging Face - Copy (2) - Copy.docx
Completed!


In [None]:
#Pickle file saved 

In [9]:
# Reload the embedding database that LLM_algo() saved, to inspect its contents.
# NOTE(review): pickle.load can execute arbitrary code — only load files this notebook wrote.
with open("embd_database.pkl", "rb") as file:
            embd_database = pickle.load(file)
# Displaying the dict prints every stored embedding array in full.
embd_database

{'Hugging Face - Copy (2) - Copy.docx': array([-6.44911006e-02, -3.28525119e-02, -5.84419854e-02, -8.45637396e-02,
        -4.81047966e-02,  1.51398061e-02,  2.30619702e-02, -7.87695572e-02,
         4.12523700e-03, -3.33901122e-02,  3.80569249e-02, -7.90949017e-02,
         4.25057746e-02,  3.94689012e-03, -1.81123503e-02,  3.88245061e-02,
         5.16425855e-02,  4.86485772e-02, -7.08298534e-02, -4.06162627e-02,
         5.05968295e-02,  4.32929099e-02,  5.41223660e-02, -1.16621656e-03,
         9.20810271e-03, -3.74906720e-03, -2.09698849e-03, -6.43281490e-02,
         9.85889435e-02, -3.89200114e-02,  3.11710574e-02, -3.88246146e-03,
         4.48776446e-02,  4.29849140e-02,  2.62836693e-03,  3.56246382e-02,
        -3.22245248e-02, -1.38986465e-02,  3.25993225e-02, -2.48916559e-02,
        -2.73086876e-02,  4.76000905e-02,  1.35237584e-02, -3.53948288e-02,
         3.44009176e-02, -7.21533895e-02, -6.38202429e-02,  5.81247620e-02,
        -7.66088068e-02,  9.01801512e-02,  3.2544