In [5]:
import os
import hashlib
import json
from tqdm import tqdm

def calculate_md5(file_path):
    """Calculate the MD5 hash of a file."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def list_files_and_hashes(directory):
    """List all files in the directory and calculate their MD5 hash."""
    files_data = []
    total_files = sum([len(files) for _, _, files in os.walk(directory)])
    print(f"Total files to process: {total_files}")

    for root, dirs, files in tqdm(os.walk(directory), desc="Scanning directories"):
        for file in tqdm(files, desc=f"Processing files in {root}", leave=False):
            file_path = os.path.join(root, file)
            if os.path.islink(file_path):
                continue
            file_hash = calculate_md5(file_path)
            files_data.append({
                'path': file_path,
                'hash': file_hash,
                'size': os.path.getsize(file_path),
                'last_modified': os.path.getmtime(file_path)
            })
    return files_data

def save_data_to_json(data, filename):
    """Save the file data to a JSON file."""
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)
    print(f"File data saved to {filename}")

def load_data_from_json(filename):
    """Load file data from a JSON file."""
    with open(filename, 'r') as f:
        return json.load(f)

def detect_duplicates(files_data):
    """Detect duplicate files based on their MD5 hash."""
    hash_dict = {}
    duplicates = []
    for file in tqdm(files_data, desc="Detecting duplicates"):
        file_hash = file['hash']
        if file_hash in hash_dict:
            duplicates.append((hash_dict[file_hash], file))
        else:
            hash_dict[file_hash] = file
    return duplicates

def main():
    directory = "/Users/arunpatro/My Drive"  # Input directory
    output_file = "file_data.json"

    print(f"Scanning directory: {directory}")
    
    # List files and calculate their MD5 hash
    files_data = list_files_and_hashes(directory)

    # Save data to JSON
    save_data_to_json(files_data, output_file)

    # Load data from JSON
    loaded_data = load_data_from_json(output_file)

    # Detect duplicates
    print("Detecting duplicate files...")
    duplicates = detect_duplicates(loaded_data)
    
    if duplicates:
        print("Duplicate files found:")
        for dup in duplicates:
            print(f"Original: {dup[0]['path']} <--> Duplicate: {dup[1]['path']}")
    else:
        print("No duplicates found.")

In [6]:
main()

Scanning directory: /Users/arunpatro/My Drive
Total files to process: 19322


Scanning directories: 695it [01:02, 11.19it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/Users/arunpatro/My Drive/IIT - KGP/AGV/AGV Treat/IMG_2615.JPG'