In [3]:
#  Standard Imports
import os
import shutil
import filecmp
import datetime
import pathlib
import hashlib

import pandas as pd

from collections import Counter
# Testing
# import py_workspace_util as pwu

In [None]:
def _md5_of_file(file_path, chunk_size=1024 * 1024):
    """Return the md5 hex digest of the file at `file_path`, reading it in chunks."""
    digest = hashlib.md5()
    # The context manager guarantees the handle is closed, and chunked
    # reads keep memory usage flat for large files
    with open(file_path, "rb") as file_handle:
        for chunk in iter(lambda: file_handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def dir_duplicate_check(path):
    """
    Parameters:
    -----------
    path : path to the directory to be checked   : str : :

    Description:
    ------------
    Loops through all regular files directly inside the `path` directory
    (sub-directories are skipped) and uses a md5 hash of each file's content
    to check for duplicates. No files are modified.

    Returns:
    --------
    None. Writes a record file in `logs/` (relative to the current working
    directory, created if missing) with the date, time, and function name in
    the title of the file; prints the names of the duplicated files.

    The duplicates list in the record file and the printed list will be empty
    if no duplicates are found or if `path` is not a directory.

    """
    # One timestamp for the whole run so that the file name and the
    # log contents can never disagree. The date is YYYY-MM-DD and the
    # time is HH_MM; the time prevents files being overridden
    now      = datetime.datetime.now()
    today    = now.strftime('%Y-%m-%d')
    run_time = now.strftime('%H_%M')
    # Mapping of file name -> md5 hex digest
    file_dict = dict()
    # Validating the path of files to be checked
    if os.path.isdir(path):
        # Looping through the directory in a stable (sorted) order
        for file in sorted(os.listdir(path)):
            # Creating a full filepath for each entry
            full_path = os.path.join(path, file)
            # os.listdir also returns sub-directories, which cannot be hashed
            if not os.path.isfile(full_path):
                continue
            file_dict[file] = _md5_of_file(full_path)
    else:
        print(f">> {path} Is Not A Valid Directory - Nothing To Check")
    # Counting how many files share each hash value
    count_dict = Counter(file_dict.values())
    # A hash seen more than once means the files holding it are duplicates
    results    = [key for key, value in file_dict.items() if count_dict[value] > 1]
    # The record is written to logs/, which may not exist yet
    os.makedirs("logs", exist_ok=True)
    # No files are being modified, but it is good to have a record for posterity
    with open(f"logs/{today}-{run_time}-dir_duplicate_check-record.txt", "w") as py_logger:
        dt_now = now.strftime("%Y-%m-%d %H:%M:%S")
        py_logger.write(f"Day          : {today} @ {now.strftime('%H:%M')} \n")
        py_logger.write(f"Analyst      : To be added \n")
        py_logger.write(f"Directory    : {path} \n")
        py_logger.write(f"Function Run : dir_duplicate_check() \n\n")
        # Will be empty if there are no duplicates
        py_logger.write(f">> {dt_now}: Duplicates = {results} \n\n")
    # Print message for the user
    print(f">> Checking {path} For Duplicates", sep="\n")
    print(f">> No File Names Will Be Printed If There Are None", sep="\n")
    print(f">> -----------------------------------------------", sep="\n")
    print(*results, sep="\n")

In [None]:
# Run the duplicate check against the Sample folder; prints any duplicated
# file names and writes a record file under logs/
dir_duplicate_check(path="C:/Users/andre/Documents/Sample")

In [None]:
# Same check against a second folder (the path contains spaces and an ampersand)
dir_duplicate_check(path = "C:/Users/andre/Documents/Udemy/R For Stats & DS/Data")

In [None]:
# Scratch version of the hashing step used by dir_duplicate_check():
# builds `unique`, a {file name: md5 hex digest} mapping for `path`
path = "C:/Users/andre/Documents/Sample"

unique = dict()

for file in os.listdir(path):
    full_path = os.path.join(path, file)
    # os.listdir also returns sub-directories, which cannot be opened for reading
    if not os.path.isfile(full_path):
        continue
    # The context manager closes the handle as soon as the file has been hashed
    with open(full_path, "rb") as file_handle:
        file_hash = hashlib.md5(file_handle.read()).hexdigest()
    unique[file] = file_hash

In [None]:
# Flat list of every md5 digest held in `unique`, so occurrences can be counted
values_list = [*unique.values()]

In [None]:
# Print every digest that occurs more than once in `values_list`;
# a repeated digest is printed once per file that carries it
for value in unique.values():
    is_repeated = values_list.count(value) > 1
    if not is_repeated:
        continue
    print(value)

In [None]:
# Count how often each md5 digest occurs in `unique` (the name -> digest
# mapping built in the hashing cell), then keep the file names whose digest
# is shared with at least one other file
count_dict = Counter(unique.values())
result = [key for key, value in unique.items() if count_dict[value] > 1]

In [None]:
# Current local time formatted as HH:MM
datetime.datetime.now().strftime("%H:%M")

In [None]:
# Current local time as HH:MM, kept in a variable and echoed as the cell output
run_time = datetime.datetime.now().strftime("%H:%M")
run_time

In [19]:
# Folder to tidy up (note: the name `dir` shadows the built-in of the same name)
dir      = "C:/Users/andre/Documents/Sample"
# Snapshot of the folder's entries at the moment this cell runs
files    = os.listdir(dir)
# File extensions treated as data files, and the sub-folder they are moved into
data_ext = [".csv", ".json"]
data_dir = "data"

In [26]:
# Make sure the destination folder exists before anything is moved into it
target_dir = os.path.join(dir, data_dir)
os.makedirs(target_dir, exist_ok=True)

for file in files:
    # Only files with one of the data extensions are moved
    if not file.endswith(tuple(data_ext)):
        continue
    source = os.path.join(dir, file)
    # `files` is a snapshot taken in an earlier cell; skip entries that are
    # no longer there (e.g. already moved by a previous run of this cell)
    if not os.path.exists(source):
        continue
    destination = os.path.join(target_dir, file)
    shutil.move(source, destination)
    print(f"{file} moved to {data_dir}")

json_test_rename_1.json moved to data
name_mapping.csv moved to data
RDS20250827_01.csv moved to data
RDS20250827_02.csv moved to data
RDS20250827_03.csv moved to data
RDS20250827_04.csv moved to data
RDS20250827_05.csv moved to data
RDS20250827_07.json moved to data


In [23]:
# Preview of the destination folder path. os.path.join inserts the OS path
# separator, so on Windows the result mixes "/" from `dir` with a backslash.
os.path.join(dir, data_dir)

'C:/Users/andre/Documents/Sample\\data'

In [8]:
# Sub-folder names - adjust as needed for each target directory
data_dir, code_dir, txt_dir, msft_dir = "data", "code", "txt", "msft"

# Every sub-folder that should exist - adjust as needed for each target directory
directories = [data_dir, code_dir, txt_dir, msft_dir]

# Parent directory that will hold the sub-folders
path = "C:/Users/andre/Desktop/Stuff/Dummy_Files"

# Function to create the folders listed above
def create_new_folders(base_path=None, folders=None):
    """
    Parameters:
    -----------
    base_path : parent directory for the new folders; defaults to the notebook-level `path`  : str  : :
    folders   : folder names to create; defaults to the notebook-level `directories`         : list : :

    Description:
    ------------
    Creates each folder in `folders` inside `base_path`. Anything that already
    exists under a given name is left untouched, and `base_path` itself is
    created if it does not exist yet.

    Returns:
    --------
    None

    """
    # Fall back to the notebook-level configuration so that the original
    # zero-argument call keeps working
    if base_path is None:
        base_path = path
    if folders is None:
        folders = directories
    for directory in folders:
        folder = os.path.join(base_path, directory)
        # Leave existing entries alone
        if os.path.exists(folder):
            continue
        # makedirs also creates a missing parent; exist_ok covers the case
        # where the folder appears between the check above and this call
        os.makedirs(folder, exist_ok=True)

In [9]:
# Create any folders from `directories` that are missing under `path`
create_new_folders()