In [1]:
import os
import codecs
def count_utf8(path):
    """Return the number of lines in the UTF-8 encoded text file at ``path``.

    Parameters
    ----------
    path : str
        Location of the file to count.

    Returns
    -------
    int
        Number of lines read from the file, or ``0`` when ``path`` does not
        refer to an existing regular file (missing path, directory, ...).

    Raises
    ------
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # Guard first: previously a non-file path fell through to ``return count``
    # with ``count`` never assigned, which raised UnboundLocalError.
    if not os.path.isfile(path):
        return 0
    # codecs.open + readlines() is kept on purpose so the count agrees with
    # other code in this notebook that reads files the same way.
    with codecs.open(path, 'r', 'utf-8') as infile:
        return len(infile.readlines())

In [2]:
# Count the lines of the same-named report file under the CMP1 and CMP2
# directories and print each count.
badpath1 = "C:/LOGS/2024_07_JUL/JULY_10_CMP1/outbound/report/07102024/MA/MA_20240710.dat"
print(count_utf8(badpath1))
badpath2 = "C:/LOGS/2024_07_JUL/JULY_10_CMP2/outbound/report/07102024/MA/MA_20240710.dat"
print(count_utf8(badpath2))

44
44


In [3]:
## Function to recursively copy a directory, and process each file in destination directory.

import shutil
import os
def process_tree(src, dest, func=None, verbose=False):
    """Mirror the directory ``src`` into ``dest`` and post-process each copy.

    ``dest`` is created if needed.  Every file found in ``src`` is copied with
    ``shutil.copy`` and, when ``func`` is truthy, ``func`` is called with the
    path of the copy.  Every sub-directory is handled by a recursive call.

    Parameters
    ----------
    src : str
        Directory to read from.
    dest : str
        Directory to copy into.
    func : callable, optional
        Called as ``func(copied_path)`` after each file is copied.
    verbose : bool, optional
        When true, print a ``File: ``/``Dir: `` line for every entry visited.
    """
    def _report(prefix, name):
        # Progress output is emitted only in verbose mode.
        if verbose:
            print(prefix + name)

    os.makedirs(dest, exist_ok=True)
    with os.scandir(src) as listing:
        for item in listing:
            source_path = os.path.join(src, item.name)
            target_path = os.path.join(dest, item.name)

            if item.is_file():
                _report('File: ', item.name)
                shutil.copy(source_path, target_path)
                if func:
                    func(target_path)

            if item.is_dir():
                _report('Dir: ', item.name)
                process_tree(source_path, target_path, func, verbose)

In [4]:
## Generator yielding the path of every entry (files and sub-directories) directly inside a directory; it does not recurse.

def walk_dir_gen(src):
    """Lazily yield the path of each entry directly inside ``src``.

    Both files and sub-directories are yielded, each as
    ``os.path.join(src, name)``.  The listing is not recursive and follows
    whatever order ``os.scandir`` produces.
    """
    with os.scandir(src) as listing:
        yield from (os.path.join(src, item.name) for item in listing)

In [5]:
testpath = "C:/LOGS/2024_08_AUG/AUG_20_CMP1"

# Print the immediate children of testpath (walk_dir_gen does not recurse).
for src in walk_dir_gen(testpath):
    print(src)

C:/LOGS/2024_08_AUG/AUG_20_CMP1\outbound


In [6]:
from collections import deque

def walk_tree_gen(src):
    """Yield the absolute path of every file found at or below ``src``.

    The traversal is iterative and first-in/first-out: paths are taken from
    the front of a queue, files are yielded, and the children of each
    directory (as listed by ``walk_dir_gen``) are added to the back.  Paths
    that are neither a file nor a directory are dropped without being yielded.
    """
    pending = deque([os.path.abspath(src)])

    while pending:
        current = pending.popleft()
        if os.path.isfile(current):
            yield current
        elif os.path.isdir(current):
            # Queue the directory's children for later visits.
            pending.extend(walk_dir_gen(current))

# Print every file that walk_tree_gen finds under testpath
# (testpath is assigned in an earlier cell).
for file_1 in walk_tree_gen(testpath):
    print(f'    FILE: {file_1}')

In [7]:
# Same two paths that the earlier count_utf8 check cell assigns; re-declared
# here so the cells below can use them.
badpath1 = "C:/LOGS/2024_07_JUL/JULY_10_CMP1/outbound/report/07102024/MA/MA_20240710.dat"
badpath2 = "C:/LOGS/2024_07_JUL/JULY_10_CMP2/outbound/report/07102024/MA/MA_20240710.dat"

In [8]:
import re
import Levenshtein as lv
import pandas as pd

def get_lines_match(path1, path2):
    """Compare two UTF-8 text files line by line.

    Both files are read completely and must contain the same number of lines
    (checked with ``assert``).  Lines at the same position are compared; for
    each pair that differs the Levenshtein distance is computed.

    Parameters
    ----------
    path1, path2 : str
        Files to compare.

    Returns
    -------
    tuple
        ``(lines_match, sum_dist, max_dist)``: the number of identical line
        pairs, the sum of the distances of the differing pairs, and the
        largest single distance (0 when every line matches).
    """
    def _load(path):
        with codecs.open(path, 'r', 'utf-8') as handle:
            return handle.readlines()

    first = _load(path1)
    second = _load(path2)
    assert len(first) == len(second)

    identical = 0
    total_distance = 0
    worst_distance = 0
    for left, right in zip(first, second):
        if left == right:
            identical += 1
            continue
        gap = lv.distance(left, right)
        total_distance += gap
        worst_distance = max(worst_distance, gap)
    return (identical, total_distance, worst_distance)
        
   

In [9]:
# Result is (lines_match, sum_dist, max_dist) for the two files.
get_lines_match(badpath1, badpath2)

(43, 1, 1)

In [10]:
import re
import Levenshtein as lv
import pandas as pd

def compare_trees(src1, src2):
    """Compare same-named files found while walking two directory trees in step.

    Both trees are traversed with ``walk_tree_gen`` and the two file streams
    are merged: while the current file names are equal both streams advance
    together; otherwise only the stream whose path (with everything up to the
    last ``outbound`` directory removed) sorts lower is advanced.  The merge
    therefore assumes both walks list files in the same relative order.

    A pair with equal file names and equal line counts is compared with
    ``get_lines_match`` and recorded.  Any other pair is printed and skipped.

    Parameters
    ----------
    src1, src2 : str
        Root directories of the two trees.

    Returns
    -------
    pandas.DataFrame
        One row per compared pair with the columns ``path1``, ``path2``,
        ``file``, ``lines``, ``lines_match``, ``sum_lv_distance`` and
        ``max_lv_distance``.  Empty when nothing was compared.

    Notes
    -----
    The first exception raised while walking or comparing is printed as
    ``Error: ...`` and ends the comparison; rows collected up to that point
    are still returned.
    """
    pf1 = walk_tree_gen(src1)
    pf2 = walk_tree_gen(src2)
    path1 = path2 = None
    f1 = f2 = None
    data = []

    while True:
        try:
            if f1 == f2:
                # In sync (also true on the first pass, None == None):
                # step both streams.
                path1 = next(pf1, None)
                path2 = next(pf2, None)
            else:
                # Out of sync: advance only the side that is "behind".
                # Either path separator is accepted after "outbound".
                local_dir1 = re.sub(r'^.*outbound[\\/]', '', path1)
                local_dir2 = re.sub(r'^.*outbound[\\/]', '', path2)
                print(f'{local_dir1=}')
                print(f'{local_dir2=}')
                if local_dir1 < local_dir2:
                    path1 = next(pf1, None)
                else:
                    path2 = next(pf2, None)
            if path1 is None or path2 is None:
                # One tree is exhausted; nothing left to pair up.
                break
            f1 = os.path.basename(path1)
            f2 = os.path.basename(path2)
            # Each count must come from its own file; counting path2 twice
            # made the line-count guard below always pass.
            lines1 = count_utf8(path1)
            lines2 = count_utf8(path2)

            if f1 == f2 and lines1 == lines2:
                lines_match, sum_dist, max_dist = get_lines_match(path1, path2)

                record = {'path1': path1, 'path2': path2, 'file': f1, 'lines': lines1, 'lines_match': lines_match,
                          'sum_lv_distance': sum_dist, 'max_lv_distance': max_dist}
                data.append(record)
            else:
                print(f'''{f1=} {f2=} {lines1=} {lines2=}
                {path1=}
                {path2=}''')

        except Exception as e:
            print('Error: ' + repr(e))
            break
    return pd.DataFrame(data)


        

In [11]:
# Compare the two trees. compare_trees prints the pairs it could not match
# and returns one row per compared pair; show the first rows.
src1 = "C:/LOGS/2024_11_NOV/Nov25_CMP1"
src2 = "C:/LOGS/2024_11_NOV/Nov25_CMP2"
df = compare_trees(src1, src2)
df.head()

f1='ZeroReport_20241126_0_MT.dat' f2='NJ_20241126.dat' lines1=10 lines2=10
                path1='C:\\LOGS\\2024_11_NOV\\Nov25_CMP1\\outbound\\report\\11262024\\MT\\ZeroReport_20241126_0_MT.dat'
                path2='C:\\LOGS\\2024_11_NOV\\Nov25_CMP2\\outbound\\report\\11262024\\NJ\\NJ_20241126.dat'
local_dir1='report\\11262024\\MT\\ZeroReport_20241126_0_MT.dat'
local_dir2='report\\11262024\\NJ\\NJ_20241126.dat'


Unnamed: 0,path1,path2,file,lines,lines_match,sum_lv_distance,max_lv_distance
0,C:\LOGS\2024_11_NOV\Nov25_CMP1\outbound\report...,C:\LOGS\2024_11_NOV\Nov25_CMP2\outbound\report...,NY_20241125.dat,764,762,14,7
1,C:\LOGS\2024_11_NOV\Nov25_CMP1\outbound\report...,C:\LOGS\2024_11_NOV\Nov25_CMP2\outbound\report...,ZeroReport_20241125_1502_NY.dat,11,8,17,7
2,C:\LOGS\2024_11_NOV\Nov25_CMP1\outbound\report...,C:\LOGS\2024_11_NOV\Nov25_CMP2\outbound\report...,ZeroReport_20241125_201416_NY.dat,11,8,19,8
3,C:\LOGS\2024_11_NOV\Nov25_CMP1\outbound\report...,C:\LOGS\2024_11_NOV\Nov25_CMP2\outbound\report...,ZeroReport_20241125_900_NY.dat,11,8,19,8
4,C:\LOGS\2024_11_NOV\Nov25_CMP1\outbound\report...,C:\LOGS\2024_11_NOV\Nov25_CMP2\outbound\report...,ZeroReport_20241125_CSR1_NY.dat,11,8,17,7


In [12]:
len(df)

1797

In [13]:
# NOTE(review): this handle is never closed. No encoding is passed, and the
# repr below shows the default used here is cp1252, whereas the helper
# functions in this notebook read files as UTF-8.
open(badpath1)

<_io.TextIOWrapper name='C:/LOGS/2024_07_JUL/JULY_10_CMP1/outbound/report/07102024/MA/MA_20240710.dat' mode='r' encoding='cp1252'>

In [14]:
import Levenshtein as lv

# Sanity check of lv.distance on a known pair of words.
str1 = "kitten"
str2 = "sitting"
distance = lv.distance(str1, str2)
distance

3

In [15]:
# Second sanity check: the two words differ in one character.
str1 = "sat"
str2 = "set"
distance = lv.distance(str1, str2)
distance

1

In [16]:
 max(df['max_lv_distance'])

40

In [17]:
df[df['max_lv_distance'] > 30]

Unnamed: 0,path1,path2,file,lines,lines_match,sum_lv_distance,max_lv_distance
1371,C:\LOGS\2024_11_NOV\Nov25_CMP1\outbound\report...,C:\LOGS\2024_11_NOV\Nov25_CMP2\outbound\report...,ZeroReport_20241126_123_MI.dat,12,9,59,40


In [24]:
df[df['max_lv_distance'] > 30]['path1'].values



array(['C:\\LOGS\\2024_11_NOV\\Nov25_CMP1\\outbound\\report\\11262024\\MI\\ZeroReport_20241126_123_MI.dat'],
      dtype=object)

In [18]:
df[df['max_lv_distance'] > 30]

Unnamed: 0,path1,path2,file,lines,lines_match,sum_lv_distance,max_lv_distance
1371,C:\LOGS\2024_11_NOV\Nov25_CMP1\outbound\report...,C:\LOGS\2024_11_NOV\Nov25_CMP2\outbound\report...,ZeroReport_20241126_123_MI.dat,12,9,59,40


In [19]:
# path1 of the first row whose max_lv_distance exceeds 30.
d1 = df[df['max_lv_distance'] > 30]['path1'].values[0]
d1

'C:\\LOGS\\2024_11_NOV\\Nov25_CMP1\\outbound\\report\\11262024\\MI\\ZeroReport_20241126_123_MI.dat'

In [20]:
# Largest count of non-identical lines in any compared file
# (lines minus lines_match).
max(abs(df['lines'] - df['lines_match']))

3