In [7]:
import os
import re

# Matches one "begin ... end" encoded attachment inside a GRAPHIC/EXCEL/ZIP <TYPE> section.
# Groups: 1 = header text up to and including "begin", 2 = the type keyword,
# 3 = everything between "begin" and the terminator (the part that is removed), 4 = "end".
# The terminator must be "end" at the start of a line with nothing but whitespace after it;
# a bare "end" would stop early at any "end" substring, e.g. in a file name such as
# "trend.jpg" on the "begin" line, and leave the encoded payload behind.
_ENCODED_SECTION_PATTERN = re.compile(
    r'(<TYPE>(GRAPHIC|EXCEL|ZIP).*?<TEXT>\s*begin)(.*?)(^end)(?=[ \t\r]*$)',
    re.DOTALL | re.MULTILINE,
)


# Function to clean the text content by removing specific sections and returning the cleaned text and deleted sections
def clean_text_content(text):
    """Strip the encoded payload from GRAPHIC, EXCEL and ZIP sections of *text*.

    For every match, the text between the ``begin`` marker and the closing
    ``end`` line is dropped (this includes the remainder of the ``begin``
    line itself); the header and an ``end`` line are kept.

    Args:
        text: Full text to clean.

    Returns:
        A 3-tuple ``(cleaned_text, deleted_texts, matches)`` where
        ``cleaned_text`` is *text* with each payload replaced by a newline,
        ``deleted_texts`` is the list of removed payload strings, and
        ``matches`` is the raw ``findall`` result: one
        ``(header, type, payload, "end")`` tuple per section.
    """
    # Find all matches of the pattern
    matches = _ENCODED_SECTION_PATTERN.findall(text)

    # Extract the text to be removed (third capture group)
    deleted_texts = [match[2] for match in matches]

    # Keep the header (group 1) and re-insert the terminator on its own line
    cleaned_text = _ENCODED_SECTION_PATTERN.sub(r'\1\nend', text)

    return cleaned_text, deleted_texts, matches

# Function to read, clean, and save a text file, keeping a copy of every removed section
def process_text_file(file_path, output_cleaned_path, output_deleted_path):
    """Clean one text file and write the cleaned text and the removed sections to disk.

    Reads *file_path*, passes its content through ``clean_text_content``,
    writes the cleaned text to *output_cleaned_path* and each removed section
    (followed by a blank line) to *output_deleted_path*, then prints every
    section as it looked before and after cleaning.

    Args:
        file_path: Path of the text file to read.
        output_cleaned_path: Path the cleaned text is written to.
        output_deleted_path: Path the removed sections are written to.

    Returns:
        ``(output_cleaned_path, output_deleted_path)`` on success, or
        ``(None, None)`` if any step failed; failures are reported on stdout
        rather than raised.
    """
    # latin-1 maps every byte to exactly one character, so reading and writing
    # with it never raises a decode error and round-trips untouched bytes
    # unchanged, independent of the platform's default encoding. The markers
    # the cleaner looks for are plain ASCII, so matching is unaffected.
    encoding = 'latin-1'

    try:
        # Read the text file
        with open(file_path, 'r', encoding=encoding) as file:
            text_content = file.read()

        # Clean the text content
        cleaned_content, deleted_texts, matches = clean_text_content(text_content)

        # Save the cleaned content to a new file
        with open(output_cleaned_path, 'w', encoding=encoding) as cleaned_file:
            cleaned_file.write(cleaned_content)

        # Save the deleted sections to a new file, separated by blank lines
        with open(output_deleted_path, 'w', encoding=encoding) as deleted_file:
            deleted_file.writelines(f"{deleted_text}\n\n" for deleted_text in deleted_texts)

        # Display the sections before and after deletion
        print("Sections Before and After Deletion:")
        for match in matches:
            # match = (header, type, removed payload, "end")
            section_before = f"{match[0]}{match[2]}{match[3]}"
            section_after = f"{match[0]}\n{match[3]}"
            print(f"Section Before:\n{section_before}\n")
            print(f"Section After:\n{section_after}\n")

        return output_cleaned_path, output_deleted_path

    except FileNotFoundError as e:
        # The missing path may be an output location (e.g. a non-existent
        # directory), so report the path the OS actually rejected.
        print(f"File not found: {e.filename or file_path}")
        return None, None
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None, None

# Example usage: clean a single filing and report whether it succeeded
file_path, output_cleaned_path, output_deleted_path = (
    'HOOKER_FURNISHINGS_Corp_2024-04-12_00_00_00.txt',
    'cleaned_text.txt',
    'deleted_text.txt',
)

cleaned_file, deleted_file = process_text_file(
    file_path,
    output_cleaned_path,
    output_deleted_path,
)

# process_text_file returns (None, None) on failure, so both values must be set
if all((cleaned_file, deleted_file)):
    print("Cleaned and deleted sections processed successfully.")


Sections Before and After Deletion:
Section Before:
<TYPE>GRAPHIC
<SEQUENCE>12
<FILENAME>hf_logo.jpg
<TEXT>
begin 644 hf_logo.jpg
M_]C_X  02D9)1@ ! 0$ E@"6  #_X0 B17AI9@  34T *@    @  0$2  ,
M   !  $       #_VP!#  (! 0(! 0(" @(" @(" P4# P,# P8$! ,%!P8'
M!P<&!P<("0L)" @*" <'"@T*"@L,# P,!PD.#PT,#@L,# S_VP!# 0(" @,#
M P8# P8," <(# P,# P,# P,# P,# P,# P,# P,# P,# P,# P,# P,# P,
M# P,# P,# P,# P,# S_P  1" !$ . # 2(  A$! Q$!_\0 'P   04! 0$!
M 0$           $" P0%!@<("0H+_\0 M1   @$# P($ P4%! 0   %] 0(#
M  01!1(A,4$&$U%A!R)Q%#*!D:$((T*QP152T? D,V)R@@D*%A<8&1HE)B<H
M*2HT-38W.#DZ0T1%1D=(24I35%565UA96F-D969G:&EJ<W1U=G=X>7J#A(6&
MAXB)BI*3E)66EYB9FJ*CI*6FIZBIJK*SM+6VM[BYNL+#Q,7&Q\C)RM+3U-76
MU]C9VN'BX^3EYN?HZ>KQ\O/T]?;W^/GZ_\0 'P$  P$! 0$! 0$! 0
M  $" P0%!@<("0H+_\0 M1$  @$"! 0#! <%! 0  0)W  $" Q$$!2$Q!A)!
M40=A<1,B,H$(%$*1H;'!"2,S4O 58G+1"A8D-.$E\1<8&1HF)R@I*C4V-S@Y
M.D-$149'2$E*4U155E=865IC9&5F9VAI:G-T=79W>'EZ@H.$A8:'B(F*DI.4
ME9:7F)F:HJ.DI::GJ*FJLK.TM;:WN+FZPL/$Q<;'R,G*TM/4U=;7V-G:XN/D
MY>;GZ.GJ