In [4]:
# pip install PyMuPDF
# pip install python-docx
# pip install spacy
# python -m spacy download en_core_web_sm - On Terminal

import os
import zipfile

# Base location: the working directory the notebook kernel is running in.
current_directory = os.getcwd()

# Input folder holding the tender .zip archives.
folder_path = os.path.join(current_directory, 'TenderFiles_1')

# Output folder that receives the unpacked archives.
destination_directory = os.path.join(current_directory, 'Extracted_TenderFiles_1')

# Part-of-speech tags whose tokens are kept.
pos_tags_of_interest = [
    'NOUN',
    'VERB',
    'ADJ',
    'ADV',
]

# Named-entity labels whose entities are kept.
# NOTE(review): 'SCIENCE' and 'ARTICLE' do not look like labels emitted by the
# stock English spaCy pipeline, so they may never match - confirm.
ner_tags_of_interest = [
    'ORG',
    'GPE',
    'LOC',
    'NORP',
    'PRODUCT',
    'EVENT',
    'SCIENCE',
    'ARTICLE',
]


In [None]:
def _extract_zip(zip_path, target_dir):
    """Extract the archive at ``zip_path`` into ``target_dir``.

    Failures are reported with ``print`` instead of being raised, so one bad
    archive does not stop the remaining archives from being processed.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(target_dir)
    except zipfile.BadZipFile as e:
        print(f"Error: {e} - {zip_path} is not a valid ZIP file.")
    except Exception as e:
        print(f"An error occurred: {e}")


# Ensure the destination directory exists
os.makedirs(destination_directory, exist_ok=True)

# Unpack every tender archive into a folder named after its reference number
for file_name in os.listdir(folder_path):
    # Only .zip archives are of interest (extension matched case-insensitively)
    if not file_name.lower().endswith('.zip'):
        continue

    file_path = os.path.join(folder_path, file_name)

    # The reference number is the part of the file name before the first '-'
    tender_reference_number = file_name.split('-')[0]
    tender_extract_path = os.path.join(destination_directory, tender_reference_number)

    _extract_zip(file_path, tender_extract_path)

    # A failed extraction can leave no folder behind; listing it would raise
    # FileNotFoundError and abort the whole run, so skip this tender instead.
    if not os.path.isdir(tender_extract_path):
        continue

    # Archives often contain further archives: unpack those found at the top
    # level of the extracted folder into the same folder. os.listdir returns a
    # snapshot, so files created while extracting do not affect this loop.
    for inner_file_name in os.listdir(tender_extract_path):
        if inner_file_name.lower().endswith('.zip'):
            inner_file_path = os.path.join(tender_extract_path, inner_file_name)
            _extract_zip(inner_file_path, tender_extract_path)


In [16]:
import glob
import fitz
import docx

def _read_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, in page order."""
    # The context manager closes the document even if a page fails to parse
    with fitz.open(pdf_path) as pdf_document:
        return ''.join(page.get_text() for page in pdf_document)


def _read_docx_text(docx_path):
    """Return the paragraphs of the .docx at ``docx_path``, one per line."""
    doc = docx.Document(docx_path)
    return ''.join(paragraph.text + '\n' for paragraph in doc.paragraphs)


# Each sub-folder of the destination directory holds one extracted tender
tenders = [f for f in os.listdir(destination_directory) if os.path.isdir(os.path.join(destination_directory, f))]

# Build one plain-text file per tender, next to the tender's folder
for tender_reference_number in tenders:
    tender_file_path = os.path.join(destination_directory, tender_reference_number)
    tender_summary_file_path = os.path.join(destination_directory, tender_reference_number + ".txt")

    # Opening once in 'w' mode truncates any output from a previous run and
    # avoids reopening the file for every document in the tender.
    with open(tender_summary_file_path, 'w', encoding='utf-8') as file_writer:
        for root, dirs, files in os.walk(tender_file_path):
            for file in files:
                file_path = os.path.join(root, file)
                lowered_name = file.lower()

                try:
                    if lowered_name.endswith('.pdf'):
                        text_content = _read_pdf_text(file_path)
                    elif lowered_name.endswith('.docx'):
                        text_content = _read_docx_text(file_path)
                    else:
                        # Other file types contribute no text
                        continue
                except Exception as e:
                    # One unreadable document must not abort the remaining tenders
                    print(f"An error occurred: {e} - could not read {file_path}")
                    continue

                file_writer.write(text_content)


In [12]:
import spacy

# Load the English language model
nlp = spacy.load('en_core_web_sm')


def _collect_useful_texts(doc, pos_tags, ner_tags):
    """Return the unique texts of interest from ``doc`` in first-seen order.

    Tokens whose part-of-speech tag is in ``pos_tags`` come first, followed by
    entities whose label is in ``ner_tags``. A text already collected is not
    added again.
    """
    useful_texts = []
    seen = set()  # constant-time membership test; the list keeps the order

    def _keep(text):
        if text not in seen:
            seen.add(text)
            useful_texts.append(text)

    for token in doc:
        if token.pos_ in pos_tags:
            _keep(token.text)

    for ent in doc.ents:
        if ent.label_ in ner_tags:
            # Keep the entity's own text (not the last token of the document)
            _keep(ent.text)

    return useful_texts


# Process every tender text file found directly in the destination directory
tender_text_files = [name for name in os.listdir(destination_directory) if name.endswith(".txt")]
for tender_text_file in tender_text_files:
    file_path = os.path.join(destination_directory, tender_text_file)
    file_name_without_extension = os.path.splitext(os.path.basename(file_path))[0]
    tender_useful_summary_file_path = os.path.join(destination_directory, file_name_without_extension + "_useful.sum")

    with open(file_path, 'r', encoding='utf-8') as tender_file:
        tender_contents = tender_file.read()

    # Process the document with spaCy
    # NOTE(review): spaCy rejects texts longer than nlp.max_length; very large
    # tenders may need to be split into chunks first - confirm.
    tender_doc = nlp(tender_contents)

    useful_tokens = _collect_useful_texts(tender_doc, pos_tags_of_interest, ner_tags_of_interest)
    tender_useful_content = " ".join(useful_tokens).strip()

    # 'w' mode replaces any output left by a previous run
    with open(tender_useful_summary_file_path, 'w', encoding='utf-8') as file_writer:
        file_writer.write(tender_useful_content)

