In [1]:
# pip install PyMuPDF
# pip install python-docx
# pip install spacy
# python -m spacy download en_core_web_sm - On Terminal

import os
import zipfile
import pandas as pd
import numpy as np
import warnings
from collections import defaultdict
import traceback
from bs4 import BeautifulSoup

# Load the contracts spreadsheet; dtype=str asks pandas to read every column as text
# rather than inferring numeric or date types.
dataset = pd.read_excel('Contracts_Dataset.xlsx', dtype=str)

# Working directory of the kernel; the two tender folders below are resolved relative to it
current_directory = os.getcwd()

# Directory containing the downloaded tender .zip archives
download_directory = os.path.join(current_directory, 'Tender_Files_1')

# Destination directory for the extracted archive contents
extract_directory = os.path.join(current_directory, 'Tender_Files_Extract_1')

# Part-of-speech tags of interest (not referenced by the cells visible here)
pos_tags_of_interest = ['NOUN', 'VERB', 'ADJ', 'ADV']

# Named-entity tags of interest (not referenced by the cells visible here)
# NOTE(review): 'SCIENCE' and 'ARTICLE' do not look like labels of the standard spaCy
# English models - confirm these are intended.
ner_tags_of_interest = ['ORG', 'GPE', 'LOC', 'NORP', 'PRODUCT', 'EVENT', 'SCIENCE', 'ARTICLE']


In [2]:
def _extract_zip(zip_path, destination):
    """Extract the archive at ``zip_path`` into ``destination``.

    Failures are reported on stdout instead of being raised so that one bad
    archive does not stop the rest of the batch.

    Returns:
        True when the archive was extracted, False when it could not be read.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(destination)
        return True
    except zipfile.BadZipFile as e:
        print(f"Error: {e} - {zip_path} is not a valid ZIP file.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return False


# Ensure the destination directory exists
os.makedirs(extract_directory, exist_ok=True)

# Unzip every tender archive into a folder named after its reference number
for file_name in os.listdir(download_directory):
    file_path = os.path.join(download_directory, file_name)

    # Only .zip files are tender archives
    if not file_name.endswith('.zip'):
        continue

    # The reference number is the part of the file name before the first '-'
    tender_reference_number = file_name.split('-')[0]
    tender_extract_path = os.path.join(extract_directory, tender_reference_number)
    _extract_zip(file_path, tender_extract_path)

    # An invalid outer archive is rejected before anything is written, so the
    # target folder may not exist; listing it would raise FileNotFoundError.
    if not os.path.isdir(tender_extract_path):
        continue

    # Look for zip files directly inside the extracted content and unzip them too.
    # os.listdir returns a snapshot, so files extracted here are not re-visited.
    for inner_file_name in os.listdir(tender_extract_path):
        if inner_file_name.endswith('.zip'):
            inner_file_path = os.path.join(tender_extract_path, inner_file_name)
            _extract_zip(inner_file_path, tender_extract_path)


In [3]:
import glob
import fitz
import docx

def _read_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated."""
    # The context manager closes the document even when a page fails to parse
    with fitz.open(pdf_path) as pdf_document:
        return ''.join(page.get_text() for page in pdf_document)


def _read_docx_text(docx_path):
    """Return the paragraphs of the Word document at ``docx_path``, one per line."""
    word_document = docx.Document(docx_path)
    return ''.join(paragraph.text + '\n' for paragraph in word_document.paragraphs)


# Extract the textual content from all pdf and docx files.
# Every sub-folder of the extraction directory holds the files of one tender.
tenders = [f for f in os.listdir(extract_directory) if os.path.isdir(os.path.join(extract_directory, f))]

for tender_reference_number in tenders:
    tender_file_path = os.path.join(extract_directory, tender_reference_number)
    tender_summary_file_path = os.path.join(extract_directory, tender_reference_number + ".txt")

    # Opening once in 'w' mode truncates any summary left by an earlier run and keeps
    # a single UTF-8 handle for the whole tender; the file exists (possibly empty)
    # even when the tender has no readable documents.
    with open(tender_summary_file_path, 'w', encoding='utf-8') as file_writer:
        for root, dirs, files in os.walk(tender_file_path):
            for document_name in files:
                file_path = os.path.join(root, document_name)
                lowered_name = document_name.lower()  # also accept .PDF / .DOCX

                try:
                    if lowered_name.endswith('.pdf'):
                        text_content = _read_pdf_text(file_path)
                    elif lowered_name.endswith('.docx'):
                        text_content = _read_docx_text(file_path)
                    else:
                        continue
                except Exception as e:
                    # A single unreadable document (corrupt, encrypted, Word lock
                    # file, ...) is reported and skipped rather than aborting the run.
                    print(f"An error occurred while reading {file_path}: {e}")
                    continue

                file_writer.write(text_content)


In [None]:
# Download necessary NLTK data (stopword list and punkt tokenizer models).
# nltk reports the packages as up-to-date when they are already installed.
import nltk  # imported in this cell so it also runs on a freshly restarted kernel

nltk.download('stopwords')
nltk.download('punkt')

In [34]:
import nltk
from rake_nltk import Rake


from bs4 import BeautifulSoup

def _clean_cell(value):
    """Return a spreadsheet cell as stripped text; missing cells become ''."""
    # Even with dtype=str, empty spreadsheet cells arrive as NaN (a float),
    # which has no .strip() and would stringify to the literal 'nan'.
    if pd.isna(value):
        return ''
    return str(value).strip()


def _html_to_text(html):
    """Return the visible text of an HTML fragment.

    Text nodes are joined with a single space so that words on either side of
    a tag boundary (e.g. consecutive <p> elements) are not fused together.
    """
    soup = BeautifulSoup(html, 'lxml')
    return ' '.join(soup.stripped_strings)


# Initialize RAKE
rake = Rake()

# Column order of the output CSV
trimmed_columns = ['Reference Number', 'Contract Title', 'Description',
                   'UNSPSC Title', 'Supplier Name', 'Tenders Content']
rows_to_add = []

# Loop through the DataFrame one row at a time
for index, row in dataset.iterrows():
    reference_number = _clean_cell(row['Reference Number'])
    title = _clean_cell(row['Contract Title'])
    description = _clean_cell(row['Description'])
    description_text = _html_to_text(description)
    unspsc_title = _clean_cell(row['UNSPSC Title'])
    supplier_name = _clean_cell(row['Supplier Name'])

    # Use the extracted tender text for this reference number when it exists;
    # rows without a tender file keep an empty 'Tenders Content'.
    tender_useful_content = ''
    file_path = os.path.join(extract_directory, reference_number + ".txt")
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as tender_file:
            tender_contents = tender_file.read()

        # Extract keywords using RAKE and keep the ranked phrases as one string
        rake.extract_keywords_from_text(tender_contents)
        keywords = rake.get_ranked_phrases()
        tender_useful_content = " ".join(keywords).strip()

    rows_to_add.append({
        'Reference Number': reference_number,
        'Contract Title': title,
        'Description': description_text,
        'UNSPSC Title': unspsc_title,
        'Supplier Name': supplier_name,
        'Tenders Content': tender_useful_content
    })

# Build the frame once from the collected records; passing the column list keeps
# the header even when the dataset has no rows.
trimmed_df = pd.DataFrame(rows_to_add, columns=trimmed_columns)
trimmed_df.to_csv('trimmed_dataset_rake.csv', index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rashi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rashi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
import pandas as pd

# Read the RAKE output back from disk
rake_csv_path = 'trimmed_dataset_rake.csv'
trimmed_df = pd.read_csv(rake_csv_path)

# Show the keyword text of the first few tenders
tenders_content_preview = trimmed_df['Tenders Content'].head()
print(tenders_content_preview)

0    36 24x7 forticare contract 12 24x7 forticare c...
1    addendum 1 important request dfes211522 reques...
2    addendum 1 important tender process tender pro...
3    thursday 28 october 2021 clarification please ...
4    ac 12 zone 1 filter theatre 11 room 2 610 h x ...
Name: Tenders Content, dtype: object
