<a href="https://colab.research.google.com/github/Yasaman-habibi/Pre_Processing_Report/blob/main/Filtered_Data_By_KeyWords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import Library
from google.colab import files, drive
import matplotlib.pyplot as plt
import pandas as pd
import os
import io
import re

In [None]:
#Upload Files
# Mount Google Drive, ask the user for the text files to process, and make
# sure the Drive folder that receives the outputs exists.

drive.mount('/content/drive')
# Later cells iterate this object by filename through .keys() and open each
# name from the current working directory.
uploaded_Texts = files.upload()
Combined_path = "/content/drive/MyDrive/Combined_Texts"
# exist_ok=True: re-running the cell is harmless when the folder is already there.
os.makedirs(Combined_path, exist_ok=True)
# NOTE(review): Combined_file is not referenced by the visible cells - confirm
# whether it is still needed.
Combined_file = os.path.join(Combined_path, "Combined_Texts.txt")

In [None]:
#Load the Excel word lists from Google Drive
# NOTE(review): only sustain_Dic (its "Word" column) is referenced by the
# visible cells; df_dict and keywords are loaded but not used below - confirm
# whether they are still needed.

df_dict = pd.read_excel("/content/drive/MyDrive/sustainability_table/Loughran-McDonald.xlsx")
keywords = pd.read_excel("/content/drive/MyDrive/sustainability_table/keywords.xlsx")
sustain_Dic = pd.read_excel("/content/drive/MyDrive/sustainability_table/sustain_Dic.xlsx")

In [None]:
#Read every uploaded text file and concatenate the contents for processing

# Each file's text is followed by a blank line so that the last paragraph of one
# file and the first paragraph of the next stay separate when split later.
# errors="ignore" mirrors the decoding used by the cell that writes the
# filtered output, so both passes accept the same set of files.
_file_texts = []
for filename in uploaded_Texts.keys():
    with open(filename, "r", encoding="utf-8", errors="ignore") as file:
        _file_texts.append(file.read() + "\n\n")
# Join once instead of growing the string with += inside the loop.
text = "".join(_file_texts)

def split_paragraphs(text):
    """Split *text* into paragraphs separated by blank lines.

    A separator is a newline, optional whitespace, and another newline.
    Each paragraph is stripped of surrounding whitespace; paragraphs that are
    empty after stripping are dropped.

    Args:
        text: The full text to split.

    Returns:
        A list of non-empty, stripped paragraph strings in original order.
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r'\n\s*\n', text))
    return [chunk for chunk in stripped_chunks if chunk]

# Paragraphs of all uploaded files combined, obtained by splitting on blank lines.
paragraphs = split_paragraphs(text)

In [None]:
#Build the lower-cased sustainability vocabulary used to filter paragraphs
# astype(str) keeps cells that were read as numbers or booleans usable with the
# .str accessor; without it such cells turn into NaN and term.split() fails.
_sustain_words = sustain_Dic["Word"].dropna().astype(str)
sustain_terms = set(_sustain_words.str.lower())


#Separate single-word terms from multi-word phrases (they are matched differently)
sustain_terms_raw = _sustain_words.str.strip().str.lower().tolist()
# A term that is empty after stripping has zero words and lands in neither set.
single_terms = {term for term in sustain_terms_raw if len(term.split()) == 1}
multi_terms = {term for term in sustain_terms_raw if len(term.split()) > 1}


# Compiled phrase patterns, keyed by term, so each term is compiled only once.
_PHRASE_PATTERNS = {}


def _phrase_pattern(term):
    """Return a compiled, cached regex that finds *term* as a whole phrase."""
    pattern = _PHRASE_PATTERNS.get(term)
    if pattern is None:
        parts = [re.escape(part) for part in term.split()]
        if parts:
            # \s+ between the words tolerates line breaks and repeated spaces;
            # the look-arounds reject hits that sit inside a longer word while
            # still allowing adjacent punctuation such as "." or "(".
            pattern = re.compile(r"(?<!\w)" + r"\s+".join(parts) + r"(?!\w)")
        else:
            # A blank term must never match anything.
            pattern = re.compile(r"(?!)")
        _PHRASE_PATTERNS[term] = pattern
    return pattern


def contains_sustain_terms(paragraph, single=None, multi=None):
    """Report whether *paragraph* mentions any sustainability term.

    The paragraph is lower-cased before matching; the terms are expected to be
    lower-case already.

    Single-word terms are compared with the paragraph's word tokens (runs of
    letters, digits and underscores), so a single-word term that itself
    contains punctuation such as a hyphen is not found by that path.
    Multi-word phrases are matched as whole phrases: the words may be
    separated by any whitespace (including a line break) and the phrase may be
    next to punctuation, but it may not be part of a longer word.

    Args:
        paragraph: Text of one paragraph.
        single: Container of single-word terms. Defaults to the module-level
            ``single_terms``.
        multi: Iterable of multi-word phrases. Defaults to the module-level
            ``multi_terms``.

    Returns:
        True if at least one single-word term or phrase occurs, else False.
    """
    if single is None:
        single = single_terms
    if multi is None:
        multi = multi_terms

    para_lower = paragraph.lower()

    # Single words: cheap membership test per token.
    words_in_para = re.findall(r'\b\w+\b', para_lower)
    if any(word in single for word in words_in_para):
        return True

    # Phrases: boundary-aware search on the lower-cased paragraph.
    return any(_phrase_pattern(term).search(para_lower) for term in multi)


#Keep only the paragraphs that mention at least one sustainability term
filtered_paragraphs = list(filter(contains_sustain_terms, paragraphs))
print(f"{len(filtered_paragraphs)} paragraphs contain sustainability-related terms.")

In [None]:
#Combine only paragraphs related to sustainability

# Un-numbered output path. The visible cells write to numbered
# "Combined_Sustain_text_<n>.txt" files instead and never use this name.
# NOTE(review): confirm whether this variable is still needed.
Combined_Sustain_text_path = os.path.join(Combined_path, "Combined_Sustain_text.txt")

def get_next_index(base_path, prefix="Combined_Sustain_text_", suffix=".txt"):
    """Return the next free numeric index for files named ``<prefix><n><suffix>``.

    Args:
        base_path: Directory whose entries are inspected.
        prefix: Literal text expected before the number.
        suffix: Literal text expected after the number.

    Returns:
        One more than the largest ``<n>`` found, or 1 when no entry matches.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when
            ``base_path`` does not exist).
    """
    # Compiled once, outside the loop. prefix/suffix are escaped so characters
    # such as "." are matched literally.
    numbered = re.compile(rf"{re.escape(prefix)}(\d+){re.escape(suffix)}")
    # fullmatch rather than match: a name with trailing extras, such as
    # "Combined_Sustain_text_3.txt.bak", is not a numbered output file.
    indices = [
        int(found.group(1))
        for found in map(numbered.fullmatch, os.listdir(base_path))
        if found
    ]
    return max(indices, default=0) + 1


# Write to a new numbered file: the index is one past the highest one already
# present in the output folder.
file_index = get_next_index(Combined_path)
output_filename = f"Combined_Sustain_text_{file_index}.txt"
output_path = os.path.join(Combined_path, output_filename)

with open(output_path, "w", encoding="utf-8") as outfile:
    for filename in uploaded_Texts.keys():
        # Read the whole upload first; bytes that are not valid UTF-8 are dropped.
        with open(filename, "r", encoding="utf-8", errors="ignore") as infile:
            content = infile.read()

        paragraphs = split_paragraphs(content)
        filtered_paragraphs = list(filter(contains_sustain_terms, paragraphs))
        if not filtered_paragraphs:
            # No relevant paragraph in this file: write no Start/End markers for it.
            continue

        outfile.write(f"===== Start of File: {filename} =====\n")
        for para in filtered_paragraphs:
            outfile.write(para + "\n\n")
        outfile.write(f"===== End of File: {filename} =====\n\n")

print(f" پاراگراف‌های فیلتر شده ذخیره شدند در: {output_path}")

In [None]:
#Merge all numbered sustainability files into one file
import glob

merged_output_path = os.path.join(Combined_path, "Combined_Sustain_MERGED.txt")


def _numbered_sustain_files(folder):
    """Return the numbered ``Combined_Sustain_text_*.txt`` paths in *folder*.

    The paths are ordered by the integer that precedes ``.txt``. Paths matched
    by the glob but lacking such a number are left out.
    """
    index_suffix = re.compile(r"_(\d+)\.txt$")
    indexed = []
    for path in glob.glob(os.path.join(folder, "Combined_Sustain_text_*.txt")):
        found = index_suffix.search(path)
        # The glob also matches names without a numeric index, e.g.
        # "Combined_Sustain_text_1 (1).txt"; those cannot be ordered, so skip
        # them instead of failing on a missing match.
        if found:
            indexed.append((int(found.group(1)), path))
    # Sort on the index only (stable), so equal indices keep the glob order.
    indexed.sort(key=lambda pair: pair[0])
    return [path for _, path in indexed]


files_to_merge = _numbered_sustain_files(Combined_path)

with open(merged_output_path, "w", encoding="utf-8") as outfile:
    for file_path in files_to_merge:
        with open(file_path, "r", encoding="utf-8") as infile:
            content = infile.read()
            # One extra newline separates consecutive source files.
            outfile.write(content + "\n")

print(f" همه فایل‌ها با موفقیت ترکیب شدند.\n مسیر فایل نهایی: {merged_output_path}")