<a href="https://colab.research.google.com/github/Yasaman-habibi/Analysis-of-Sustainability-Reports/blob/main/Filtered_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files, drive
import pandas as pd
import matplotlib.pyplot as plt
import os
import io
import re

In [None]:
# Mount Google Drive so outputs can be written under /content/drive.
drive.mount('/content/drive')

# 1. Upload the text files
# The result is used later as a mapping whose keys are the uploaded file names.
uploaded_Texts = files.upload()

# Drive folder that holds the combined text outputs (created if missing).
Combined_path = "/content/drive/MyDrive/Combined_Texts"
os.makedirs(Combined_path, exist_ok=True)
# Target file for the plain concatenation of uploaded text files.
Combined_file = os.path.join(Combined_path, "Combined_Texts.txt")

In [None]:
# 2. Upload the Loughran-McDonald dictionary file

uploaded_Dic = files.upload()

# Fail fast on an empty upload; otherwise `df_dict` would be undefined here,
# or a stale value from an earlier run would silently be re-saved below.
if not uploaded_Dic:
    raise ValueError("No dictionary file was uploaded.")

# Only one workbook is kept, so parse just the last uploaded file instead of
# parsing every upload and discarding all but the final result.
filename = list(uploaded_Dic)[-1]
df_dict = pd.read_excel(io.BytesIO(uploaded_Dic[filename]))

Dic_path = "/content/drive/MyDrive/sustainability_table"
os.makedirs(Dic_path, exist_ok=True)
Dic_file = os.path.join(Dic_path, "Loughran-McDonald.xlsx")

# Persist a copy on Drive so it can be reloaded without uploading again.
df_dict.to_excel(Dic_file, index=False)

In [None]:
# Reload the dictionary and the keyword list that were saved to Drive.
df_dict = pd.read_excel("/content/drive/MyDrive/sustainability_table/Loughran-McDonald.xlsx")

# Keep `keywords` as a list of lower-case strings, the same shape the
# keyword-building cell produces. Left as a DataFrame, `for key in keywords`
# would iterate over the column labels instead of the keywords themselves.
keywords = (
    pd.read_excel("/content/drive/MyDrive/sustainability_table/keywords.xlsx")["keyword"]
    .dropna()
    .astype(str)
    .str.strip()
    .str.lower()
    .tolist()
)

In [None]:
# 3. Build the keywords file
keywords_path = "/content/drive/MyDrive/sustainability_table"
os.makedirs(keywords_path, exist_ok=True)
keywords_file = os.path.join(keywords_path, "keywords.xlsx")

# Final list of keywords
default_keywords = [
    "Sustainable", "Sustainability", "Sustainable finance", "Sustainable innovation", "Sustainable agriculture",
    "Sustainable materials", "Sustainable supply chain", "Sustainable development",
    "SDGs", "ESG", "Non-Financial", "Development",
    "Environmental", "Environmental, social, and governance", "Environmental protection", "Environmental impact", "environment",
    "climate", "Climate mitigation", "Climate change",
    "Economic", "Economic sustainability", "Economy", "Circular economy", "Green economy", "Green technology",
    "Social", "Society", "Corporate social responsibility", "CSR",
    "Carbon footprint", "Carbon emissions", "Pollutants", "Greenhouse Gas Emissions", "Decarbonization",
    "Renewable energy", "Clean energy", "Energy efficiency", "Recycling", "Demographic changes",
    "Waste management", "Zero waste", "Natural Resources", "Resource management",
    "Earth", "Air", "biodiversity", "Crisis", "Atmospheric", "Water", "pollution", "Pollution reduction",
    "Drought", "Famine", "Water conservation",
    "Ground Warming", "Global Warming", "Species extinction", "Ecosystem preservation",
    "Future Needs", "Life cycle assessment", "Eco-friendly", "Responsible consumption", "Human rights",
    "Better life",
]

# Strip surrounding whitespace before de-duplicating: a keyword with a
# trailing space can never occur as a substring of a single dictionary word,
# so it would silently match nothing. Blank entries are dropped for the same
# reason in reverse (an empty keyword is a substring of every word).
updated_keywords_df = pd.DataFrame(
    sorted({kw.strip() for kw in default_keywords if kw.strip()}),
    columns=["keyword"],
)
updated_keywords_df.to_excel(keywords_file, index=False)

# Lower-case list used for the case-insensitive matching against the dictionary.
keywords = updated_keywords_df["keyword"].str.lower().tolist()

In [None]:
# 4. Build the set of sustainability terms from the dictionary, based on keywords

# A dictionary word is kept when any keyword occurs inside it as a substring.
# NOTE(review): substring matching means a short keyword such as "air" also
# selects words that merely contain it (e.g. "fair") - confirm this breadth
# is intended.
sustain_terms = set()
for word in df_dict["Word"]:
    word_lower = str(word).lower()
    if any(key in word_lower for key in keywords):
        sustain_terms.add(word_lower)

print(f"{len(sustain_terms)} sustainability-related terms extracted.")


# Save the matching dictionary rows to a new file using sustain_terms

sustain_Dic = df_dict[df_dict["Word"].str.lower().isin(sustain_terms)]


sustain_Dic_path = "/content/drive/MyDrive/sustainability_table"
os.makedirs(sustain_Dic_path, exist_ok=True)
sustain_Dic_file = os.path.join(sustain_Dic_path, "sustain_Dic.xlsx")
sustain_Dic.to_excel(sustain_Dic_file, index=False)

# Report the file that was actually written, not just its parent directory.
print(f"Filtered Excel file saved to: {sustain_Dic_file}")

In [None]:
# 5. Read the uploaded text files for processing
# errors="ignore" matches how the same uploads are read when the filtered
# output is written, so an undecodable byte does not abort this cell alone.
# Pieces are collected in a list and joined once instead of growing a string.
parts = []
for filename in uploaded_Texts.keys():
    with open(filename, "r", encoding="utf-8", errors="ignore") as file:
        # A blank line after each file keeps its last paragraph separate
        # from the first paragraph of the next file.
        parts.append(file.read() + "\n\n")
text = "".join(parts)

def split_paragraphs(text):
    """Split *text* into paragraphs separated by blank lines.

    A separator is a newline, optional whitespace, then another newline.
    Each paragraph is returned with surrounding whitespace removed, and
    pieces that are empty after stripping are dropped.
    """
    chunks = re.split(r'\n\s*\n', text)
    stripped = (chunk.strip() for chunk in chunks)
    return [chunk for chunk in stripped if chunk]

# Split the concatenated text of all uploaded files into paragraphs.
paragraphs = split_paragraphs(text)

In [None]:
# 6. Filter the text using the dictionary built from the sustainability keywords

# Read the filtered dictionary back from Drive.
sustain_Dic_path = "/content/drive/MyDrive/sustainability_table/sustain_Dic.xlsx"
sustain_Dic = pd.read_excel(sustain_Dic_path)

# Lower-cased dictionary words, with missing cells dropped first.
lowered_words = sustain_Dic["Word"].dropna().str.lower()
sustain_terms = set(lowered_words.tolist())


def contains_sustain_terms(paragraph, terms=None):
    """Return True if *paragraph* contains at least one sustainability term.

    Matching is case-insensitive and works on whole tokens, not substrings.

    Args:
        paragraph: Text to inspect.
        terms: Optional collection of lower-case terms to look for. When
            omitted, the module-level ``sustain_terms`` is used.

    Returns:
        True when any token of the paragraph is in *terms*, else False.
    """
    if terms is None:
        terms = sustain_terms

    for raw in paragraph.lower().split():
        # Whitespace token with common ASCII punctuation trimmed from both
        # ends; kept so terms with internal punctuation can still match.
        if raw.strip(".,;:!?()[]{}\"'") in terms:
            return True
        # Also test each run of word characters, so typographic quotes,
        # dashes, slashes and hyphenated compounds ("climate-related")
        # do not hide a term.
        if any(piece in terms for piece in re.findall(r"\w+", raw)):
            return True
    return False


# Keep only the paragraphs related to sustainability
filtered_paragraphs = list(filter(contains_sustain_terms, paragraphs))

print(f"{len(filtered_paragraphs)} paragraphs contain sustainability-related terms.")

In [None]:
# 7. Combine only the sustainability-related paragraphs

# NOTE(review): this un-numbered path is not referenced again in the visible
# code; the output further down is written to a numbered file instead.
Combined_Sustain_text_path = os.path.join(Combined_path, "Combined_Sustain_text.txt")


def get_next_index(base_path, prefix="Combined_Sustain_text_", suffix=".txt"):
    """Return the next free numeric index for files named ``<prefix><n><suffix>``.

    Args:
        base_path: Directory to scan. It must already exist.
        prefix: Literal text that precedes the number in the file name.
        suffix: Literal text that follows the number in the file name.

    Returns:
        One more than the highest index found, or 1 when no file matches.

    Raises:
        OSError: If *base_path* cannot be listed.
    """
    # fullmatch anchors both ends, so names that merely start with a numbered
    # pattern (e.g. "Combined_Sustain_text_7.txt.bak") are not counted.
    pattern = re.compile(fr"{re.escape(prefix)}(\d+){re.escape(suffix)}")
    matches = (pattern.fullmatch(fname) for fname in os.listdir(base_path))
    indices = [int(match.group(1)) for match in matches if match]
    return max(indices, default=0) + 1


# Pick the next unused numbered file name inside the combined-texts folder.
file_index = get_next_index(Combined_path)
output_filename = f"Combined_Sustain_text_{file_index}.txt"
output_path = os.path.join(Combined_path, output_filename)

with open(output_path, "w", encoding="utf-8") as outfile:
    for filename in uploaded_Texts:
        # Undecodable bytes are dropped rather than aborting the run.
        with open(filename, "r", encoding="utf-8", errors="ignore") as infile:
            content = infile.read()

        paragraphs = split_paragraphs(content)
        filtered_paragraphs = list(filter(contains_sustain_terms, paragraphs))

        # Files without any matching paragraph contribute nothing to the output.
        if not filtered_paragraphs:
            continue

        outfile.write(f"===== Start of File: {filename} =====\n")
        outfile.writelines(para + "\n\n" for para in filtered_paragraphs)
        outfile.write(f"===== End of File: {filename} =====\n\n")

print(f" پاراگراف‌های فیلتر شده ذخیره شدند در: {output_path}")

In [None]:
# Upload text files and concatenate them into Combined_file.
uploaded_Merged = files.upload()


# NOTE(review): merged_output_path is assigned here, but the loop below writes
# to Combined_file - confirm which target is intended.
merged_output_path = os.path.join(Combined_path, "Combined_Sustain_MERGED.txt")

with open(Combined_file, "w", encoding="utf-8") as outfile:
    for filename in uploaded_Merged:
        # Undecodable bytes are dropped rather than aborting the run.
        with open(filename, "r", encoding="utf-8", errors="ignore") as infile:
            content = infile.read()
        # One newline is appended after each file's content.
        outfile.write(content + "\n")

print(f"All text files have been combined into {Combined_file}")

In [None]:
# 8. Merge the numbered files that were created

import glob

merged_output_path = os.path.join(Combined_path, "Combined_Sustain_MERGED.txt")

# The glob is broader than the numbering scheme (it also matches names such
# as "Combined_Sustain_text_old.txt" or "Combined_Sustain_MERGED"-style
# variants with extra text), so keep only files named exactly
# "Combined_Sustain_text_<digits>.txt". Without this filter the numeric sort
# key would fail on a name that has no trailing number.
numbered_name = re.compile(r"Combined_Sustain_text_(\d+)\.txt")

files_to_merge = sorted(
    (
        path
        for path in glob.glob(os.path.join(Combined_path, "Combined_Sustain_text_*.txt"))
        if numbered_name.fullmatch(os.path.basename(path))
    ),
    # Sort numerically so that file 10 comes after file 9, not after file 1.
    key=lambda x: int(numbered_name.fullmatch(os.path.basename(x)).group(1)),
)


with open(merged_output_path, "w", encoding="utf-8") as outfile:
    for file_path in files_to_merge:
        with open(file_path, "r", encoding="utf-8") as infile:
            content = infile.read()
            outfile.write(content + "\n")


print(f" همه فایل‌ها با موفقیت ترکیب شدند.\n مسیر فایل نهایی: {merged_output_path}")