<a href="https://colab.research.google.com/github/Yasaman-habibi/Analysis-of-Sustainability-Reports/blob/main/Cleaned_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk
!pip install textblob
!pip install emoji
!pip install clean-text[gpl]

In [None]:
import os
import pandas as pd
import re
import emoji
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from cleantext import clean
import nltk
import matplotlib.pyplot as plt
from google.colab import drive, files
from time import sleep

In [None]:
# Fetch the NLTK data used by the tokenizer, stop-word filter, lemmatizer and
# POS tagger. NLTK 3.9+ looks up the tokenizer/tagger models under the names
# 'punkt_tab' and 'averaged_perceptron_tagger_eng', and some releases need
# 'omw-1.4' next to 'wordnet', so old and new names are both requested.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
    'omw-1.4',
):
    nltk.download(resource)

In [None]:
# Folder on Google Drive that receives the cleaned output.
Cleaned_path = "/content/drive/MyDrive/Cleaned_Texts"

# Mount Drive so that Cleaned_path becomes reachable.
drive.mount('/content/drive')

# Ask the user for the raw text files; later cells iterate over the keys of
# this mapping as file names.
uploaded_Texts = files.upload()

# Create the output folder if it does not exist yet.
os.makedirs(Cleaned_path, exist_ok=True)


In [None]:
# Text preprocessing function
def preprocess_text(text, do_spell_check=True):
    """Normalize raw English text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase; remove URLs, e-mail addresses and emoji with
    ``cleantext.clean``; optionally spell-correct with TextBlob; collapse runs
    of three or more identical non-digit characters to a single character;
    drop punctuation; squeeze whitespace; tokenize; remove English stop words;
    lemmatize and then stem every remaining token.

    Args:
        text: The raw text. Values for which ``pd.isnull`` is true
            (e.g. ``None`` or NaN) produce an empty string.
        do_spell_check: When true (the default), the text is passed through
            ``TextBlob(text).correct()`` before further cleaning.

    Returns:
        The processed tokens joined by single spaces, or ``""`` for null input.
    """
    if pd.isnull(text):
        return ""

    text = text.lower()

    text = clean(text,
                 no_urls=True,
                 no_emails=True,
                 no_emoji=True,
                 no_punct=False)

    if do_spell_check:
        text = str(TextBlob(text).correct())            # fix spelling mistakes

    # Collapse elongated characters ("sooooo" -> "so"). Digits are excluded so
    # that numbers such as "1000" are not silently rewritten to "10".
    text = re.sub(r'([^\d\n])\1{2,}', r'\1', text)
    text = re.sub(r'[^\w\s]', '', text)                 # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()            # remove extra whitespace

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))        # remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()                           # lemmatize, then stem
    tokens = [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]

    # The POS-tagging pass that used to run here was dropped: its result was
    # never used, so it only added run time.

    return ' '.join(tokens)

In [None]:
# NLTK data needed before the batch run in the next cell, which saves every
# cleaned file separately.
# Only the resources the preprocessing step uses are requested; downloading
# the 'all' collection would fetch every NLTK corpus and model.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the names NLTK 3.9+
# looks up; 'omw-1.4' is needed next to 'wordnet' by some releases.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Per-file mode: clean every uploaded document and store each result as its
# own "cleaned_<name>.txt" file inside Cleaned_path.
filenames = list(uploaded_Texts.keys())
batch_size = 5  # number of files per batch

for start in range(0, len(filenames), batch_size):
    batch_files = filenames[start:start + batch_size]
    print(f" در حال پردازش دسته {start+1} تا {start+len(batch_files)} از {len(filenames)} فایل...")

    for filename in batch_files:
        # Read the raw text, then clean it once the input file is closed.
        with open(filename, "r", encoding="utf-8") as file:
            content = file.read()
        cleaned_text = preprocess_text(content)

        # Build the output name from the input name without its extension.
        base_name, _ = os.path.splitext(filename)
        output_filename = f"cleaned_{base_name}.txt"
        output_path = os.path.join(Cleaned_path, output_filename)

        with open(output_path, "w", encoding="utf-8") as out_file:
            out_file.write(cleaned_text)

        print(f" فایل تمیز‌شده ذخیره شد: {output_filename}")

In [None]:
# Alternative mode: collect the cleaned text of all uploads in one list so it
# can be saved as a single combined file.
# Batch settings
filenames = list(uploaded_Texts.keys())
batch_size = 5

cleaned_paragraphs = []

for start in range(0, len(filenames), batch_size):
    batch_files = filenames[start:start + batch_size]
    print(f" در حال پردازش فایل‌های {start+1} تا {start+len(batch_files)} از {len(filenames)}...")

    for filename in batch_files:
        # Read the raw text, then clean it once the input file is closed.
        with open(filename, "r", encoding="utf-8") as file:
            content = file.read()
        cleaned = preprocess_text(content)
        cleaned_paragraphs.append(cleaned)

    # Short pause after each batch.
    sleep(0.5)

In [None]:
# Combined mode: write every cleaned document into one file on Drive, with a
# blank line after each document.
# Cleaned_file was previously never defined, so this cell failed with a
# NameError on a fresh kernel; it now lives inside the Cleaned_path folder.
Cleaned_file = os.path.join(Cleaned_path, "cleaned_all_texts.txt")

with open(Cleaned_file, "w", encoding="utf-8") as f:
    for para in cleaned_paragraphs:
        f.write(para + "\n\n")

print(f"\n تمام فایل‌ها پردازش و ذخیره شدند: {Cleaned_file}")