<a href="https://colab.research.google.com/github/Yasaman-habibi/Pre_Processing_Report/blob/main/Cleaned_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#instal Library
!pip install nltk
!pip install textblob
!pip install emoji
!pip install clean-text[gpl]
!pip install symspellpy

!pip install symspellpy spacy
!python -m spacy download en_core_web_sm

In [None]:
#Import Library
import os
import re
import glob
import nltk
import emoji
import spacy
import pandas as pd
from nltk import pos_tag
from cleantext import clean
import ipywidgets as widgets
from textblob import TextBlob
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from IPython.display import display
from nltk.tokenize import word_tokenize
from symspellpy import SymSpell, Verbosity
from concurrent.futures import ThreadPoolExecutor
from nltk.stem import PorterStemmer, WordNetLemmatizer
from ipywidgets import SelectMultiple, Button, VBox, Layout

In [None]:
# Download the NLTK data packages listed below, one after another.
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

In [None]:
#Upload Files
from google.colab import drive, files

# Mount Google Drive so the folder holding the input texts is reachable.
drive.mount('/content/drive')
source_path = "/content/drive/MyDrive/Combined_Texts"

# glob.glob() returns matches in arbitrary, filesystem-dependent order; sort
# them so the widget lists the files in the same order on every run.
all_txt_files = sorted(glob.glob(os.path.join(source_path, "*.txt")))

# Multi-select list of every .txt file found; the chosen entries are read
# from selector.value in the next cell.
selector = widgets.SelectMultiple(
    options=all_txt_files,
    description='Select files',
    rows=10
)
display(selector)

In [None]:
# Snapshot the current widget selection as a plain list of file paths.
uploaded_Texts = [*selector.value]

# Folder that receives the cleaned files; created if it does not exist yet.
Cleaned_path = "/content/drive/MyDrive/Cleaned_Texts"
os.makedirs(Cleaned_path, exist_ok=True)

# Echo the selected paths, one per line, under a header line.
selected_listing = "\n".join(uploaded_Texts)
print(" انتخاب شد:\n" + selected_listing)

In [None]:
#Config
# SymSpell spell-checker backed by a word-frequency dictionary stored on Drive.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = "/content/drive/MyDrive/sustainability_table/frequency_dictionary_en_82_765.txt"
# load_dictionary() signals a missing file by returning False instead of
# raising. Fail loudly here; otherwise every later lookup would run against an
# empty dictionary and spell-checking would silently do nothing.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")

# Porter stemmer applied to spaCy lemmas in preprocess_text (created once).
stemmer = PorterStemmer()

# spaCy pipeline with the "ner" and "parser" components disabled; max_length is
# raised above spaCy's default so long documents are accepted.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.max_length = 50000000

nltk_stopwords = set(stopwords.words("english"))

In [None]:
# preprocess_text Function

# NOTE(review): `stopwords` is already imported in the notebook's import cell,
# so this import is redundant. `stop_words` is not read by the preprocess_text
# definition in this cell, which filters on spaCy's `token.is_stop`; confirm no
# other cell uses it before removing.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def _split_into_chunks(text, chunk_size):
    """Split ``text`` into pieces of roughly ``chunk_size`` characters on whitespace.

    Each cut is moved forward to the next whitespace character so that no word
    is split across two chunks. The look-ahead is limited to ``chunk_size``
    extra characters; if no whitespace is found in that window the text is cut
    at the nominal position, which keeps every chunk below ``2 * chunk_size``.
    """
    whitespace = re.compile(r'\s')
    pieces = []
    start, total = 0, len(text)
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            boundary = whitespace.search(text, end, end + chunk_size)
            if boundary:
                end = boundary.start()
        pieces.append(text[start:end])
        start = end
    return pieces


def preprocess_text(text, do_spell_check=True, remove_stopwords=True, use_stem=True, chunk_size=50000):
    """Clean ``text`` and return it as a list of ``(term, tag)`` tuples.

    The text is processed chunk by chunk. For every chunk: runs of three or
    more identical characters are collapsed, words are optionally
    spell-corrected with SymSpell, punctuation and digits are removed, the
    text is lower-cased, and the result is run through the spaCy pipeline.

    Args:
        text: Raw input text. ``None``, empty or whitespace-only input yields ``[]``.
        do_spell_check: If true, replace each all-lowercase, digit-free word by
            the closest SymSpell suggestion (when one exists).
        remove_stopwords: If true, drop tokens that spaCy flags as stop words.
        use_stem: If true, the term is the Porter stem of the spaCy lemma;
            otherwise it is the lemma itself.
        chunk_size: Approximate number of characters per chunk; chunks are cut
            on whitespace so words are never split.

    Returns:
        A list of ``(term, tag)`` tuples in document order, where ``tag`` is
        the spaCy ``token.tag_`` value.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if not text or text.strip() == "":
        return []
    if chunk_size < 1:
        # A negative size used to yield no chunks at all and silently return [].
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    def process_chunk(chunk):
        # Collapse any character repeated three or more times in a row to one.
        chunk = re.sub(r'(.)\1{2,}', r'\1', chunk)

        #spell Check
        if do_spell_check:
            corrected_words = []
            for word in chunk.split():
                # Words containing an uppercase letter or a digit are kept as-is.
                if re.search(r'[A-Z0-9]', word):
                    corrected_words.append(word)
                else:
                    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
                    corrected_words.append(suggestions[0].term if suggestions else word)
            chunk = ' '.join(corrected_words)

        #Normalize
        chunk = re.sub(r'[^\w\s]', '', chunk)
        # Remove digits BEFORE collapsing whitespace: the other order leaves
        # double spaces behind, which spaCy turns into whitespace tokens.
        chunk = re.sub(r'\d+', '', chunk)
        chunk = re.sub(r'\s+', ' ', chunk).strip()
        chunk = chunk.lower()
        if not chunk:
            return []

        doc = nlp(chunk)

        #Tokenize, remove stop words, lemmatize (and optionally stem)
        kept = [token for token in doc
                if not token.is_space and not (remove_stopwords and token.is_stop)]
        if use_stem:
            return [(stemmer.stem(token.lemma_), token.tag_) for token in kept]
        return [(token.lemma_, token.tag_) for token in kept]

    # Merge the per-chunk outputs in document order.
    tokens_out = []
    for chunk in _split_into_chunks(text, chunk_size):
        tokens_out.extend(process_chunk(chunk))

    return tokens_out

In [None]:
#Batch Processing

from time import sleep

# Settings and accumulator for the batch run below.
cleaned_paragraphs = list()   # one token list per processed file
batch_size = 5                # number of files submitted to the pool per batch
filenames = uploaded_Texts    # paths taken from the widget selection


def process_file(filepath):
    """Clean one text file, save the tagged result, and return the tokens.

    Reads ``filepath`` as UTF-8, runs it through ``preprocess_text`` with spell
    checking, stop-word removal and stemming enabled, and writes the tokens as
    space-separated ``word/tag`` pairs to ``cleaned_<name>.txt`` inside
    ``Cleaned_path``.

    Args:
        filepath: Path of the input text file.

    Returns:
        The list of ``(word, tag)`` tuples produced by ``preprocess_text``.
    """
    with open(filepath, "r", encoding="utf-8") as source:
        raw_text = source.read()

    tagged_tokens = preprocess_text(
        raw_text,
        do_spell_check=True,
        remove_stopwords=True,
        use_stem=True,
    )
    serialized = ' '.join(f"{word}/{pos}" for word, pos in tagged_tokens)

    # Output name is derived from the input file name without its extension.
    base_name, _extension = os.path.splitext(os.path.basename(filepath))
    output_filename = f"cleaned_{base_name}.txt"

    with open(os.path.join(Cleaned_path, output_filename), "w", encoding="utf-8") as target:
        target.write(serialized)

    print(f"فایل تمیزشده ذخیره شد: {output_filename}")
    return tagged_tokens

# Run process_file over the selected files, batch_size files at a time.
for i in range(0, len(filenames), batch_size):
    batch_files = filenames[i:i+batch_size]
    print(f"در حال پردازش فایل‌های {i+1} تا {i+len(batch_files)} از {len(filenames)}...")

    # A new pool is created for every batch; leaving the `with` block waits
    # until all of its workers have finished.
    with ThreadPoolExecutor(max_workers=4) as pool:
        # map() hands back results in the same order as batch_files.
        results = [*pool.map(process_file, batch_files)]

    cleaned_paragraphs += results

    # Short pause before the next batch starts.
    sleep(0.5)

print(f"\nتمام فایل‌ها پردازش شدند و متن‌ها در حافظه نگهداری شدند.")
