### Base class for data preprocessing

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Detect language
from langdetect import detect
from deep_translator import GoogleTranslator
from textwrap import wrap

In [2]:
# nltk.download('stopwords')

In [None]:
class DataIngestion:
    """
    Reads the source tables from files and performs initial data cleaning.

    The constructor only stores the six file paths; nothing is read until
    ``load_data`` is called. Every ``clean_*`` method returns a new DataFrame
    and leaves the DataFrame passed in untouched.
    """
    def __init__(self, volumes_file_path:str, archives_file_path: str, 
                 articles_file_path: str, 
                 contents_file_path:str, 
                 authors_file_path:str, authors_articles_file_path:str):
        self.volumes_file_path = volumes_file_path        
        self.archives_file_path = archives_file_path
        self.articles_file_path = articles_file_path
        self.contents_file_path = contents_file_path
        self.authors_file_path = authors_file_path
        self.authors_articles_file_path = authors_articles_file_path

    @staticmethod
    def _read_table(file_path: str) -> pd.DataFrame:
        """
        Read one table from disk, choosing the pandas reader by file extension.

        NOTE(review): the on-disk format is not stated anywhere in this file.
        CSV is assumed as the default; confirm against the real data files.
        """
        lowered_path = str(file_path).lower()
        if lowered_path.endswith(('.parquet', '.pq')):
            return pd.read_parquet(file_path)
        if lowered_path.endswith(('.xlsx', '.xls')):
            return pd.read_excel(file_path)
        if lowered_path.endswith('.json'):
            return pd.read_json(file_path)
        if lowered_path.endswith('.tsv'):
            return pd.read_csv(file_path, sep='\t')
        return pd.read_csv(file_path)

    @staticmethod
    def _fill_and_lowercase(df: pd.DataFrame, column: str) -> pd.DataFrame:
        """
        Return a copy of ``df`` where ``column`` has missing values replaced by
        '' and a new ``<column>_clean`` column holds its lower-cased text.
        """
        filled = df[column].fillna('')
        return df.assign(**{column: filled, f"{column}_clean": filled.str.lower()})

    def load_data(self) -> tuple[pd.DataFrame, ...]:
        """
        Loads the six DataFrames from the file paths given to the constructor.

        Returns:
            (volumes_df, archives_df, articles_df, contents_df, authors_df,
            authors_articles_df), the same order DataIngestion_MySQL uses.
        """
        print("[INFO] Loading data files...")
        # The previous implementation read from self.mysql_writer, an attribute
        # this class never defines, so it could not run; read the files instead.
        volumes_df = self._read_table(self.volumes_file_path)
        archives_df = self._read_table(self.archives_file_path)
        articles_df = self._read_table(self.articles_file_path)
        contents_df = self._read_table(self.contents_file_path)
        authors_df = self._read_table(self.authors_file_path)
        authors_articles_df = self._read_table(self.authors_articles_file_path)

        return volumes_df, archives_df, articles_df, contents_df, authors_df, authors_articles_df

    def clean_articles(self, articles_df: pd.DataFrame) -> pd.DataFrame:
        """
        Simple data preprocessing: replace every missing value with ''.
        """
        print("[INFO] Cleaning articles...")
        return articles_df.fillna('')

    def clean_article_authors(self, articles_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'author' values with '' and add 'author_clean': the author
        text with periods removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped.
        """
        authors = articles_df['author'].fillna('')
        author_clean = (authors
                        .str.replace(r'\.', '', regex=True)
                        .str.replace(r'\s+', ' ', regex=True)  # collapse whitespace runs
                        .str.strip()
                        )

        return articles_df.assign(author=authors, author_clean=author_clean)

    def clean_article_titles (self, articles_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'article_title' values with '' and add the lower-cased
        'article_title_clean' column.
        """
        return self._fill_and_lowercase(articles_df, 'article_title')

    def clean_article_abstracts(self, articles_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'abstract' values with '' and add the lower-cased
        'abstract_clean' column, blanked when it has fewer than 3 words.
        """
        abstracts = articles_df['abstract'].fillna('')
        # Abstracts with fewer than 3 whitespace-separated words become ''.
        abstract_clean = abstracts.str.lower().apply(
            lambda x: '' if len(x.split()) < 3 else x
        )

        return articles_df.assign(abstract=abstracts, abstract_clean=abstract_clean)

    def clean_article_contents(self, contents_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'content' values with '' and add the lower-cased
        'content_clean' column.
        """
        return self._fill_and_lowercase(contents_df, 'content')

    def clean_archive_titles (self, archives_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'archive_title' values with '' and add the lower-cased
        'archive_title_clean' column.
        """
        return self._fill_and_lowercase(archives_df, 'archive_title')

class DataIngestion_MySQL:
    """
    Reads the source tables from MySQL and performs initial data cleaning.

    Tables are read through a ``MySQLWriter`` instance created in ``__init__``.
    NOTE(review): ``MySQLWriter`` is not defined or imported in the visible
    part of this file; it must be in scope before this class is instantiated.

    Every ``clean_*`` method returns a new DataFrame and leaves the DataFrame
    passed in untouched.
    """
    def __init__(self, db_host: str = "localhost", db_user: str = "root",
                 db_password: str = "root", db_name: str = "fmdb"):
        """
        Store the connection settings and create the MySQL accessor.

        The defaults reproduce the values that used to be hard-coded, so
        ``DataIngestion_MySQL()`` behaves as before. Pass real credentials
        explicitly rather than relying on the root/root default.
        """
        self.DB_HOST = db_host
        self.DB_USER = db_user
        self.DB_PASSWORD = db_password
        self.DB_NAME = db_name
        self.mysql_writer = MySQLWriter(self.DB_HOST, self.DB_USER, self.DB_PASSWORD, self.DB_NAME)

    @staticmethod
    def _fill_and_lowercase(df: pd.DataFrame, column: str) -> pd.DataFrame:
        """
        Return a copy of ``df`` where ``column`` has missing values replaced by
        '' and a new ``<column>_clean`` column holds its lower-cased text.
        """
        filled = df[column].fillna('')
        return df.assign(**{column: filled, f"{column}_clean": filled.str.lower()})

    def load_data(self) -> tuple[pd.DataFrame, ...]:
        """
        Loads the six DataFrames from MySQL.

        Returns:
            (volumes_df, archives_df, articles_df, contents_df, authors_df,
            authors_articles_df).
        """
        print("[INFO] Loading data files...")
        reader = self.mysql_writer
        volumes_df = reader.read_table_volumes()
        # NOTE(review): status="" presumably disables status filtering;
        # confirm against MySQLWriter.
        archives_df = reader.read_table_archives(status="")
        articles_df = reader.read_table_articles(status="")
        contents_df = reader.read_table_contents()
        authors_df = reader.read_table_authors()
        authors_articles_df = reader.read_table_authors_articles()

        return volumes_df, archives_df, articles_df, contents_df, authors_df, authors_articles_df

    def clean_articles(self, articles_df: pd.DataFrame) -> pd.DataFrame:
        """
        Simple data preprocessing: replace every missing value with ''.
        """
        print("[INFO] Cleaning articles...")
        return articles_df.fillna('')

    def clean_article_authors(self, articles_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'author' values with '' and add 'author_clean': the author
        text with periods removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped.
        """
        authors = articles_df['author'].fillna('')
        author_clean = (authors
                        .str.replace(r'\.', '', regex=True)
                        .str.replace(r'\s+', ' ', regex=True)  # collapse whitespace runs
                        .str.strip()
                        )

        return articles_df.assign(author=authors, author_clean=author_clean)

    def clean_article_titles (self, articles_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'article_title' values with '' and add the lower-cased
        'article_title_clean' column.
        """
        return self._fill_and_lowercase(articles_df, 'article_title')

    def clean_article_abstracts(self, articles_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'abstract' values with '' and add the lower-cased
        'abstract_clean' column, blanked when it has fewer than 3 words.
        """
        abstracts = articles_df['abstract'].fillna('')
        # Abstracts with fewer than 3 whitespace-separated words become ''.
        abstract_clean = abstracts.str.lower().apply(
            lambda x: '' if len(x.split()) < 3 else x
        )

        return articles_df.assign(abstract=abstracts, abstract_clean=abstract_clean)

    def clean_article_contents(self, contents_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'content' values with '' and add the lower-cased
        'content_clean' column.
        """
        return self._fill_and_lowercase(contents_df, 'content')

    def clean_archive_titles (self, archives_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing 'archive_title' values with '' and add the lower-cased
        'archive_title_clean' column.
        """
        return self._fill_and_lowercase(archives_df, 'archive_title')

class TextPreprocessor:
    """
    Class responsible for additional text cleaning and preprocessing before topic modeling.

    ``preprocess_text`` only normalises text. Stop-word removal and
    lemmatization are separate, opt-in steps (``remove_stop_words`` and
    ``lemmatize``).
    """

    # deep_translator's GoogleTranslator rejects payloads of 5000 characters or
    # more, so longer texts are translated in chunks below that limit.
    _MAX_TRANSLATE_CHARS = 4500

    def __init__(self):
        """
        Build the stop-word set (NLTK English list plus 'et' and 'al') and the
        WordNet lemmatizer.

        NOTE: the NLTK 'stopwords' corpus must already be downloaded
        (``nltk.download('stopwords')``); this constructor does not fetch it.
        """
        self.stop_words = set(stopwords.words('english'))
        self.stop_words.update(['et', 'al']) # custom stop words
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text: str) -> str:
        """
        Lower-case the text, remove URLs, replace every character that is not
        an ASCII letter, whitespace or one of ``. , ! ? ; : ' " ( ) [ ] -`` with
        a space, and collapse whitespace runs to a single space.

        Stop words are not removed and nothing is lemmatized here. The result
        is not stripped, so it may start or end with one space.
        """
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)

        # Keep letters plus punctuation that may carry context for the
        # embedding model; everything else becomes a space.
        text = re.sub(r'[^a-zA-Z\s.,!?;:\'\"()\[\]-]', ' ', text)

        return re.sub(r'\s+', ' ', text)

    def remove_stop_words (self, text: str) -> str:
        """
        Drop whitespace-separated tokens that are in ``self.stop_words``.

        Matching ignores case, so 'The' is removed as well as 'the'. Kept
        tokens are returned unchanged and joined with single spaces.
        """
        # Remove stopwords is not advised in BERTopic. 
        # https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html#document-length
        kept = [w for w in text.split() if w.lower() not in self.stop_words]

        return ' '.join(kept)

    def lemmatize (self, text: str) -> str:
        """
        Lemmatize each whitespace-separated token with ``self.lemmatizer`` and
        join the results with single spaces.
        """
        return ' '.join(self.lemmatizer.lemmatize(w) for w in text.split())

    # Translate to English if the language is not English
    def translate_to_english (self, lang, text):
        """
        Translate ``text`` to English unless ``lang`` is already 'en'.

        Blank or non-string text is returned unchanged without calling the
        translation service. Text at or above ``_MAX_TRANSLATE_CHARS`` is split
        on whitespace into chunks, each translated separately, and rejoined
        with single spaces. A chunk for which the translator returns None is
        kept in its original language.
        """
        if lang == 'en' or not isinstance(text, str) or not text.strip():
            return text

        if len(text) < self._MAX_TRANSLATE_CHARS:
            chunks = [text]
        else:
            # replace_whitespace=False keeps newlines inside each chunk.
            chunks = wrap(text, self._MAX_TRANSLATE_CHARS, replace_whitespace=False)

        translator = GoogleTranslator(source='auto', target='en')
        translated_chunks = []
        for chunk in chunks:
            translated = translator.translate(chunk)
            translated_chunks.append(chunk if translated is None else translated)

        return ' '.join(translated_chunks)

    def detect_language (self, text):
        """
        Return the language code that ``langdetect.detect`` reports for ``text``.

        NOTE: langdetect raises an exception when the text has no detectable
        features (for example an empty string); callers should guard for that.
        Results are also not deterministic unless ``DetectorFactory.seed`` is
        set.
        """
        return detect(text)