### Data cleaning and preprocessing

In [None]:
import pandas as pd
import numpy as np
import re
import os
from dotenv import load_dotenv
load_dotenv()

In [2]:
# Load the raw data from the CSV file named by the RAW_DATA_PATH environment variable
def load_abstracts() -> pd.DataFrame:
    """
    Load the abstract narration data from a CSV file.

    The file location is read from the ``RAW_DATA_PATH`` environment variable.

    Returns:
    - pd.DataFrame: The contents of the CSV file

    Raises:
    - ValueError: If ``RAW_DATA_PATH`` is not set
    """
    raw_path = os.getenv('RAW_DATA_PATH')

    # Fail early with an actionable message; passing None to read_csv raises
    # a ValueError that does not mention the missing environment variable.
    if raw_path is None:
        raise ValueError(
            "RAW_DATA_PATH environment variable is not set; "
            "cannot locate the raw CSV file"
        )

    return pd.read_csv(raw_path)

In [3]:
# Read the raw CSV into memory; load_abstracts() takes the path from RAW_DATA_PATH.
data = load_abstracts()

In [None]:
# Preview the first five rows to sanity-check the load.
data.head(5)

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [5]:
def clean_string_column(
        df: pd.DataFrame,
        column_name: str,
        lower: bool = True,
        remove_special: bool = True,
        remove_numbers: bool = False
) -> pd.DataFrame:
    """
    Cleans strings in the specified column of a pandas DataFrame.

    The column is overwritten in place on ``df``; the same DataFrame object
    is returned for convenience. Values that are not strings (NaN, None,
    numbers, ...) are left unchanged.

    Args:
    - df: pd.DataFrame - The input DataFrame (modified in place)
    - column_name: str - The name of the column to clean
    - lower: bool - Whether to convert text to lowercase (default: True)
    - remove_special: bool - Whether to remove every character that is not a
      word character or whitespace (default: True). Underscores count as
      word characters and are kept.
    - remove_numbers: bool - Whether to remove digits (default: False)

    Returns:
    - pd.DataFrame: The same DataFrame, with the column cleaned

    Raises:
    - KeyError: If ``column_name`` is not a column of ``df``
    """
    # Compile the patterns once instead of looking them up for every row.
    special_chars = re.compile(r'[^\w\s]') if remove_special else None
    digits = re.compile(r'\d+') if remove_numbers else None

    def clean_text(text):
        # Missing values and any other non-string entries have no text to
        # clean; pass them through instead of failing on a str method.
        if not isinstance(text, str):
            return text

        if lower:
            text = text.lower()

        if special_chars is not None:
            text = special_chars.sub('', text)

        if digits is not None:
            text = digits.sub('', text)

        # Strip last: removing punctuation or digits at either end can
        # expose whitespace that an earlier strip would have missed
        # (e.g. "Trailing !" -> "trailing ").
        return text.strip()

    # Apply the cleaning function to the specified column
    df[column_name] = df[column_name].apply(clean_text)

    return df


In [6]:
# Clean the AbstractNarration column with the default options
# (lowercase, special characters removed, digits kept).
# NOTE: clean_string_column overwrites the column on `data` itself and returns
# that same object, so `data_cleaned` and `data` refer to one DataFrame.
data_cleaned = clean_string_column(df=data, column_name="AbstractNarration")

In [None]:
# Display the cleaned text column.
data_cleaned.AbstractNarration

In [None]:
# Missing-value count per column, largest first.
data_cleaned.isnull().sum().sort_values(ascending=False)

In [9]:
# Drop every row that has a missing value in any column (dropna defaults).
data_cleaned = data_cleaned.dropna()

In [None]:
# Re-run the missing-value count after dropna() to confirm nothing is left.
data_cleaned.isnull().sum().sort_values(ascending=False)

In [None]:
# Number of distinct values in each column.
data_cleaned.nunique()

In [12]:
# Work on an independent copy so the de-duplication that follows does not touch data_cleaned.
final_data = data_cleaned.copy()

In [13]:
# Remove rows that duplicate an earlier row across all columns (first occurrence is kept).
final_data.drop_duplicates(inplace=True)

In [None]:
# Distinct values per column as a percentage of the row count
# (100 means every row holds a different value in that column).
print("% Unique values from total:", (final_data.nunique()/len(final_data)*100))

In [None]:
# Row count after dropping missing values and duplicate rows.
len(final_data)

In [16]:
# Write the cleaned, de-duplicated data to the CSV file named by FINAL_DATA_PATH.
final_data_path = os.getenv("FINAL_DATA_PATH")

# to_csv(None, ...) does not write a file; it returns the CSV text instead.
# Stop here so an unset variable cannot look like a successful save.
if final_data_path is None:
    raise ValueError(
        "FINAL_DATA_PATH environment variable is not set; "
        "the cleaned data was not written"
    )

final_data.to_csv(final_data_path, index=False)