# In [1]:
import pandas as pd
import re
from datetime import datetime

class ChatbotResponseCleaner:
    """Clean a DataFrame of chatbot question/answer records.

    The frame must provide ``'Field Name'``, ``'Question'`` and ``'Answer'``
    columns; each is accessed by name and a missing one raises ``KeyError``.

    The cleaner works on a private copy of the frame it is given, so the
    caller's DataFrame is never modified. The cleaned result is returned by
    ``clean_responses`` and is also available as ``self.df``.
    """

    # (pattern, replacement) pairs applied in order by ``clean_text``.
    _SHORTHAND = (
        (re.compile(r'\bASAP\b', re.IGNORECASE), 'as soon as possible'),
        (re.compile(r'\byep\b', re.IGNORECASE), 'yes'),
        (re.compile(r'\bnope\b', re.IGNORECASE), 'no'),
        (re.compile(r'\bN/A\b', re.IGNORECASE), 'No Response'),
    )

    # Format used when a relative day ("today", "yesterday") is made absolute.
    _DATE_FORMAT = '%Y-%m-%d'

    def __init__(self, df):
        """Store a copy of *df* and capture the current calendar year.

        Args:
            df: pandas DataFrame holding the raw chatbot records.
        """
        # Copy so that the in-place column assignment in clean_field_names
        # does not leak into the caller's frame.
        self.df = df.copy()
        self.current_year = datetime.now().year

    def clean_dates(self, text):
        """Replace relative date phrases in *text* with absolute values.

        "last year" / "this year" / "next year" become the corresponding
        four-digit year (relative to ``self.current_year``); "today" and
        "yesterday" become ``YYYY-MM-DD`` dates. Matching is case-insensitive
        and restricted to whole words, so e.g. "this yearbook" is untouched.

        Args:
            text: The string to process.

        Returns:
            The string with relative date phrases substituted.
        """
        # Take one snapshot so "today" and "yesterday" cannot straddle
        # midnight within a single call.
        now = datetime.now()
        substitutions = (
            (r'\blast year\b', str(self.current_year - 1)),
            (r'\bthis year\b', str(self.current_year)),
            (r'\bnext year\b', str(self.current_year + 1)),
            (r'\btoday\b', now.strftime(self._DATE_FORMAT)),
            (
                r'\byesterday\b',
                (now - pd.Timedelta(days=1)).strftime(self._DATE_FORMAT),
            ),
        )
        for pattern, value in substitutions:
            text = re.sub(pattern, value, text, flags=re.IGNORECASE)
        return text

    def clean_text(self, text):
        """Expand common shorthand in *text* and trim surrounding whitespace.

        Args:
            text: The string to process.

        Returns:
            The string with "ASAP", "yep", "nope" and "N/A" (any casing,
            whole words only) expanded, stripped of leading and trailing
            whitespace.
        """
        for pattern, replacement in self._SHORTHAND:
            text = pattern.sub(replacement, text)
        return text.strip()

    @staticmethod
    def _format_field_name(value):
        """Return *value* as a ``', '``-separated list of trimmed names."""
        # Test for missing values BEFORE stringifying; str(NaN) is 'nan'.
        if pd.isna(value):
            return ''
        names = (piece.strip() for piece in str(value).split(','))
        # Skip empty pieces so stray or trailing commas leave no dangling
        # separators behind.
        return ', '.join(name for name in names if name)

    def _clean_cell(self, value):
        """Run ``clean_text`` then ``clean_dates`` on one cell ('' if missing)."""
        # Missing cells map to '' rather than the literal text 'nan'.
        if pd.isna(value):
            return ''
        return self.clean_dates(self.clean_text(str(value)))

    def clean_field_names(self):
        """Standardise the comma-separated values in the 'Field Name' column.

        Every entry is rewritten so that names are separated by exactly
        ``', '``; missing entries become the empty string.
        """
        self.df['Field Name'] = self.df['Field Name'].apply(
            self._format_field_name
        )

    def drop_empty_columns(self):
        """Drop every column whose values are all missing."""
        self.df = self.df.dropna(axis=1, how='all')

    def remove_duplicates(self):
        """Drop rows that repeat an earlier ('Question', 'Answer') pair."""
        self.df = self.df.drop_duplicates(subset=['Question', 'Answer'])

    def clean_responses(self):
        """Apply every cleaning step and return the cleaned frame.

        Field names are normalised, all-missing columns and duplicate
        question/answer rows are dropped, and 'Cleaned Answer' and
        'Cleaned Question' columns are added.

        Returns:
            The cleaned DataFrame (the same object as ``self.df``).
        """
        self.clean_field_names()
        self.drop_empty_columns()
        self.remove_duplicates()
        self.df['Cleaned Answer'] = self.df['Answer'].apply(self._clean_cell)
        self.df['Cleaned Question'] = self.df['Question'].apply(self._clean_cell)
        return self.df

# Example usage (`data` is a pandas DataFrame with 'Field Name', 'Question'
# and 'Answer' columns):
# cleaner = ChatbotResponseCleaner(data)
# cleaned_df = cleaner.clean_responses()
# cleaned_df.to_csv('cleaned_responses.csv', index=False)
