In [1]:
import pandas as pd
import re
from datetime import datetime

In [2]:
class ChatParser:
    '''
    Parse an exported WhatsApp chat text file into a pandas DataFrame.

    Typical flow: parse_chat() -> clean_data() -> export_to_csv().
    The parsed (and later cleaned) frame is kept in ``self.messages``.
    '''

    # A user message line: "DD/MM/YYYY, HH:MM - Sender: text".
    _MESSAGE_RE = re.compile(r'(\d{2}/\d{2}/\d{4}, \d{2}:\d{2}) - (.+?): (.+)')
    # Any line that opens a new chat entry (user message or system notice).
    _ENTRY_PREFIX_RE = re.compile(r'\d{2}/\d{2}/\d{4}, \d{2}:\d{2} - ')
    _TIMESTAMP_FORMAT = '%d/%m/%Y, %H:%M'
    # Fixed column order so that even an empty chat yields a usable frame.
    _COLUMNS = ['timestamp', 'sender', 'message', 'message_length']

    def __init__(self, file_path):
        '''
        Initialize the parser with the file path of the chat text file.
        file_path: Path to the WhatsApp chat text file.
        '''
        self.file_path = file_path
        self.messages = None

    def parse_chat(self):
        '''
        Parse the WhatsApp chat text file.
        Extracts timestamp, sender, message, and message length and returns a DataFrame.

        Lines that do not start with a timestamp are treated as continuation
        lines of the preceding message and joined to it with a newline.
        Timestamped lines without a "Sender: text" part (system notices) are
        skipped. Lines whose timestamp cannot be parsed are reported and skipped.

        Returns: DataFrame with columns timestamp, sender, message and
        message_length (length of the stored, stripped message). The columns
        are present even when no message was found.
        '''
        parsed = []
        current = None  # entry that following continuation lines belong to

        # 'utf-8-sig' also reads plain UTF-8 but drops a leading BOM, which
        # would otherwise stop the first line from matching the pattern.
        with open(self.file_path, 'r', encoding='utf-8-sig') as file:
            for raw_line in file:
                line = raw_line.strip()
                match = self._MESSAGE_RE.match(line)
                if match:
                    timestamp_str, sender, text = match.groups()
                    try:
                        timestamp = datetime.strptime(timestamp_str, self._TIMESTAMP_FORMAT)
                    except ValueError:
                        print(f"Skipping invalid line: {raw_line}")
                        # Do not attach following lines to an older message.
                        current = None
                        continue
                    current = {'timestamp': timestamp, 'sender': sender, 'lines': [text]}
                    parsed.append(current)
                elif self._ENTRY_PREFIX_RE.match(line):
                    # System notice (no sender): skip it and end the current message.
                    current = None
                elif current is not None:
                    current['lines'].append(line)

        rows = []
        for entry in parsed:
            text = '\n'.join(entry['lines']).strip()
            rows.append({
                'timestamp': entry['timestamp'],
                'sender': entry['sender'],
                'message': text,
                'message_length': len(text)
            })

        self.messages = pd.DataFrame(rows, columns=self._COLUMNS)
        return self.messages

    def clean_data(self):
        '''
        Clean the parsed chat data by removing unwanted rows.
        Removes rows whose message is exactly one of the placeholder texts
        listed below (omitted media, deleted messages, missed calls, 'null').

        Returns: the cleaned DataFrame, which also replaces self.messages.
        Raises: ValueError if parse_chat() has not been called yet.
        '''
        if self.messages is None:
            raise ValueError('Chat data is not parsed yet. Please call parse_chat() first.')

        unwanted_messages = [
            '<Media omitted>',
            'This message was deleted',
            'You deleted this message',
            'Missed voice call',
            'Missed video call',
            'null'
        ]

        keep = ~self.messages['message'].isin(unwanted_messages)
        # .copy() detaches the result from the parsed frame so later column
        # assignments by callers do not act on a slice of it.
        self.messages = self.messages[keep].copy()
        return self.messages

    def export_to_csv(self, output_path):
        '''
        Export the current chat data to a CSV file (without the index column).
        output_path: Path to save the CSV file.

        Raises: ValueError if there is no parsed data to export.
        '''
        if self.messages is None:
            raise ValueError('No data to export. Please ensure the chat data is parsed and cleaned first.')

        self.messages.to_csv(output_path, index=False)
        print(f'Data exported successfully to {output_path}')

In [3]:
# Example usage: run the full parse -> clean -> export pipeline once.
# Input chat export and output CSV location.
chat_file = 'WhatsApp_Chat.txt'
csv_file = 'Whatsapp_Chat.csv'

# Build a parser bound to the exported chat file.
parser = ChatParser(chat_file)

# Read the text file into a DataFrame of messages.
messages = parser.parse_chat()

# Drop placeholder rows that carry no analysable text.
cleaned_messages = parser.clean_data()

# Write the parser's current (cleaned) data to disk as CSV.
parser.export_to_csv(csv_file)

Data exported successfully to Whatsapp_Chat.csv
