In [4]:
import re
import pandas as pd

In [8]:
# Parse the exported WhatsApp chat into `df`, one row per message, with columns:
#   user, message, year, month, day, hour, minute
# Every message in the export starts on its own line as
#   "<d/m/y>, <H:MM> - <sender>: <text>"   (user message)
#   "<d/m/y>, <H:MM> - <text>"             (system / group notification)
# and its text may continue over the following lines until the next such header.

with open("data/WhatsApp Chat with Po po po po po.txt", encoding="utf-8") as f:
    data = f.read()

# (?m): `^` anchors at every line start, so a timestamp quoted in the middle of
#       a line is not mistaken for the start of a new message.
# (?s): `.` spans newlines, so the lazy body group keeps the continuation lines
#       of a multi-line message; the lookahead ends the body right before the
#       next message header (or at the end of the file).
pattern = (
    r"(?ms)^(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}) - (.*?)"
    r"(?=\n\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2} - |\Z)"
)
matches = re.findall(pattern, data)
if not matches:
    # Fail early with a clear message instead of an opaque `.dt` accessor error below.
    raise ValueError(
        "No messages matched the expected '<d/m/y>, <H:MM> - ...' layout; "
        "check the date/time format of the chat export."
    )

users = []
messages = []

for _, _, chat in matches:
    # Classify on the header line only: a colon that appears in a later line
    # must not decide whether this is a user message.
    header_line = chat.split('\n', 1)[0]

    if ':' in header_line:
        # Normal user message: everything before the first colon is the sender.
        user, message = chat.split(':', 1)
        users.append(user.strip())
        messages.append(message.strip())
    else:
        # System/group message (e.g., "User added X", "You created group", etc.)
        users.append("group_notification")
        messages.append(chat.strip())

# Parse each column once (vectorised) instead of calling pd.to_datetime per row.
dates = pd.to_datetime(pd.Series([m[0] for m in matches], dtype="object"), dayfirst=True)
times = pd.to_datetime(pd.Series([m[1] for m in matches], dtype="object"), format='%H:%M')

df = pd.DataFrame({
    'user': users,
    'message': messages,
})

# Calendar / clock components; day, hour and minute are zero-padded strings.
df['year'] = dates.dt.year
df['month'] = dates.dt.month_name()
df['day'] = dates.dt.day.astype(str).str.zfill(2)
df['hour'] = times.dt.hour.astype(str).str.zfill(2)
df['minute'] = times.dt.minute.astype(str).str.zfill(2)

print(df.shape)
print(df.head(10))

(2182, 7)
                 user                                            message  \
0  group_notification  Messages and calls are end-to-end encrypted. O...   
1  group_notification     Lombrata CS created group "Hardware Design 💦😋"   
2  group_notification                              Lombrata CS added you   
3         Lombrata CS                                    <Media omitted>   
4          Arjeesh CS                                Ekta jinish dekhbi?   
5          Arjeesh CS                                    <Media omitted>   
6          Arjeesh CS                                          Ei korlam   
7         Lombrata CS                                  Eta toh exp 1 toh   
8          Arjeesh CS                     555 timer: Condition - Astable   
9          Arjeesh CS                                    <Media omitted>   

   year    month day hour minute  
0  2025  January  07   15     25  
1  2025  January  07   15     25  
2  2025  January  07   15     25  
3  2025  Janu