In [None]:
import pandas as pd
import re

# Read the whole chat export into memory as a single string.
try:
    with open("xyz.txt", "r", encoding="utf-8") as file:
        chat_data = file.read()
except FileNotFoundError as exc:
    # Falling back to chat_data = None only postponed the failure: the regex
    # calls that consume chat_data raise TypeError on None. Stop here instead,
    # with a message that names the missing file.
    raise FileNotFoundError(
        'Chat export "xyz.txt" was not found; place it next to this notebook '
        'or update the path before running the remaining cells.'
    ) from exc

# Every message is introduced by a stamp ending in "am - " or "pm - "; the
# character in front of am/pm is a narrow no-break space (U+202F), not a
# regular space.
date_time_pattern = r'\d{1,2}/\d{1,2}/\d{4}, \d{1,2}:\d{2}\u202f[ap]m - '

# split() returns whatever precedes the first stamp at index 0; dropping it
# keeps the message bodies aligned one-to-one with the stamps from findall().
msg_list = re.split(date_time_pattern, chat_data)[1:]
date_list = re.findall(date_time_pattern, chat_data)

# Remove the U+202F and upper-case the meridiem so each stamp matches the
# strptime format applied below.
cleaned_dates = [
    stamp.replace('\u202f', '').replace('am', 'AM').replace('pm', 'PM')
    for stamp in date_list
]

# Build the frame, parse the day-first 12-hour stamps into datetimes, and
# expose the parsed column under the name 'date'.
df = (
    pd.DataFrame({'msg_raw': msg_list, 'dt_raw': cleaned_dates})
    .assign(dt_raw=lambda frame: pd.to_datetime(frame['dt_raw'], format='%d/%m/%Y, %I:%M%p - '))
    .rename(columns={'dt_raw': 'date'})
)
# Separate user and message.
# A user message has the shape "<sender>: <text>". Entries without any ": " are
# kept whole and attributed to the placeholder user 'Group_Message'.
user_names = []
actual_msgs = []
for msg in df['msg_raw']:
    # maxsplit=1 cuts only at the first ": ". Without it, every later ": "
    # inside the text was treated as another separator and the pieces were
    # re-joined with spaces, so "Alice: note: hi" produced the text " note hi".
    split_msg = re.split(r'([\w\W]+?): ', msg, maxsplit=1)
    if len(split_msg) > 2:
        # With one capture group and maxsplit=1 the result is
        # [prefix, sender, rest-of-message].
        user_names.append(split_msg[1])
        actual_msgs.append(split_msg[2])
    else:
        user_names.append('Group_Message')
        actual_msgs.append(msg)
df['user'] = user_names
df['message'] = actual_msgs
# The raw text is no longer needed once it has been split into user/message.
df = df.drop(columns=['msg_raw'])
# Break the parsed timestamp into separate calendar/time columns.
df = df.assign(
    year=df['date'].dt.year,
    month=df['date'].dt.month_name(),
    day=df['date'].dt.day,
    hour=df['date'].dt.hour,
    minute=df['date'].dt.minute,
    day_name=df['date'].dt.day_name(),
)


In [None]:
# Show the chat DataFrame as this cell's output.
df

In [None]:
# Number of rows whose message text is exactly the media placeholder.
int((df['message'] == '<Media omitted>\n').sum())


In [None]:
#---------------- MOST ACTIVE USERS ----------------
# Message count per sender, leaving out the 'Group_Message' placeholder and
# listing the busiest senders first.
user_activity = (
    df['user']
    .value_counts()
    .rename_axis('user')
    .reset_index(name='messages')
    .loc[lambda counts: counts['user'] != 'Group_Message']
    .sort_values(by='messages', ascending=False)
)
user_activity


In [None]:
# Keep only rows written by a real sender whose text is neither the media
# placeholder nor the deleted-message notice.
temp = df[
    ~df['message'].isin(['<Media omitted>\n', 'This message was deleted\n'])
    & (df['user'] != 'Group_Message')
]

In [None]:
# Load the stop-word file as one string. The context manager closes the handle
# (the bare open() call never did), and the encoding is stated explicitly
# instead of being left to the platform default.
with open("stop_words.txt", 'r', encoding='utf-8') as f:
    stop_words = f.read()

In [None]:
# Collect every token from the filtered messages that is not a stop word.
#
# stop_words holds the raw text of the stop-word file, so testing
# `token in stop_words` is a substring search: any token that happens to occur
# inside a stop word is dropped too. Build a set of whole words for exact
# matching instead.
# NOTE(review): assumes the file separates words by whitespace and/or commas -
# confirm against stop_words.txt.
stop_word_set = set(stop_words.lower().replace(',', ' ').split())

msg = []

for message in temp['message']:
    for word in message.split():
        token = word.lower().strip('.,!?()[]{}"\'')
        # Words made only of punctuation strip down to ''; the old substring
        # test dropped those implicitly, so keep skipping them explicitly.
        if token and token not in stop_word_set:
            msg.append(token)

In [None]:
from collections import Counter

# The 50 most frequent tokens with their counts, as a two-column table.
top_words = Counter(msg).most_common(50)
word_counts = pd.DataFrame(top_words, columns=['word', 'count'])
word_counts

In [None]:
!pip install emoji

In [None]:
import emoji

# Collect every emoji occurrence across all messages.
#
# emoji.emoji_list() reports complete emoji, including multi-code-point ones
# (flags, skin-tone variants, ZWJ sequences). Checking one character at a time
# against EMOJI_DATA split those into their components or missed them.
# The loop variable is named `text` so it does not overwrite the `msg` token
# list that the word-frequency cells build and read.
emoji_list = []
for text in df['message']:
    emoji_list.extend(found['emoji'] for found in emoji.emoji_list(text))


In [None]:
# One row per emoji occurrence, then a frequency table; show the top 20.
emoji_df = pd.DataFrame(emoji_list, columns=['emoji'])
emoji_counts = (
    emoji_df['emoji']
    .value_counts()
    .rename_axis('emoji')
    .reset_index(name='count')
)

emoji_counts.head(20)

In [None]:
# Show the chat DataFrame again as this cell's output.
df

In [None]:
# Activity grid: message count per weekday (rows) and one-hour slot (columns).
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# All 24 slots in chronological order: '0-1', '1-2', ..., '23-24'.
hour_periods = [f'{hour}-{hour + 1}' for hour in range(24)]

# Label each message with its hour slot, e.g. hour 14 -> '14-15'.
df['period'] = df['hour'].astype(str) + '-' + (df['hour'] + 1).astype(str)

user_heatmap = df.pivot_table(
    index='day_name',
    columns='period',
    values='message',
    aggfunc='count',
    fill_value=0
)
# Fix both axes to their full, ordered ranges. Weekdays without messages were
# already zero-filled, but hours without messages used to be missing from the
# grid; reindexing the columns gives every day the same 24 slots and replaces
# the separate numeric sort of the period labels.
user_heatmap = user_heatmap.reindex(index=day_order, columns=hour_periods, fill_value=0)
user_heatmap