<a href="https://colab.research.google.com/github/abhiruchipb/whatsappchatanalyzer/blob/main/whatsapp1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import emoji
from collections import Counter


In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
def load_stopwords(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return set(f.read().splitlines())

In [None]:
def preprocess_chat(file_path, stopwords_file):
    stopwords = load_stopwords(stopwords_file)
    user_messages = Counter()
    user_first_texts = Counter()
    word_counter = Counter()
    emoji_counter = Counter()

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_messages = []
    message_pattern = re.compile(r'(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}) - (.*?): (.*)')
    system_messages = ["Messages and calls are end-to-end encrypted", "media omitted", "changed the subject"]

    last_date = None
    for line in lines:
        match = message_pattern.match(line)
        if match:
            timestamp, user, message = match.groups()
            date = timestamp.split(',')[0]

            if any(sys_msg in message.lower() for sys_msg in system_messages):
                continue  # Skip system messages

            user_messages[user] += 1
            words = re.findall(r'\b\w+\b', message.lower())
            words = [word for word in words if word not in stopwords]
            word_counter.update(words)

            emojis = [char for char in message if char in emoji.EMOJI_DATA]
            emoji_counter.update(emojis)

            cleaned_messages.append((timestamp, user, " ".join(words)))

            if date != last_date:
                user_first_texts[user] += 1
                last_date = date

    total_messages = sum(user_messages.values())
    user_percentages = {user: round((count / total_messages) * 100, 2) for user, count in user_messages.items()}
    most_common_words = [word for word, _ in word_counter.most_common(5)]
    most_common_emojis = [emoji for emoji, _ in emoji_counter.most_common(5)]
    most_first_texts_user = max(user_first_texts, key=user_first_texts.get, default=None)

    return user_percentages, most_common_words, most_common_emojis, most_first_texts_user


In [None]:

# Usage
chat_file = "WhatsApp Chat with batman.txt"
stopwords_file = "stopwords.txt"
user_percentages, most_common_words, most_common_emojis, most_first_texts_user = preprocess_chat(chat_file, stopwords_file)

print("User Activity Percentages:")
for user, percentage in user_percentages.items():
    print(f"{user}: {percentage}%")
print("Most Common Words:", most_common_words)
print("Most Used Emojis:", most_common_emojis)
print("User Who Texts First the Most:", most_first_texts_user)

User Activity Percentages:
batman: 46.9%
abhiruchi patil bhagat: 53.1%
Most Common Words: ['https', 'honey', 'morning', 'pookie', 'don']
Most Used Emojis: ['😭', '🥹', '🏼', '😘', '🫶']
User Who Texts First the Most: abhiruchi patil bhagat
