In [114]:
import os
import re
import pandas as pd
from PIL import Image   
from PIL.ExifTags import TAGS, GPSTAGS
import mimetypes
from datetime import datetime
import emoji
import spacy
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pymongo import MongoClient

In [85]:
# Fetch the NLTK resources used later in the notebook: the stop-word list and
# WordNet (used by clean_text via stop_words / lemmatizer) and the punkt
# tokenizer data (nltk.word_tokenize).
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akuma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akuma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\akuma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akuma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [86]:
# Phrases that identify WhatsApp system/notification entries, grouped by category.
# remove_system_messages() flattens every list below into one case-insensitive
# pattern and drops any row whose Message contains a match.
# NOTE(review): the match is a substring match, so short entries such as "left",
# "invited" and "an admin" also hit ordinary user messages containing those
# words — confirm this is intended before relying on the filter.
WHATSAPP_SYSTEM_MESSAGES = {
        "encryption": [
            "Messages and calls are end-to-end encrypted",
            "Messages are end-to-end encrypted",
            "Messages and calls are secured with end-to-end encryption",
            "This call is secured with end-to-end encryption",
        ],
        "media_placeholders": [
            "<Media omitted>",
            "<image omitted>",
            "<video omitted>",
            "<audio omitted>",
            "<document omitted>",
            "<sticker omitted>",
            "<GIF omitted>",
        ],
        "group_events": [
            "You created group",
            "You changed the group description",
            "You changed the group icon",
            "You changed the group subject to",
            "You deleted this group",
            "You joined using this group's invite link",
            "joined using this group's invite link",
            "joined using the group's invite link you shared",
            "joined the group",
            "left",
            "was removed",
            "You added",
            "You removed",
            "You invited",
            "invited",
            "You are now an admin",
            "You made",
            "an admin",
            "made you an admin",
        ],
        "disappearing_messages": [
            "You turned on disappearing messages",
            "turned on disappearing messages",
            "You turned off disappearing messages",
            "turned off disappearing messages",
            "Message disappeared",
        ],
        "message_deletion": ["This message was deleted", "You deleted this message"],
        "calls": ["Missed voice call", "Missed video call"],
        "misc": [
            "Chat with",  # sometimes appears in iOS exports
        ],
    }

In [87]:
# Module-level NLP helpers shared by clean_text(): the English stop-word set
# (a set, so membership tests are cheap) and a WordNet lemmatizer instance.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [88]:
# Input locations (relative paths): the exported chat text file passed to
# parse_whatsapp_chat() and the media folder passed to parse_media().
chats_file = "../experiments/testing.txt"
media_folder = "../experiments/testing_media"

In [89]:
def parse_whatsapp_chat(file_path):
    """Parse an exported WhatsApp chat text file into a DataFrame.

    Each user message is expected to start with ``dd/mm/yyyy, h:mm am - Sender: text``.
    Lines that do not start a new entry are treated as continuations of the
    current user message and appended to it, separated by a single space.

    Entries that carry a timestamp but no ``Sender:`` part (system notices such
    as the encryption banner or "X added Y") are skipped together with their
    continuation lines, instead of being glued onto the previous user message.

    Args:
        file_path: Path to the exported chat ``.txt`` file (UTF-8, with or
            without a byte-order mark).

    Returns:
        pandas.DataFrame with the string columns ``Date``, ``Time``, ``Sender``,
        ``Message`` and a ``Datetime`` column parsed from ``Date`` + ``Time``.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a matched date/time cannot be parsed by ``pd.to_datetime``.
    """
    # Timestamp prefix shared by every exported entry. ``\s?`` tolerates zero or one
    # whitespace character before am/pm (Python's ``\s`` also matches Unicode
    # spaces such as the narrow no-break space used by some exports).
    entry_prefix = r"^(\d{1,2}/\d{1,2}/\d{4}), (\d{1,2}:\d{2}\s?[apAP][mM]) - "
    message_re = re.compile(entry_prefix + r"([^:]+?): (.*)")
    entry_re = re.compile(entry_prefix)

    dates, times, senders, messages = [], [], [], []
    # True while the most recent entry is a user message that may continue
    # on following lines; False before the first message and after a system entry.
    in_user_message = False

    # "utf-8-sig" decodes plain UTF-8 too, but drops a leading BOM that would
    # otherwise stop the first line from matching.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for raw_line in file:
            line = raw_line.strip()
            match = message_re.match(line)
            if match:
                date, time, sender, message = match.groups()
                dates.append(date)
                # Force exactly one ASCII space before am/pm so the value always
                # matches the "%I:%M %p" format used below.
                times.append(re.sub(r"\s*([apAP][mM])$", r" \1", time))
                senders.append(sender)
                messages.append(message)
                in_user_message = True
            elif entry_re.match(line):
                # Timestamped entry without a sender: a system notice, not a
                # continuation of the previous message.
                in_user_message = False
            elif in_user_message:
                # Handle multi-line messages
                messages[-1] += " " + line

    df_chats = pd.DataFrame({
        "Date": dates,
        "Time": times,
        "Sender": senders,
        "Message": messages
    })
    df_chats["Datetime"] = pd.to_datetime(
        df_chats["Date"] + " " + df_chats["Time"], format="%d/%m/%Y %I:%M %p"
    )
    return df_chats

In [90]:
# Parse the exported chat file into a DataFrame with one row per message.
df_chats = parse_whatsapp_chat(chats_file)

In [91]:
# Bare expression: shows the parsed chats DataFrame as this cell's output.
df_chats

Unnamed: 0,Date,Time,Sender,Message,Datetime
0,14/06/2025,8:11 pm,Ashutosh Kumar Jha,Hey chirag,2025-06-14 20:11:00
1,14/06/2025,8:11 pm,Chirag Hira Mentor Me,Hii,2025-06-14 20:11:00
2,14/06/2025,8:11 pm,Ashutosh Kumar Jha,send the sheet in the group once so that every...,2025-06-14 20:11:00
3,14/06/2025,8:11 pm,Chirag Hira Mentor Me,Okay,2025-06-14 20:11:00
4,21/06/2025,8:15 pm,Chirag Hira Mentor Me,<Media omitted>,2025-06-21 20:15:00
...,...,...,...,...,...
123,25/08/2025,2:12 pm,Ashutosh Kumar Jha,"Ok. I will raise the PR, mostly today, once I ...",2025-08-25 14:12:00
124,25/08/2025,3:01 pm,Ashutosh Kumar Jha,what short of display animation should be added,2025-08-25 15:01:00
125,25/08/2025,3:02 pm,Ashutosh Kumar Jha,i mean are there any specifications or require...,2025-08-25 15:02:00
126,25/08/2025,3:15 pm,Ashutosh Kumar Jha,PR raised,2025-08-25 15:15:00


In [92]:
def extract_exif(image_path):
    """Return the EXIF metadata of an image as a ``{tag_name: value}`` dict.

    Numeric EXIF tag ids are translated to names through ``PIL.ExifTags.TAGS``;
    ids without a known name are kept as-is. This is deliberately best-effort:
    any failure (missing file, unreadable image, no EXIF support for the image
    type) results in an empty dict rather than an exception.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        dict mapping tag name (or raw tag id) to the stored value; ``{}`` when
        the image has no EXIF data or cannot be read.
    """
    try:
        # The context manager closes the underlying file handle even when
        # reading the EXIF block fails; the original left the image open.
        with Image.open(image_path) as image:
            # NOTE(review): _getexif() is a non-public Pillow method (leading
            # underscore); image types without it end up in the except below.
            exif_data = image._getexif()
        if not exif_data:
            return {}
        return {TAGS.get(tag, tag): value for tag, value in exif_data.items()}
    except Exception:
        return {}

In [93]:
def _parse_exif_datetime(value):
    """Convert an EXIF ``DateTime`` value ("YYYY:MM:DD HH:MM:SS") to ``datetime``, or None."""
    if value is None:
        return None
    if isinstance(value, bytes):
        value = value.decode("ascii", errors="ignore")
    try:
        # Some writers pad the field with NUL bytes or spaces.
        return datetime.strptime(str(value).strip("\x00 "), "%Y:%m:%d %H:%M:%S")
    except ValueError:
        return None


# Main parse_media function
def parse_media(media_folder):
    """Walk ``media_folder`` recursively and describe every file found.

    For each file the MIME type is guessed from its name and the size is read
    from disk. For images, ``extract_exif`` supplies the capture time
    (``DateTime``), camera ``Model`` and raw ``GPSInfo`` when present. When no
    usable EXIF timestamp exists, the file's modification time is used.

    Args:
        media_folder: Root folder containing the exported media files.

    Returns:
        pandas.DataFrame with the columns ``FileName`` (base name), ``Type``
        (MIME type or ``"unknown"``), ``SizeKB``, ``Datetime`` (datetime dtype),
        ``Camera`` and ``GPS`` (value of the EXIF ``GPSInfo`` entry, undecoded).
    """
    file_names, file_types, sizes, datetimes, cameras, gps_data = [], [], [], [], [], []

    for root, _, files in os.walk(media_folder):
        for file in files:
            file_path = os.path.join(root, file)
            mime_type, _encoding = mimetypes.guess_type(file_path)
            file_size = os.path.getsize(file_path) / 1024  # KB

            # Defaults
            file_datetime, camera, gps = None, None, None

            if mime_type and mime_type.startswith("image"):
                exif = extract_exif(file_path)
                # EXIF timestamps use colons in the date part, so they are parsed
                # explicitly instead of being left to generic date inference.
                file_datetime = _parse_exif_datetime(exif.get("DateTime"))
                camera = exif.get("Model")
                gps = exif.get("GPSInfo")

            if file_datetime is None:
                # Fall back to the modification time, using the full path found by
                # os.walk so that files inside sub-folders resolve correctly.
                file_datetime = datetime.fromtimestamp(os.path.getmtime(file_path))

            file_names.append(file)
            file_types.append(mime_type if mime_type else "unknown")
            sizes.append(file_size)
            datetimes.append(file_datetime)
            cameras.append(camera)
            gps_data.append(gps)

    # Create DataFrame
    df_media = pd.DataFrame(
        {
            "FileName": file_names,
            "Type": file_types,
            "SizeKB": sizes,
            "Datetime": datetimes,
            "Camera": cameras,
            "GPS": gps_data,
        }
    )

    # Ensure all values are datetime dtype (also for an empty folder)
    df_media["Datetime"] = pd.to_datetime(df_media["Datetime"], errors="coerce")

    return df_media

In [94]:
# Build the media inventory (one row per file found under media_folder).
df_media = parse_media(media_folder)

In [95]:
# Bare expression: shows the media inventory DataFrame as this cell's output.
df_media

Unnamed: 0,FileName,Type,SizeKB,Datetime,Camera,GPS
0,2 contacts.vcf,text/x-vcard,0.320312,2025-08-25 16:19:39.073464,,
1,Aditya BH1.vcf,text/x-vcard,0.183594,2025-08-25 16:19:39.081510,,
2,Flowchart.pdf,application/pdf,16.69043,2025-08-25 16:09:49.913312,,
3,IMG-20240526-WA0002.jpg,image/jpeg,112.94043,2025-08-25 16:19:39.087643,,
4,IMG-20250801-WA0011.jpg,image/jpeg,81.517578,2025-08-25 16:19:39.151319,,
5,property paper with layout plan.pdf,application/pdf,124.858398,2025-08-25 16:19:39.156314,,
6,Use case diagram (1).pdf,application/pdf,21.269531,2025-08-25 16:09:49.908311,,
7,Use case diagram.pdf,application/pdf,19.423828,2025-08-25 16:09:49.902309,,
8,VID-20250301-WA0004.mp4,video/mp4,7907.033203,2025-08-25 16:19:39.142306,,


In [96]:
def remove_system_messages(df, system_messages=None):
    """Drop rows whose ``Message`` contains a known WhatsApp system phrase.

    Matching is a case-insensitive *literal* substring search: every phrase is
    regex-escaped before being combined, so characters such as ``+``, ``(`` or
    ``.`` in a phrase are not interpreted as regex syntax. Rows with a missing
    ``Message`` are kept.

    Args:
        df: DataFrame with a ``Message`` column.
        system_messages: Optional mapping of category -> list of phrases.
            Defaults to the notebook-level ``WHATSAPP_SYSTEM_MESSAGES``.

    Returns:
        A new DataFrame without the matching rows (original index preserved).
        With no phrases to match, an unfiltered copy of ``df`` is returned.
    """
    if system_messages is None:
        system_messages = WHATSAPP_SYSTEM_MESSAGES

    phrases = [
        phrase
        for group in system_messages.values()
        for phrase in group
        if phrase
    ]
    if not phrases:
        # An empty alternation would match every string and drop all rows.
        return df.copy()

    pattern = "|".join(re.escape(phrase) for phrase in phrases)
    is_system = df["Message"].str.contains(pattern, case=False, na=False, regex=True)
    return df[~is_system]

In [97]:
def clean_text(text: str):
    """Normalise a chat message into a space-joined string of lemmatised tokens.

    Steps, in order: strip emojis, lowercase, delete ASCII punctuation, delete
    digit runs, tokenize with NLTK, discard tokens found in ``stop_words`` and
    lemmatise the remaining ones with the shared ``lemmatizer``.
    """
    without_emoji = emoji.replace_emoji(text, replace="")
    lowered = without_emoji.lower()

    # Translation table that deletes every character in string.punctuation.
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = lowered.translate(punctuation_table)

    without_digits = re.sub(r"\d+", "", without_punctuation)

    kept_tokens = []
    for token in nltk.word_tokenize(without_digits):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(kept_tokens)

In [98]:
def preprocess_chats(df_chats):
    """Add cleaned text and simple per-message features to the chats DataFrame.

    The columns are added to ``df_chats`` in place and the same object is
    returned. Messages are coerced to ``str`` once, so every text feature
    treats non-string values the same way.

    Added columns:
        CleanMessage: output of ``clean_text`` for the message.
        WordCount / CharCount: word and character counts of ``CleanMessage``.
        EmojiCount: number of emojis in the raw message, counted with
            ``emoji.emoji_count`` so that multi-code-point emojis (flags,
            skin-tone or ZWJ sequences) count once each.
        IsQuestion: True when the stripped raw message ends with "?".
        HourOfDay / DayOfWeek: derived from the ``Datetime`` column.

    Args:
        df_chats: DataFrame with ``Message`` and datetime-typed ``Datetime`` columns.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    raw_messages = df_chats["Message"].astype(str)

    # Clean text
    df_chats["CleanMessage"] = raw_messages.apply(clean_text)

    # Feature engineering
    df_chats["WordCount"] = df_chats["CleanMessage"].apply(lambda x: len(x.split()))
    df_chats["CharCount"] = df_chats["CleanMessage"].apply(len)
    # Counting code points that appear in EMOJI_DATA miscounts emojis that are
    # made of several code points; emoji_count works on whole emoji sequences.
    df_chats["EmojiCount"] = raw_messages.apply(emoji.emoji_count)
    df_chats["IsQuestion"] = raw_messages.apply(lambda x: x.strip().endswith("?"))
    df_chats["HourOfDay"] = df_chats["Datetime"].dt.hour
    df_chats["DayOfWeek"] = df_chats["Datetime"].dt.day_name()

    return df_chats

In [99]:
# Add CleanMessage and the derived feature columns to the chats DataFrame.
df_chats = preprocess_chats(df_chats)

In [100]:
# Bare expression: shows the chats DataFrame, now including the feature columns.
df_chats

Unnamed: 0,Date,Time,Sender,Message,Datetime,CleanMessage,WordCount,CharCount,EmojiCount,IsQuestion,HourOfDay,DayOfWeek
0,14/06/2025,8:11 pm,Ashutosh Kumar Jha,Hey chirag,2025-06-14 20:11:00,hey chirag,2,10,0,False,20,Saturday
1,14/06/2025,8:11 pm,Chirag Hira Mentor Me,Hii,2025-06-14 20:11:00,hii,1,3,0,False,20,Saturday
2,14/06/2025,8:11 pm,Ashutosh Kumar Jha,send the sheet in the group once so that every...,2025-06-14 20:11:00,send sheet group everyone u see extract idea,8,44,0,False,20,Saturday
3,14/06/2025,8:11 pm,Chirag Hira Mentor Me,Okay,2025-06-14 20:11:00,okay,1,4,0,False,20,Saturday
4,21/06/2025,8:15 pm,Chirag Hira Mentor Me,<Media omitted>,2025-06-21 20:15:00,medium omitted,2,14,0,False,20,Saturday
...,...,...,...,...,...,...,...,...,...,...,...,...
123,25/08/2025,2:12 pm,Ashutosh Kumar Jha,"Ok. I will raise the PR, mostly today, once I ...",2025-08-25 14:12:00,ok raise pr mostly today done thing,7,35,0,False,14,Monday
124,25/08/2025,3:01 pm,Ashutosh Kumar Jha,what short of display animation should be added,2025-08-25 15:01:00,short display animation added,4,29,0,False,15,Monday
125,25/08/2025,3:02 pm,Ashutosh Kumar Jha,i mean are there any specifications or require...,2025-08-25 15:02:00,mean specification requirement,3,30,0,False,15,Monday
126,25/08/2025,3:15 pm,Ashutosh Kumar Jha,PR raised,2025-08-25 15:15:00,pr raised,2,9,0,False,15,Monday


In [105]:
def categorize_media(filename, mime_type):
    """Classify a media file as Image, Video, Audio, Document, Contact or Other.

    The decision uses the lowercased text after the last "." in ``filename``
    (the whole name when it contains no dot) and the string form of
    ``mime_type``. Rules are evaluated top to bottom and the first match wins.
    """
    extension = filename.rsplit(".", 1)[-1].lower()
    mime_text = str(mime_type)

    # (category, MIME keyword or None, extensions that imply the category)
    rules = (
        ("Image", "image", ("jpg", "jpeg", "png", "webp", "heic", "gif")),
        ("Video", "video", ("mp4", "mov", "3gp")),
        ("Audio", "audio", ("mp3", "aac", "m4a", "opus", "wav", "amr")),
        (
            "Document",
            None,
            (
                "pdf", "doc", "docx", "xls", "xlsx", "ppt",
                "pptx", "txt", "csv", "zip", "rar", "apk",
            ),
        ),
        ("Contact", None, ("vcf",)),
    )

    for category, mime_keyword, extensions in rules:
        mime_hit = mime_keyword is not None and mime_keyword in mime_text
        if mime_hit or extension in extensions:
            return category
    return "Other"

In [108]:
def preprocess_media(df_media, media_folder):
    """Add category and convenience feature columns to the media DataFrame.

    The columns are added to ``df_media`` in place and the same object is
    returned. Existing ``Datetime`` values are kept; only missing ones are
    filled from the file's modification time, so a timestamp already present
    in the frame is not overwritten.

    Added columns:
        Category: result of ``categorize_media`` for the file name / MIME type.
        FileExtension: lowercased text after the last "." of ``FileName``.
        IsLargeFile: True when ``SizeKB`` exceeds 1024.
        HasGPS / HasCameraInfo: True when ``GPS`` / ``Camera`` is not missing.

    Args:
        df_media: DataFrame with ``FileName``, ``Type``, ``SizeKB``,
            ``Datetime``, ``GPS`` and ``Camera`` columns.
        media_folder: Folder used to resolve ``FileName`` for the timestamp
            fallback; names that do not exist directly in it are left untouched.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    # Category (a list comprehension also works for an empty frame, where
    # DataFrame.apply(axis=1) would hand back a DataFrame instead of a Series)
    df_media["Category"] = [
        categorize_media(name, mime_type)
        for name, mime_type in zip(df_media["FileName"], df_media["Type"])
    ]

    # Datetime from file, only where no timestamp is available yet
    missing_datetime = df_media["Datetime"].isna()
    for idx in df_media.index[missing_datetime]:
        file_path = os.path.join(media_folder, df_media.at[idx, "FileName"])
        if os.path.exists(file_path):
            df_media.at[idx, "Datetime"] = datetime.fromtimestamp(
                os.path.getmtime(file_path)
            )

    # Extra Features
    df_media["FileExtension"] = df_media["FileName"].apply(
        lambda x: x.split(".")[-1].lower()
    )
    df_media["IsLargeFile"] = df_media["SizeKB"] > 1024
    df_media["HasGPS"] = df_media["GPS"].notna()
    df_media["HasCameraInfo"] = df_media["Camera"].notna()

    return df_media

In [109]:
# Add Category and the derived feature columns to the media DataFrame.
df_media = preprocess_media(df_media,media_folder)

In [110]:
# Bare expression: shows the media DataFrame, now including the feature columns.
df_media

Unnamed: 0,FileName,Type,SizeKB,Datetime,Camera,GPS,Category,FileExtension,IsLargeFile,HasGPS,HasCameraInfo
0,2 contacts.vcf,text/x-vcard,0.320312,2025-08-25 16:19:39.073464,,,Contact,vcf,False,False,False
1,Aditya BH1.vcf,text/x-vcard,0.183594,2025-08-25 16:19:39.081510,,,Contact,vcf,False,False,False
2,Flowchart.pdf,application/pdf,16.69043,2025-08-25 16:09:49.913312,,,Document,pdf,False,False,False
3,IMG-20240526-WA0002.jpg,image/jpeg,112.94043,2025-08-25 16:19:39.087643,,,Image,jpg,False,False,False
4,IMG-20250801-WA0011.jpg,image/jpeg,81.517578,2025-08-25 16:19:39.151319,,,Image,jpg,False,False,False
5,property paper with layout plan.pdf,application/pdf,124.858398,2025-08-25 16:19:39.156314,,,Document,pdf,False,False,False
6,Use case diagram (1).pdf,application/pdf,21.269531,2025-08-25 16:09:49.908311,,,Document,pdf,False,False,False
7,Use case diagram.pdf,application/pdf,19.423828,2025-08-25 16:09:49.902309,,,Document,pdf,False,False,False
8,VID-20250301-WA0004.mp4,video/mp4,7907.033203,2025-08-25 16:19:39.142306,,,Video,mp4,True,False,False


In [111]:
def link_media_to_chat(media_df, chat_df, window_minutes=5):
    """Attach to each media row the chat messages sent around its timestamp.

    For every row of ``media_df`` the messages of ``chat_df`` whose ``Datetime``
    lies within ``window_minutes`` before or after the media ``Datetime``
    (both bounds inclusive) are collected into a list.

    The lists are stored in a new ``LinkedChats`` column that is written to
    ``media_df`` itself; the same object is returned.
    """
    half_window = pd.Timedelta(minutes=window_minutes)
    chat_times = chat_df["Datetime"]

    def messages_around(moment):
        # Inclusive window [moment - half_window, moment + half_window].
        not_too_early = chat_times >= moment - half_window
        not_too_late = chat_times <= moment + half_window
        return chat_df[not_too_early & not_too_late]["Message"].tolist()

    media_df["LinkedChats"] = [
        messages_around(moment) for moment in media_df["Datetime"]
    ]
    return media_df

In [112]:
# Link each media file to the chat messages within the default 5-minute window.
# link_media_to_chat writes LinkedChats into df_media and returns that same
# object, so df_linked and df_media refer to the same DataFrame.
df_linked = link_media_to_chat(df_media,df_chats)

In [113]:
# Bare expression: shows the media DataFrame with the LinkedChats column.
df_linked

Unnamed: 0,FileName,Type,SizeKB,Datetime,Camera,GPS,Category,FileExtension,IsLargeFile,HasGPS,HasCameraInfo,LinkedChats
0,2 contacts.vcf,text/x-vcard,0.320312,2025-08-25 16:19:39.073464,,,Contact,vcf,False,False,False,[]
1,Aditya BH1.vcf,text/x-vcard,0.183594,2025-08-25 16:19:39.081510,,,Contact,vcf,False,False,False,[]
2,Flowchart.pdf,application/pdf,16.69043,2025-08-25 16:09:49.913312,,,Document,pdf,False,False,False,[]
3,IMG-20240526-WA0002.jpg,image/jpeg,112.94043,2025-08-25 16:19:39.087643,,,Image,jpg,False,False,False,[]
4,IMG-20250801-WA0011.jpg,image/jpeg,81.517578,2025-08-25 16:19:39.151319,,,Image,jpg,False,False,False,[]
5,property paper with layout plan.pdf,application/pdf,124.858398,2025-08-25 16:19:39.156314,,,Document,pdf,False,False,False,[]
6,Use case diagram (1).pdf,application/pdf,21.269531,2025-08-25 16:09:49.908311,,,Document,pdf,False,False,False,[]
7,Use case diagram.pdf,application/pdf,19.423828,2025-08-25 16:09:49.902309,,,Document,pdf,False,False,False,[]
8,VID-20250301-WA0004.mp4,video/mp4,7907.033203,2025-08-25 16:19:39.142306,,,Video,mp4,True,False,False,[]


In [115]:
## Dumping Pipeline
# Connection URI for a MongoDB server on localhost, default port 27017.
# NOTE(review): despite the name, mongo_ssh holds a plain mongodb:// URI.
mongo_ssh = "mongodb://localhost:27017/"
client = MongoClient(mongo_ssh)

In [116]:
# Database handle that holds the collections used below.
db = client["whatsapp_analysis"]

In [117]:
# Target collection for the preprocessed chats DataFrame (df_chats).
collection1 = db["preprocessed_chats_1"]

In [118]:
# Target collection for the media DataFrame (df_media).
# NOTE(review): the collection is named "preprocessed_chats_2" although it
# receives media rows — confirm the name is intentional.
collection2 = db["preprocessed_chats_2"]

In [122]:
def get_collection_schema(collection, sample_size=1000):
    """Return the set of top-level field names seen in a sample of documents.

    Up to ``sample_size`` documents are requested from ``collection.find`` and
    the keys of every returned document are merged into one set.
    """
    # The limit keeps the scan bounded on large collections.
    sampled_docs = collection.find({}, limit=sample_size)
    return {field for document in sampled_docs for field in document.keys()}

In [119]:
def dump_dataframe_to_mongo(df: pd.DataFrame, collection):
    """Insert every row of ``df`` into ``collection`` as one document per row.

    Cells holding ``pd.NaT`` or ``pd.NA`` are stored as ``None``, because those
    pandas sentinels cannot be encoded by the MongoDB driver. All other values
    (including float NaN) are passed through unchanged. Nothing is inserted
    for an empty DataFrame.

    Args:
        df: DataFrame to store; column labels become document keys.
        collection: Object exposing ``insert_many(list_of_dicts)``, such as a
            pymongo collection.

    Returns:
        None. Calling the function again inserts the rows again.
    """

    def _encodable(value):
        # Identity checks only, so list/dict cells are never passed to pd.isna.
        if value is pd.NaT or value is pd.NA:
            return None
        return value

    # Convert DataFrame → list of dicts
    records = [
        {key: _encodable(value) for key, value in record.items()}
        for record in df.to_dict("records")
    ]
    if records:
        collection.insert_many(records)

In [120]:
# Store the chats in collection1 and the media rows in collection2.
# NOTE(review): each call inserts via insert_many, so re-running this cell
# presumably adds the same rows again — confirm before re-executing.
dump_dataframe_to_mongo(df_chats, collection1)
dump_dataframe_to_mongo(df_media, collection2)

In [125]:
# Field names found in (a sample of) the chats collection.
schema1 = get_collection_schema(collection1)

In [126]:
# Bare expression: shows the set of chat document fields as this cell's output.
schema1

{'CharCount',
 'CleanMessage',
 'Date',
 'Datetime',
 'DayOfWeek',
 'EmojiCount',
 'HourOfDay',
 'IsQuestion',
 'Message',
 'Sender',
 'Time',
 'WordCount',
 '_id'}

In [128]:
# Field names found in (a sample of) the media collection.
schema2 = get_collection_schema(collection2)

In [129]:
# Bare expression: shows the set of media document fields as this cell's output.
schema2

{'Camera',
 'Category',
 'Datetime',
 'FileExtension',
 'FileName',
 'GPS',
 'HasCameraInfo',
 'HasGPS',
 'IsLargeFile',
 'LinkedChats',
 'SizeKB',
 'Type',
 '_id'}