WhatsApp is a great source of data for analysing patterns and relationships between two or more people chatting privately or in groups.

First, some helper functions have to be defined: the chat exported from a WhatsApp group is raw text rather than ready-made tabular data, so some preprocessing is required.

In [5]:
import re
import pandas as pd
import numpy as np 
import emoji 
from collections import Counter
import matplotlib.pyplot as plt 
from PIL import Image 
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 

In [6]:
# Detect lines that begin with a date/time stamp, i.e. the first line of a message.
# Raw string: '\/' in a normal string literal is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions. The pattern text itself is
# unchanged. Compiled once because it is applied to every line of the chat.
_DATE_TIME_PATTERN = re.compile(
    r'^([0-9]+)(\/)([0-9]+)(\/)([0-9]+), ([0-9]+):([0-9]+)[ ]?(AM|PM|am|pm)? -'
)


def date_time(s):
    """Return True if ``s`` starts with a ``d/m/y, h:mm[ am|pm] -`` stamp.

    Args:
        s: One line of the exported chat.

    Returns:
        True when the line begins with a date, a comma, a time (optionally
        followed by AM/PM/am/pm) and a dash; False otherwise.
    """
    return _DATE_TIME_PATTERN.match(s) is not None

In [7]:
# Decide whether the text after the date/time stamp starts with an "Author: " prefix.
def find_author(s):
    """Return True if ``s`` looks like ``"<author>: <message>"``.

    The previous check (exactly one ':' in the text) rejected every authored
    message whose body contained another colon, such as a time ("10:30") or a
    URL ("https://..."). The check now looks for the ``': '`` separator after a
    non-empty name, however many colons follow.

    Args:
        s: The part of a chat line that follows the ``' - '`` after the stamp.

    Returns:
        True when a non-empty author name is followed by ``': '``, or when the
        text is a name followed by a bare trailing ``':'`` (an authored line
        whose text is empty once surrounding whitespace was stripped).
    """
    if s.find(': ') > 0:
        return True
    # "Author:" with nothing after it: the trailing space is gone if the caller
    # stripped the line before passing it in.
    return len(s) > 1 and s.endswith(':')

In [10]:
# Split one stamped chat line into its date, time, author and message text.
def getDatapoint(line):
    """Parse a line of the form ``"<date>, <time> - <author>: <message>"``.

    Only the first ``' - '`` and the first ``': '`` are treated as separators,
    so message text that itself contains ``' - '`` or ``': '`` (URLs, times,
    lists) is returned unchanged instead of having those pieces replaced by
    spaces, and such messages still get their author.

    Args:
        line: A chat line that starts with a date/time stamp.

    Returns:
        A ``(date, time, author, message)`` tuple. ``author`` is None when the
        text after the stamp has no ``"<author>: "`` prefix.

    Raises:
        ValueError: If the stamp does not contain exactly one ``', '`` between
            date and time.
    """
    stamp, _, body = line.partition(' - ')
    date, time = stamp.split(', ')

    name, sep, text = body.partition(': ')
    if sep and name:
        author, message = name, text
    elif len(body) > 1 and body.endswith(':'):
        # "Author:" - an authored line whose text is empty after stripping.
        author, message = body[:-1], ''
    else:
        author, message = None, body
    return date, time, author, message

This approach works the same way for one-to-one conversations and group chats. The functions defined above preprocess the exported chat so that sentiment analysis can be run on it.

In [11]:
# Read the exported chat and collect one [date, time, author, message] row per
# message. A message may span several lines: lines without a date/time stamp
# are continuations of the message currently being buffered.
data = []
conversation = 'WhatsApp Chat with Acad Feb 22 Int tue 2.txt'
with open(conversation, encoding = 'utf-8') as fp:
    # Skip the first line of the export (presumably a notice rather than a
    # chat message - confirm against the file).
    fp.readline()
    messageBuffer = []
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if date_time(line):
            # A new stamped line closes the message buffered so far.
            if messageBuffer:
                data.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDatapoint(line)
            messageBuffer.append(message)
        else:
            messageBuffer.append(line)
    # Flush the final message: no stamped line follows it, so without this the
    # last message of the chat would never be added to `data`.
    if messageBuffer:
        data.append([date, time, author, ' '.join(messageBuffer)])

In [17]:
# Sentiment Analysis
# Build a table of messages and score each one with NLTK's VADER analyzer.
df = pd.DataFrame(data, columns = ['Date', 'Time', 'Author', 'Message'])
df['Date'] = pd.to_datetime(df['Date'])

# Drop rows with missing values (e.g. lines with no author). The explicit
# .copy() makes `data` an independent frame, so adding columns below does not
# trigger pandas' SettingWithCopyWarning.
data = df.dropna().copy()
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiments = SentimentIntensityAnalyzer()
# Score every message once and reuse the result for all three columns instead
# of running the analyzer three times per message.
scores = [sentiments.polarity_scores(i) for i in data['Message']]
data['Positive'] = [score['pos'] for score in scores]
data['Negative'] = [score['neg'] for score in scores]
data['Neutral'] = [score['neu'] for score in scores]
data.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/animesh/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Date,Time,Author,Message,Positive,Negative,Neutral
1,2022-03-05,8:41 am,Animesh,<Media omitted>,0.0,0.0,1.0
2,2022-03-05,8:48 am,sourabh,when it will give index out of bound range?,0.0,0.0,1.0
3,2022-03-05,8:48 am,sourabh,u r using x or x-1.. how it would give index o...,0.0,0.0,1.0
4,2022-03-05,8:48 am,sourabh,"but if u r talking about -1,-2 index then",0.0,0.0,1.0
5,2022-03-05,8:49 am,Animesh,Hold on I'll provide the sol,0.0,0.0,1.0


In [18]:
# Total positive, negative and neutral score over all messages.
x, y, z = (sum(data[column]) for column in ('Positive', 'Negative', 'Neutral'))

def sentiment_score(a, b, c):
    """Print which of the three totals is strictly the largest.

    Args:
        a: Total positive score.
        b: Total negative score.
        c: Total neutral score.

    Prints 'Positive' when ``a`` exceeds both others, 'Negative' when ``b``
    exceeds both others, and 'Neutral' in every remaining case (including
    ties). Returns nothing.
    """
    positive_wins = a > b and a > c
    negative_wins = b > a and b > c
    if positive_wins:
        verdict = 'Positive 😊'
    elif negative_wins:
        verdict = 'Negative 😠'
    else:
        verdict = 'Neutral 🙂'
    print(verdict)

# Print the overall verdict for the chat from the summed per-message scores.
sentiment_score(x, y, z)

Neutral 🙂
