<a href="https://colab.research.google.com/github/Luckynirwan12/WhatsApp-Chat-Analyzer/blob/main/WhatsApp_Chat_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## WhatsApp Chat Analyzer

### Import Dependencies

In [None]:
import re  # for regular expression
import pandas as pd

In [None]:
# Open the exported chat for reading as UTF-8 text.
# NOTE(review): the path below looks like a placeholder — presumably it must be replaced
# with the location of your exported WhatsApp .txt file before running; confirm.
f = open('Upload your WhatsApp .txt', 'r', encoding = 'utf-8')

In [None]:
# Read the whole file into a single string and store it in data.
data = f.read()
# The handle was opened without a context manager, so release it explicitly once the
# contents are in memory.
f.close()

In [None]:
# Preview the start of the raw text only: printing the whole export floods the output
# and stores the full private conversation inside the saved notebook.
print(data[:1000])

### Convert File into DataFrame

In [None]:
# Regex pattern to split by date-time stamps (e.g., 12/03/25, 1:14 pm - ).
# The optional separator before am/pm is spelled \s? rather than a character class of
# literal (invisible) space characters: \s matches the regular space as well as the
# no-break / narrow no-break spaces, and cannot be silently altered by an editor.
pattern = r'\d{1,2}/\d{1,2}/\d{2},\s\d{1,2}:\d{2}\s?[ap]m\s-\s'

In [None]:
# Split the raw text on every timestamp. Element 0 is whatever precedes the first
# timestamp, so it is dropped; each remaining element is one "sender: text" entry.
messages = re.split(pattern, data)[1:]
# Show only the first few entries instead of dumping the entire chat into the output.
messages[:5]

In [None]:
# Collect every timestamp prefix, in order of appearance (one per message entry).
dates = re.findall(pattern, data)
# Show only the first few entries instead of the full list.
dates[:5]

In [None]:
# Build the frame from the parallel lists produced by re.split / re.findall.
# The timestamp regex accepts a missing or non-standard space before am/pm, while the
# strptime format needs whitespace there. Normalise each stamp to "<time> am|pm" and
# strip the trailing " - " delimiter before parsing.
date_text = pd.Series(dates, dtype='object').str.replace(
    r'\s*([ap]m)\s-\s$', r' \1', regex=True
)
df = pd.DataFrame({
    'user_message': messages,
    # convert the timestamp text to datetime64; the column is named 'date' directly
    'date': pd.to_datetime(date_text, format='%d/%m/%y, %I:%M %p'),
})
df.head()

In [None]:
# Get shape: a (number of rows, number of columns) tuple for the frame
df.shape

In [None]:
# Separate users and messages.
# An entry is expected to look like "<sender>: <text>"; entries with no "<sender>: "
# prefix are recorded under the pseudo-user 'group_notification'.
sender_split = re.compile(r'([\w\W]+?):\s')  # raw string: no invalid-escape warnings
users = []
messages = []
for message in df['user_message']:
    # maxsplit=1 cuts at the first ": " only. Without it, a message whose text itself
    # contains ": " is split again and entry[2] would hold just a fragment of the text.
    entry = sender_split.split(message, maxsplit=1)
    if entry[1:]:  # a sender prefix was found -> [before, sender, text]
        users.append(entry[1])
        messages.append(entry[2])
    else:
        users.append('group_notification')
        messages.append(entry[0])
df['user'] = users
df['message'] = messages
# The combined column is no longer needed once it has been split.
df.drop(columns=['user_message'], inplace=True)
df.head()

### Generate New Columns

In [None]:
# Derive the calendar year of every message from its timestamp
df = df.assign(year=df['date'].dt.year)
df.head()

In [None]:
# Derive the month name (as returned by .dt.month_name()) from the timestamp
df = df.assign(month=df['date'].dt.month_name())
df.head()

In [None]:
# Derive the day of the month from the timestamp
df = df.assign(day=df['date'].dt.day)
df.head()

In [None]:
# Derive the hour component (as returned by .dt.hour) from the timestamp
df = df.assign(hour=df['date'].dt.hour)
df.head()

In [None]:
# Derive the minute component from the timestamp
df = df.assign(minute=df['date'].dt.minute)
df.head()

### Chat Statistics

In [None]:
# Flatten every message into its whitespace-separated tokens (one long list of words)
words = [token for text in df['message'] for token in text.split()]

In [None]:
# Total number of whitespace-separated tokens collected in `words`
len(words)

In [None]:
!pip install urlextract

In [None]:
# Gather every URL found in any message (duplicates kept, so len(urls) is the link count)
from urlextract import URLExtract
extractor = URLExtract()
urls = [link for text in df['message'] for link in extractor.find_urls(text)]

In [None]:
# Total number of URLs collected in `urls`
len(urls)

### Active User Analysis

In [None]:
# Rank senders by number of messages, leaving out the 'group_notification' pseudo-user
is_real_user = df['user'].ne('group_notification')
x = df.loc[is_real_user, 'user'].value_counts()
# head() with no argument keeps the first five rows, i.e. the five busiest senders
active_users = x.head()

In [None]:
# plot the graph of 5 Active users
import matplotlib.pyplot as plt

In [None]:
# Bar chart of the most active users, with title and axis labels so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.bar(active_users.index, active_users.values)
ax.set_title('Most active users')
ax.set_xlabel('User')
ax.set_ylabel('Number of messages')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels, as before
plt.show()

In [None]:
# Find the percentage of rows written by each user.
# The denominator is df.shape[0], i.e. all rows including group notifications, so the
# percentages need not add up to 100.
# Columns are named explicitly: renaming 'count' only works on pandas versions whose
# value_counts() result is called 'count' and would otherwise silently do nothing.
new_df = (
    (df.loc[df['user'] != 'group_notification', 'user'].value_counts() / df.shape[0] * 100)
    .round(2)
    .rename_axis('user')
    .reset_index(name='percentage')
)
new_df.head()

### Word Analysis

In [None]:
# Find how often particular words are used in the group chat:
# start from the flat list of all whitespace-separated tokens
words = [token for text in df['message'] for token in text.split()]

In [None]:
from collections import Counter
# Tabulate the 20 most frequent tokens (column 0 = token, column 1 = occurrences)
top_twenty = Counter(words).most_common(20)
most_common_words = pd.DataFrame(top_twenty)
print(most_common_words)

In [None]:
# Clean-up before counting words: drop noise such as "<Media omitted>" and stop words
# (is, the, ...).
# 1. keep only rows written by real users (no group notifications)
temp = df.loc[df['user'].ne('group_notification')]

In [None]:
# 2. drop rows whose message is exactly the media placeholder (including its newline)
temp = temp.loc[temp['message'].ne('<Media omitted>\n')]

In [None]:
# 3. remove the stop words
# Load the stop-word file as one string. The context manager closes the handle as soon
# as the text is read, and the encoding is stated explicitly, as for the chat file.
with open('/content/stop_hinglish.txt', 'r', encoding='utf-8') as f:
    stop_words = f.read()
print(stop_words)

In [None]:
# remove punctuations
import string
punct = string.punctuation

# Build the deletion table once instead of once per message.
punct_table = str.maketrans('', '', punct)
# temp was produced by boolean filtering; assign() returns a new frame, so the cleaned
# column is not written into a slice of another frame (avoids SettingWithCopyWarning).
temp = temp.assign(message=temp['message'].str.translate(punct_table))

In [None]:
# stop_words holds the raw text of the stop-word file, so `word in stop_words` is a
# substring test: any word that occurs inside a stop word (e.g. "he" inside "the")
# would be discarded. Compare against a set of whole stop words instead.
stop_word_set = set(stop_words.split())
words = []
for message in temp['message']:
  for word in message.lower().split():
    if word not in stop_word_set:
      words.append(word)

In [None]:
from collections import Counter
# Tabulate the 20 most frequent remaining tokens (column 0 = token, column 1 = occurrences)
top_twenty = Counter(words).most_common(20)
most_common_words = pd.DataFrame(top_twenty)
print(most_common_words)

### Emoji Analysis

In [None]:
# Analysis for emoji
!pip install emoji --upgrade

In [None]:
import emoji

In [None]:
# Collect every emoji used in the chat.
# emoji.emoji_list() reports each emoji as a whole unit, so multi-code-point emoji
# (flags, skin-tone variants, ZWJ sequences) are counted once instead of being split
# into component characters by a per-character scan.
# The loop variable is named `text` so it does not overwrite the `messages` list.
emojis = []
for text in df['message']:
  emojis.extend(found['emoji'] for found in emoji.emoji_list(text))

In [None]:
# Frequency table of all emoji, most used first (column 0 = emoji, column 1 = count).
# most_common() with no argument already returns every element, so the Counter is
# built once instead of twice.
pd.DataFrame(Counter(emojis).most_common())

### Timeline Analysis

In [None]:
# Monthly time analysis
# 1. Add the numeric month next to the month name (used as a grouping key below)
df = df.assign(month_num=df['date'].dt.month)

In [None]:
# Number of non-null messages per (year, month number, month name) combination
monthly_groups = df.groupby(['year', 'month_num', 'month'])
timeline = monthly_groups['message'].count().reset_index()

In [None]:
# Build a "<month name>-<year>" label for every row of the monthly timeline
time = [
    month_name + '-' + str(year)
    for month_name, year in zip(timeline['month'], timeline['year'])
]

In [None]:
# Attach the labels as a new 'time' column
timeline = timeline.assign(time=time)

In [None]:
# Display the monthly timeline table (one row per year/month group)
timeline

In [None]:
# Line chart of message volume per month, with title and axis labels so the figure can
# be read on its own.
fig, ax = plt.subplots()
ax.plot(timeline['time'], timeline['message'])
ax.set_title('Monthly timeline')
ax.set_xlabel('Month')
ax.set_ylabel('Number of messages')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels, as before
plt.show()

In [None]:
# Daily timeline: keep only the calendar-date part of each timestamp
df = df.assign(dates=df['date'].dt.date)

In [None]:
# Number of non-null messages per calendar date
daily_groups = df.groupby(['dates'])
daily_timeline = daily_groups['message'].count().reset_index()

In [None]:
# Line chart of message volume per day, with title and axis labels so the figure can be
# read on its own.
fig, ax = plt.subplots()
ax.plot(daily_timeline['dates'], daily_timeline['message'])
ax.set_title('Daily timeline')
ax.set_xlabel('Date')
ax.set_ylabel('Number of messages')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels, as before
plt.show()

### Activity Map

In [None]:
# Active days
# Add the weekday name (as returned by .dt.day_name()) of every message
df = df.assign(day_name=df['date'].dt.day_name())
df.head()

In [None]:
# Count how many rows fall on each weekday (value_counts sorts by frequency, highest first)
df['day_name'].value_counts()

In [None]:
# Bar chart of rows per weekday.
# The counts are computed once and reused for both axes instead of calling
# value_counts() twice; title and axis labels make the figure readable on its own.
day_counts = df['day_name'].value_counts()
fig, ax = plt.subplots()
ax.bar(day_counts.index, day_counts.values)
ax.set_title('Activity by day of week')
ax.set_xlabel('Day')
ax.set_ylabel('Number of messages')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels, as before
plt.show()