# Data Analytics from a WhatsApp Group chat

This notebook analyses the WhatsApp chat of a group of close friends. The purpose is to do basic data analytics, such as how frequently messages are sent and who is most active when; if interest develops, it can also be extended to explore advanced NLP topics.

## Set up imports

In [None]:
library(glue)
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(GGally)
library(ggthemes)
library(tidyverse)
library(lubridate)
library(tidytext)
set.seed(1234)
library(wordcloud)
#remotes::install_github("hadley/emo")
library("tidyr")
library(emo)

## Global constants

In [None]:
# Path of the preprocessed chat export to load
# (alternative file: preprocessed__chat2.txt).
RAW_DATA <- "preprocessed_raw_data2.txt"

# Passed to read_csv() as `guess_max` when the export is loaded.
GUESS_MAX <- 20000

# Group name interpolated into the plot titles.
GROUP_NAME <- "Sample Group"

# Layout of the date/timestamp columns: "MDY_HM" or "DMY_HMS".
TIMESTAMP_FORMAT <- "MDY_HM"

# Time zone used when parsing timestamps,
# e.g. "America/New_York" or "Asia/Kolkata".
TZ <- "America/New_York"

## Data Wrangling

In [None]:
# Load the preprocessed chat export into `chat`; GUESS_MAX is handed to readr
# as the `guess_max` argument used for column type guessing.
chat <- readr::read_csv(file = RAW_DATA, guess_max = GUESS_MAX)

In [None]:
# Show the first ten rows of the freshly loaded data.
head(chat, n = 10)

In [None]:
# Parse the `date` and `timestamp` columns of `chat` according to
# TIMESTAMP_FORMAT, reading timestamps in the TZ time zone.
#
# An unsupported TIMESTAMP_FORMAT is a configuration error. Every later cell
# works on the parsed columns, so fail loudly here instead of printing a note
# and letting the notebook continue with columns that were never converted.
chat <- switch(
  TIMESTAMP_FORMAT,
  DMY_HMS = chat %>%
    mutate(date = dmy(date), timestamp = dmy_hms(timestamp, tz = TZ)),
  MDY_HM = chat %>%
    mutate(date = mdy(date), timestamp = mdy_hm(timestamp, tz = TZ)),
  stop(
    "timestamp format needs to be set!!! Unsupported TIMESTAMP_FORMAT: ",
    TIMESTAMP_FORMAT,
    call. = FALSE
  )
)

# Quick look at the converted columns.
head(chat)

## Let's Explore

In [None]:
# Figure size used by the notebook for this cell.
options(repr.plot.width = 15, repr.plot.height = 8)

# Bar chart of the total number of messages per sender, most active first.
chat %>%
  count(sender, sort = TRUE) %>%
  ggplot(aes(x = reorder(sender, -n), y = n, fill = sender)) +
  geom_col(position = "dodge") +
  theme_fivethirtyeight() +
  xlab("") +
  labs(
    title = glue("Who sends the most messages in the \"{GROUP_NAME}\" group?"),
    subtitle = glue("Total number of messages sent by each person, Timespan: {min(chat$date)} to {max(chat$date)}"),
    caption = "Source: WhatsApp message export"
  ) +
  theme(text = element_text(size = 15), legend.position = "none") +
  # The bars are coloured through the `fill` aesthetic, so the Tableau palette
  # must be attached to the fill scale; a colour scale has nothing to act on.
  scale_fill_tableau()

### Frequency of messages per day

In [None]:
# Figure size used by the notebook for this cell.
options(repr.plot.width = 15, repr.plot.height = 8)

# Messages per calendar day: one point per date plus a LOESS trend line.
p <- ggplot(count(chat, date), aes(x = date, y = n)) +
  geom_point() +
  stat_smooth(method = "loess", formula = y ~ x, size = 1) +
  theme_fivethirtyeight() +
  labs(
    x = "",
    y = "Messages/day",
    title = glue("Message activity on the \"{GROUP_NAME}\" group"),
    subtitle = glue("Messages/day, Timespan: {min(chat$date)} to {max(chat$date)}"),
    caption = "Source: WhatsApp message export"
  ) +
  theme(text = element_text(size = 15))

p

In [None]:
# Figure size used by the notebook for this cell.
options(repr.plot.width = 15, repr.plot.height = 8)

# Messages per day for each sender, drawn as one line chart per facet.
p <- ggplot(count(chat, sender, date), aes(x = date, y = n, colour = sender)) +
  geom_line() +
  facet_wrap(~sender) +
  theme_fivethirtyeight() +
  labs(
    x = "",
    y = "Messages/day",
    title = glue("Message activity on the \"{GROUP_NAME}\" group"),
    subtitle = glue("Messages/day, Timespan: {min(chat$date)} to {max(chat$date)}"),
    caption = "Source: WhatsApp message export"
  ) +
  theme(text = element_text(size = 15), legend.position = "none") +
  scale_color_tableau()
p

In [None]:
# Figure size used by the notebook for this cell.
options(repr.plot.width = 20, repr.plot.height = 10)

# Total messages per day of the week, one facet per sender.
p <- chat %>%
  mutate(day_of_the_week = wday(date, label = TRUE)) %>%
  count(day_of_the_week, sender) %>%
  ggplot(aes(x = day_of_the_week, y = n, fill = sender)) +
  geom_col(position = "dodge") +
  theme_fivethirtyeight() +
  xlab("") +
  ylab("Messages/day") +
  labs(
    title = glue("Message activity on the \"{GROUP_NAME}\" group"),
    subtitle = glue("Total messages by the day, Timespan: {min(chat$date)} to {max(chat$date)}"),
    caption = "Source: WhatsApp message export"
  ) +
  theme(text = element_text(size = 15), legend.position = "none") +
  facet_wrap(~sender) +
  # Bars are filled by sender, so the palette belongs on the fill scale; the
  # colour scale used before had no mapped aesthetic to act on.
  scale_fill_tableau()
p

In [None]:
# Figure size used by the notebook for this cell.
options(repr.plot.width = 15, repr.plot.height = 8)

# Total messages per hour of the day, one facet per sender.
p <- chat %>%
  mutate(hour = hour(timestamp)) %>%
  count(hour, sender) %>%
  ggplot(aes(x = hour, y = n, fill = sender)) +
  geom_col(position = "dodge") +
  theme_fivethirtyeight() +
  xlab("") +
  ylab("Messages/hour") +
  labs(
    title = glue("Message activity on the \"{GROUP_NAME}\" group"),
    subtitle = glue("Total messages by the hour, Timespan: {min(chat$date)} to {max(chat$date)}"),
    caption = "Source: WhatsApp message export"
  ) +
  theme(text = element_text(size = 15), legend.position = "none") +
  facet_wrap(~sender) +
  # Bars are filled by sender, so the palette belongs on the fill scale; the
  # colour scale used before had no mapped aesthetic to act on.
  scale_fill_tableau()
p

In [None]:
#options(repr.plot.width=10, repr.plot.height=8)

# Build one column per sender holding the number of messages sent in each
# hourly bucket (timestamps are rounded to the nearest hour).
# pivot_wider() replaces the superseded spread(): `values_fill` writes 0 for
# hours in which a sender was silent, and `names_sort` keeps the sender
# columns in sorted order, as spread() produced them.
chat_mv_timeseries <- chat %>%
  mutate(hour = round_date(timestamp, unit = "hour")) %>%
  count(hour, sender) %>%
  pivot_wider(
    names_from = sender,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  ) %>%
  select(-hour)

# Pairwise Pearson correlation between the senders' hourly message counts.
p <- ggcorr(chat_mv_timeseries, method = c("everything", "pearson"), label = TRUE) +
  theme_fivethirtyeight() +
  labs(
    title = glue("Correlation in the \"{GROUP_NAME}\" group"),
    subtitle = glue("Correlation in messages per hour, Timespan: {min(chat$date)} to {max(chat$date)}"),
    caption = "Source: WhatsApp message export"
  ) +
  theme(text = element_text(size = 15), legend.position = "none") +
  scale_color_tableau()
p


In [None]:
# Figure size used by the notebook for this cell.
options(repr.plot.width = 15, repr.plot.height = 8)

# Median number of space-separated words per message for each sender.
# The word count is computed for the whole column at once with
# lengths(str_split(...)) rather than row by row via rowwise().
chat_message_len <- chat %>%
  mutate(
    sender = str_trim(sender),
    num_of_words = lengths(str_split(message, " "))
  ) %>%
  group_by(sender) %>%
  summarize(num_of_words = median(num_of_words)) %>%
  ungroup()

chat_message_len %>%
  ggplot(aes(x = reorder(sender, num_of_words), y = num_of_words, fill = sender)) +
  geom_col(position = "dodge") +
  theme_fivethirtyeight() +
  xlab("") +
  # The y axis shows words per message; the earlier "Messages/hour" label was
  # carried over from the hourly activity plot.
  ylab("Words/message") +
  labs(
    title = glue("How long is a usual message on the \"{GROUP_NAME}\" group?"),
    subtitle = glue("Median number of words in each person's messages, Timespan: {min(chat$date)} to {max(chat$date)}"),
    caption = "Source: WhatsApp message export"
  ) +
  theme(text = element_text(size = 15), legend.position = "none") +
  # Bars are filled by sender, so the palette belongs on the fill scale.
  scale_fill_tableau()


In [None]:
# Figure size used by the notebook for this cell.
options(repr.plot.width = 15, repr.plot.height = 15)

# One row per word token of every message.
tidy_chat <- chat %>%
  select(message) %>%
  unnest_tokens(word, message)

# Horizontal bar chart of the most frequent words, after dropping the
# "media"/"omitted" placeholder tokens and stop words.
tidy_chat %>%
  count(word, sort = TRUE) %>%
  filter(!(word %in% c("media", "omitted"))) %>%
  anti_join(get_stopwords(), by = "word") %>%
  # Rank explicitly by the count column instead of relying on top_n()
  # picking the last column.
  top_n(20, n) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  # Constant bar colour, so it is set on the layer rather than mapped.
  geom_col(fill = "lightblue", alpha = 0.8, show.legend = FALSE) +
  coord_flip() +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    x = NULL, y = "Word count",
    title = "Most frequent words after removing stop words",
    # The subtitle previously repeated the text of the median-message-length
    # plot; it now describes what this chart shows.
    subtitle = glue("Number of times each word was used, Timespan: {min(chat$date)} to {max(chat$date)}"),
    caption = "Source: WhatsApp message export"
  ) +
  theme_fivethirtyeight() +
  theme(text = element_text(size = 15), legend.position = "none")

In [None]:
# Count two-word sequences across all messages, skipping any bigram that
# contains "omitted", "deleted" or "message".
chat %>%
  select(message) %>%
  unnest_tokens(bigram, message, token = "ngrams", n = 2) %>%
  filter(
    !(str_detect(bigram, "omitted") |
      str_detect(bigram, "deleted") |
      str_detect(bigram, "message"))
  ) %>%
  count(bigram, sort = TRUE)


In [None]:

# The most frequent words (top 50 by count, ties included) after dropping the
# "media"/"omitted" placeholder tokens and stop words.
# TRUE is spelled out (T can be reassigned), and the join key and ranking
# column are named explicitly instead of being guessed by dplyr.
word_counts <- tidy_chat %>%
  count(word, sort = TRUE) %>%
  filter(!(word %in% c("media", "omitted"))) %>%
  anti_join(get_stopwords(), by = "word") %>%
  top_n(50, n) %>%
  ungroup()

# Word cloud of those words, sized by frequency.
wordcloud(
  words = word_counts$word, freq = word_counts$n, min.freq = 1,
  max.words = 200, random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)


In [None]:
# Figure size used by the notebook for this cell.
options(repr.plot.width = 15, repr.plot.height = 10)

# Five most used emojis per sender (ties included), one facet per sender.
chat %>%
  select(sender, emojis) %>%
  filter(!is.na(emojis)) %>%
  # `emojis` holds a comma-separated string per message. Split it into a list
  # column and unnest it with the current `cols` interface; the old
  # unnest(name = expr) form is deprecated in tidyr >= 1.0.
  mutate(emojis = strsplit(emojis, ",")) %>%
  unnest(cols = emojis) %>%
  count(sender, emojis, sort = TRUE) %>%
  group_by(sender) %>%
  top_n(5, n) %>%
  arrange(sender) %>%
  #filter(sender == "Person8") %>%
  #mutate(emojis=emoji(emojis)) %>%
  # NOTE(review): `family` is passed as an aesthetic here; confirm that it has
  # the intended effect on how the emoji labels are rendered.
  ggplot(aes(x = reorder(emojis, -n), y = n, fill = sender, family = 'EmojiOne')) +
  geom_col(show.legend = FALSE) +
  ylab("") +
  xlab("") +
  coord_flip() +
  facet_wrap(~sender, ncol = 2, scales = "free_y") +
  ggtitle("Most often used emojis") +
  theme_fivethirtyeight() +
  theme(text = element_text(size = 15), legend.position = "none") +
  # Bars are filled by sender, so the palette belongs on the fill scale.
  scale_fill_tableau()

In [None]:
# Table of the five most used emojis per sender (ties included).
chat %>%
  select(sender, emojis) %>%
  filter(!is.na(emojis)) %>%
  # Split the comma-separated emoji string into a list column and unnest it
  # with the current `cols` interface; the old unnest(name = expr) form is
  # deprecated in tidyr >= 1.0.
  mutate(emojis = strsplit(emojis, ",")) %>%
  unnest(cols = emojis) %>%
  count(sender, emojis, sort = TRUE) %>%
  group_by(sender) %>%
  # Rank explicitly by the count column.
  top_n(5, n) %>%
  arrange(sender)

In [None]:
# Table of the five most used emojis across the whole group (ties included).
chat %>%
  select(sender, emojis) %>%
  filter(!is.na(emojis)) %>%
  # Split the comma-separated emoji string into a list column and unnest it
  # with the current `cols` interface; the old unnest(name = expr) form is
  # deprecated in tidyr >= 1.0.
  mutate(emojis = strsplit(emojis, ",")) %>%
  unnest(cols = emojis) %>%
  count(emojis, sort = TRUE) %>%
  # Rank explicitly by the count column.
  top_n(5, n)