In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# Read the exported chat text file into a list with one entry per line
with open("WhatsApp Chat with 2NYP Data Project.txt", encoding="utf-8") as chat_file:
    chats = list(chat_file)
print(len(chats))

In [None]:
# Trim surrounding whitespace (including the trailing newline) from every line
chats = [raw.strip() for raw in chats]

# Keep only the lines that are NOT "joined using this ..." notifications
jn = [entry for entry in chats if "joined using this" not in entry]

# Further cleaning: keep lines of at least two characters
# (this drops empty lines as well as single-character lines)
jn = [entry for entry in jn if len(entry) >= 2]
print(len(jn))

In [None]:
# Filter out lines containing "You changed"
jn = [entry for entry in jn if "You changed" not in entry]
print(len(jn))

In [None]:
# Separate media placeholder lines (those containing 'omitted') from the rest
media = [entry for entry in jn if 'omitted' in entry]
convo = [entry for entry in jn if 'omitted' not in entry]
print("There were {} media files in this time".format(len(media)))
print(len(convo))

In [None]:
#Merge messages that belong together
# Flow:
# A line that starts with "number(s)+slash" (e.g. "12/") begins with a date, so it
# opens a new message and is added to the msgs container.
# Any other line is a continuation of the previous message and is joined onto
# that message with ". ".
msgs = [] #message container
starts_with_date = re.compile(r"\A\d+/") #raw string avoids invalid-escape warnings
for line in convo:
    if starts_with_date.match(line) or not msgs:
        # New message. A continuation line seen before any dated line has nothing
        # to attach to, so it is kept as its own entry instead of raising IndexError.
        msgs.append(line)
    else:
        msgs[-1] = msgs[-1] + ". " + line
pos = len(msgs) #number of merged messages (same value the original counter ended with)

len(msgs)

In [None]:
# Preview the first ten merged messages
msgs[:10]

In [None]:
#Drop first two lines: auto msg
# NOTE(review): msgs is rebound to a slice of itself, so re-running this cell
# drops two more entries each time; re-run the merge cell first if needed.
msgs = msgs[2:]
len(msgs)

In [None]:
# Preview the first three remaining messages
msgs[:3]

In [None]:
# Inspect whitespace-separated tokens 3 through 8 of the first message
msgs[0].split()[3:9]

In [None]:
#make arrays of different parts of the messages
def _parse_message(line):
    """Split one merged chat line into its parts.

    Expected shape: "<date>, <time> - <author>: <text>".

    Args:
        line: one merged message string.

    Returns:
        A (date, time, author, text) tuple, or None when the line has no
        " - " separator or nothing after the date in its timestamp.
        The date has its commas removed; the time is everything between the
        date and " - ". A line with a timestamp but no "<author>:" prefix
        gets "-" as author and the whole remainder as text.
    """
    stamp, dash, body = line.partition(" - ")
    stamp_parts = stamp.split()
    if not dash or len(stamp_parts) < 2:
        return None
    # The author ends at the FIRST colon, so colons inside the message text
    # (links, "Note: ...") are no longer absorbed into the author name.
    author, colon, text = body.partition(":")
    if not colon:
        author, text = "-", body
    elif text.startswith(" "):
        text = text[1:] #drop the single space that follows the colon
    return stamp_parts[0].replace(',', ''), " ".join(stamp_parts[1:]), author, text


date = []
time = []
number = []
msg = []
for line in msgs:
    parsed = _parse_message(line)
    if parsed is None:
        # Skip the line entirely so the four lists always stay the same length
        print("This is a bad line.\n{}".format(line))
        continue
    day, clock, author, text = parsed
    date.append(day)
    time.append(clock)
    number.append(author)
    msg.append(text)
print(len(date), len(time), len(number), len(msg))

In [None]:
# Assemble the four parsed lists into one DataFrame, one column per list
_2nyp = pd.DataFrame({
    "date": date,
    "time": time,
    "number_author": number,
    "msg": msg,
})

_2nyp.head()

In [None]:
top_msg_times = _2nyp.time.value_counts(ascending=False).head(10)
ax1 = top_msg_times.plot(kind="bar", color='Darkblue', figsize=(12,5))
ax1.set_xlabel ('Time')
ax1.set_ylabel ('Frequency')
ax1.set_title("Top 10 Time of Engagement")


plt.show()
%matplotlib inline

In [None]:
# Horizontal bar chart of the ten most frequent values in "number_author"
top10messengers = _2nyp["number_author"].value_counts(ascending=False).head(10)
ax = top10messengers.plot(kind="barh", color='Gold', figsize=(12,5))
ax.set(
    xlabel='Number of sent message',
    ylabel="Users",
    title="Top 10 Users that sent more messages",
)
plt.show()

## NLP

In [None]:
import nltk 
from nltk.corpus import stopwords 
from nltk import ngrams
from collections import Counter

In [None]:
# Pull the message column out of the DataFrame as a plain Python list
words = _2nyp["msg"].tolist()

In [None]:
# Stop-word list: NLTK's English stop words (deduplicated) plus four extra terms
stop_words = [*set(stopwords.words('english')), 'axamansard', 'st', 'axa', 'mansard']

In [None]:
# Collapse every run of whitespace inside each message into a single space
tokens = [' '.join(message.split()) for message in words]
len(tokens)

In [None]:
# Show the first whitespace-normalised message
tokens[0]

In [None]:
# Preprocessing 
def clean1(s, stop=None):
    """Normalise one chat message for word counting.

    Steps: delete every character that is not an ASCII letter or whitespace,
    lowercase each remaining word, then drop words that start with 'http'
    (what is left of a link once its punctuation is removed) and words found
    in the stop-word collection.

    Args:
        s: the raw message text.
        stop: optional collection of lowercase stop words. When omitted, the
            notebook-level `stop_words` list is used.

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    if stop is None:
        stop = stop_words

    # A-Z, not A-z: the A-z range also spans [ \ ] ^ _ ` and let them through.
    # Characters are deleted (not replaced by a space), so "don't" -> "dont".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', s)

    kept = []
    # split() with no argument also collapses repeated whitespace and trims the ends
    for word in letters_only.split():
        word = word.lower()
        if word.startswith('http') or word in stop:
            continue
        kept.append(word)
    return ' '.join(kept)

In [None]:
#Clean the lines
# Keep only messages that are non-empty after cleaning. A comprehension is used
# so that no loop variable named `words` is left behind to overwrite the
# message list of the same name that the tokenisation cell reads.
cleaned = [text for text in map(clean1, tokens) if text != '']
len(cleaned)

In [None]:
# Preview the first five cleaned messages
cleaned[:5]

In [None]:
#Join the cleaned messages into one space-separated string
corpus = ' '.join(cleaned)

In [None]:
#Word cloud
print("2NYP Whatsapp Group Chat WC")
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline
wordcloud = WordCloud(
                          background_color='white',
                          stopwords=stop_words,
                          max_words=50,
                          max_font_size=50, 
                          random_state=42
                         ).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)