# WhatsApp Chat Analysis

## 1. Import Packages & Define Functions

Input file: '../Data/whatsapplog.txt' (note: the cell in section 2 currently reads '../Data/whatsapplog_smoldata.txt').

In [1]:
from __future__ import print_function

from datetime import datetime
import re

import numpy as np
import pandas as pd

In [2]:
def read_file(filename):
    """Reads in a UTF-8 text file and separates all the chat lines.
    
    ::filename [str]:: Path to file
    ::return [list]:: List containing all the content in the file.
    Each element of the list is a line in the file, in file order, with
    trailing whitespace (including the newline) stripped.
    """
    
    # The context manager guarantees the file handle is closed, even if
    # reading or decoding fails part-way through.
    with open(filename, encoding='utf8') as chat_file:
        return [line.rstrip() for line in chat_file]

In [14]:
def clean_dataset(data):
    """Parses raw chat lines into a dataframe of messages.
    
    Each line is matched against the pattern
    ``DD/MM/YYYY<anything but '-'> - <name>: <message>``. Lines that do not
    match (for example lines that do not start with a date) and entries that
    are not strings are skipped.
    
    ::data[list]:: List of all file contents. Each element is a new line.
    ::return[Pandas DF]:: Returns a Pandas dataframe of the messages. It contains three columns:
        *datetime[str]* Timestamp text exactly as matched in the line (not parsed).
        *name[str]* Who sent the message.
        *message[str]* Content of the message, lowercased.
        
    TODO: remove punctuation, parse the datetime column into datetime objects.
    """
    
    regexHeader = re.compile(r"(?P<datetime>\d{2}/\d{2}/\d{4}[^-]+)\s+-\s+(?P<name>[^:]+):\s+(?P<message>[\s\S]+?)(?=^\d{2}|\Z)", flags=re.M)
    
    columns = ['datetime','name', 'message']
    rows = []
    
    for line in data:
        # Only text can be matched against the pattern; anything else is
        # treated like an unparseable line.
        if not isinstance(line, str):
            continue
        
        match = regexHeader.match(line)
        if match is None:
            continue
        
        row = match.groupdict()
        row['message'] = row['message'].lower()  # Makes content of the message lowercase.
        rows.append(row)
    
    # Build the frame once from all rows: DataFrame.append was removed in
    # pandas 2.0, and appending row by row copies the frame on every call.
    return pd.DataFrame(rows, columns = columns)
    

## 2. Read in Data

In [15]:
# Load the chat log as a list of lines (one list element per line of the file).
text = read_file(filename='../Data/whatsapplog_smoldata.txt')

In [18]:
# Parse the raw lines into a dataframe, then preview its first rows.
data = clean_dataset(data=text)
data.head()

Unnamed: 0,datetime,name,message
0,"29/08/2018, 16:57",David,and any tomato?
1,"29/08/2018, 16:58",Iubire,yep
2,"29/08/2018, 16:59",David,nice
3,"29/08/2018, 16:59",David,omw
4,"29/08/2018, 17:00",David,<media omitted>


## Annex: Useful Code

Code for parsing timestamp strings into `datetime` objects, kept here for later use.

In [None]:
# Sample chat line; its first 17 characters are the "DD/MM/YYYY, HH:MM" timestamp.
header = "02/09/2018, 16:43 - David: omw"
timestamp = header[0:17]
# Parse the timestamp text into a datetime object so its parts can be accessed.
dt = datetime.strptime(timestamp, '%d/%m/%Y, %H:%M')
dt.year

Maybe some useful links:

https://www.zeolearn.com/magazine/introduction-to-text-mining-in-whatsapp-chats-using-python-part-1 --Sentiment analysis tutorial

https://regexr.com/ --Useful resource on regular expressions

https://github.com/tirkarthi/chat-analyzer/blob/master/analyze.py --Git code -- count of words per person, emoticon use

https://github.com/lucasrodes/whatstk/tree/master/notebooks --Git code 