# WhatsApp Chat Analysis

## 1. Import Packages & Define Functions

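File to be used: '../Data/whatsapplog.txt' (the loading cell in section 2 currently reads the smaller sample file '../Data/whatsapplog_smoldata.txt').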

In [1]:
from __future__ import print_function

from datetime import datetime
import re

import numpy as np
import pandas as pd

In [2]:
def read_file(filename):
    """Reads in a text file and separates all the chat lines.
    
    ::filename [str]:: Path to file. The file is decoded as UTF-8.
    ::return [list]:: List containing all the content in the file.
    Each element of the list is a line in the file, with trailing
    whitespace (including the newline) stripped.
    """
    
    # The context manager guarantees the handle is closed, even if
    # decoding a line raises part-way through the file.
    with open(filename, encoding='utf8') as handle:
        return [line.rstrip() for line in handle]

In [4]:
def clean_dataset(data):
    """Parses raw chat lines into a usable tabular format.
    
    ::data[list]:: List of all file contents. Each element is a new line.
    ::return[Pandas DF]:: Returns a Pandas dataframe of the messages. It contains three columns:
        *datetime[str]* When a message was sent, as the raw text captured
            from the line (e.g. "29/08/2018, 16:57"); not parsed to a datetime.
        *name[str]* Who sent the message.
        *message[str]* Content of the message.
    
    Lines that do not match the "<date>, <time> - <name>: <message>" header
    pattern (for example the continuation lines of a multi-line message)
    are skipped. An empty input yields an empty dataframe that still has
    the three columns.
        
    TODO: make data lowercase, remove punctuation, what format is data in each col?
    """
    
    regexHeader = re.compile(r"(?P<datetime>\d{2}/\d{2}/\d{4}[^-]+)\s+-\s+(?P<name>[^:]+):\s+(?P<message>[\s\S]+?)(?=^\d{2}|\Z)", flags=re.M)
    
    columns = ['datetime','name', 'message']
    
    # Collect the parsed rows first and build the dataframe once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    for line in data:
        match = regexHeader.match(line)
        if match is None:
            # Not a message header line; skip it explicitly instead of
            # hiding every possible error behind a bare except.
            continue
        rows.append(match.groupdict())
    
    return pd.DataFrame(rows, columns = columns)
    

## 2. Read in Data

In [None]:
# Load the chat export as a list of lines (one list element per line of the file).
text = read_file('../Data/whatsapplog_smoldata.txt')

In [5]:
# Parse the raw lines into a dataframe and preview its first rows.
clean_dataset(text).head()

Unnamed: 0,datetime,message,name
0,"29/08/2018, 16:57",And any tomato?,David
1,"29/08/2018, 16:58",yep,Iubire
2,"29/08/2018, 16:59",Nice,David
3,"29/08/2018, 16:59",Omw,David
4,"29/08/2018, 17:00",<Media omitted>,David


## Annex: Useful Code

Code for dealing with timestamp data. For later use.

In [None]:
# Sample chat line; its first 17 characters hold the timestamp
# ("02/09/2018, 16:43"). The original cell sliced an undefined `header`.
header = "02/09/2018, 16:43 - David: Omw"
timestamp = header[0:17]
# Day-first date, then a comma and 24-hour time, as in the chat export.
dt = datetime.strptime(timestamp, '%d/%m/%Y, %H:%M')
dt.year