In [1]:
import os
import re
import pandas as pd
import numpy as np

In [2]:
# Read the exported chat explicitly as UTF-8: the messages contain emoji, and
# the platform default encoding used by open() is not UTF-8 everywhere.
with open('kolaaruChat.txt', 'r', encoding='utf-8') as f:
    chat_text = f.read()

In [3]:
# One chat line: "<dd/mm/yyyy>, <h:mm am|pm> - <sender>: <message>".
# The message group runs to the end of the line. The previous character class
# `[^\\\n]` also excluded backslashes, which cut messages short at the first
# backslash and dropped messages that started with one.
MESSAGE_REGEX = r"(?P<date>\d{2}\/\d{2}\/\d{4}),\s(?P<time>\d{1,2}:\d{2}\s(?:am|pm))\s-\s(?P<sender>[^:]*):[\s](?P<message>[^\n]+)"

In [4]:
# Collect every match as a (date, time, sender, message) tuple of strings.
messages = re.compile(MESSAGE_REGEX).findall(chat_text)

In [5]:
# Preview only the first matches; echoing the whole list bloats the notebook.
messages[:20]

[('27/11/2015', '2:31 pm', 'Raghul 1', '2'),
 ('27/11/2015', '2:31 pm', 'Shiva Nandham', '6'),
 ('27/11/2015', '2:31 pm', 'Sankaran', 'Let all reply'),
 ('27/11/2015', '2:32 pm', 'Shiva Nandham', 'Pvt LA anupu da'),
 ('27/11/2015', '2:32 pm', 'Sankaran', 'K'),
 ('27/11/2015', '2:33 pm', 'Shiva Nandham', '😂😂😂😂😂'),
 ('27/11/2015', '2:33 pm', 'Sankaran', 'Raghul pavam'),
 ('27/11/2015', '2:34 pm', 'Vignesh', '1'),
 ('27/11/2015', '2:35 pm', 'Iniyan', '7'),
 ('27/11/2015', '2:35 pm', 'Vignesh', '😂😂😛'),
 ('27/11/2015', '2:35 pm', 'Iniyan', 'Answer daah'),
 ('27/11/2015', '2:36 pm', 'Sankaran', '2 min silence fr raghul'),
 ('27/11/2015', '2:36 pm', 'Iniyan', '😂'),
 ('27/11/2015',
  '2:38 pm',
  'Sankaran',
  "Let's c what Sumesh,Arun,nav r goin to choose"),
 ('27/11/2015', '2:38 pm', 'Shiva Nandham', '😂😂😂😂'),
 ('27/11/2015', '2:39 pm', 'Sumesh', '5'),
 ('27/11/2015', '2:41 pm', 'Sumesh', 'ada potta shiva'),
 ('27/11/2015', '2:41 pm', 'Sankaran', 'I chose 8😎'),
 ('27/11/2015', '2:41 pm', 'San

In [6]:
# Build one Series per field, taking each value by its position inside the
# match tuples, then assemble the DataFrame from that column mapping.
df_input = {
    column: pd.Series([row[position] for row in messages])
    for position, column in enumerate(('date', 'time', 'sender', 'content'))
}
df = pd.DataFrame(df_input)

In [7]:

# Distinct sender names, in order of first appearance in the frame.
senders_array = df.sender.unique()

In [8]:
# Keep only the rows whose sender is exactly 'arun Leo'.
x = df[df['sender'] == 'arun Leo']

In [9]:
# Group the filtered rows by date.
# NOTE(review): 'time' is a column name here, not an aggregation function, so
# this relies on agg's string lookup; it presumably yields the grouped 'time'
# column rather than an aggregated value per date — TODO confirm the intent.
y = x.groupby('date').agg('time')

In [10]:
# Show just the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,date,time,sender,content
0,27/11/2015,2:31 pm,Raghul 1,2
1,27/11/2015,2:31 pm,Shiva Nandham,6
2,27/11/2015,2:31 pm,Sankaran,Let all reply
3,27/11/2015,2:32 pm,Shiva Nandham,Pvt LA anupu da
4,27/11/2015,2:32 pm,Sankaran,K
5,27/11/2015,2:33 pm,Shiva Nandham,😂😂😂😂😂
6,27/11/2015,2:33 pm,Sankaran,Raghul pavam
7,27/11/2015,2:34 pm,Vignesh,1
8,27/11/2015,2:35 pm,Iniyan,7
9,27/11/2015,2:35 pm,Vignesh,😂😂😛


In [11]:
class Message:
    """One chat entry, built from a four-element match tuple."""

    def __init__(self, matched_tuple):
        """Copy positions 0-3 of ``matched_tuple`` onto the instance.

        The positions map, in order, to ``date``, ``time``, ``sender`` and
        ``content``.
        """
        self.date, self.time, self.sender, self.content = (
            matched_tuple[position] for position in range(4)
        )

    def is_link(self):
        """Return True when the content begins with ``'http'``."""
        text = self.content
        return text.startswith('http')

    def is_media(self):
        """Return True when the content is exactly ``"<Media omitted>"``."""
        text = self.content
        return text == "<Media omitted>"

In [12]:
# Wrap each regex match tuple in a Message. `messages` is rebound to the new
# list, so entries that are already Message instances are kept unchanged;
# this makes re-running the cell safe instead of failing on Message(Message).
messages = [m if isinstance(m, Message) else Message(m) for m in messages]

In [13]:
# Per-sender summary: message total, number of distinct dates with at least
# one message, and how many messages are links / media placeholders.
user_message_count = {}

# A single groupby pass replaces re-filtering the whole frame per sender;
# sort=False keeps senders in order of first appearance, matching unique().
for sender, sender_df in df.groupby('sender', sort=False):
    user_message_count[sender] = {
        'total': len(sender_df),
        # Count distinct dates directly on the column rather than routing
        # the column lookup through agg().
        'active_days': sender_df['date'].nunique(),
        'links': 0,
        'media': 0,
    }

# Link/media flags come from the Message objects, tallied per sender.
for message in messages:
    sender_counts = user_message_count[message.sender]
    sender_counts['links'] += int(message.is_link())
    sender_counts['media'] += int(message.is_media())

In [14]:
# Display the per-sender counts dictionary.
user_message_count

{'Raghul 1': {'total': 5135, 'active_days': 637, 'links': 63, 'media': 320},
 'Shiva Nandham': {'total': 4803,
  'active_days': 617,
  'links': 44,
  'media': 320},
 'Sankaran': {'total': 1241, 'active_days': 332, 'links': 10, 'media': 210},
 'Vignesh': {'total': 3159, 'active_days': 476, 'links': 45, 'media': 232},
 'Iniyan': {'total': 4335, 'active_days': 449, 'links': 16, 'media': 184},
 'Sumesh': {'total': 1144, 'active_days': 146, 'links': 1, 'media': 23},
 'Naveen': {'total': 4945, 'active_days': 709, 'links': 121, 'media': 499},
 'arun Leo': {'total': 8541, 'active_days': 851, 'links': 71, 'media': 594},
 '+91 73581 90635': {'total': 547, 'active_days': 41, 'links': 0, 'media': 16},
 'Vishnu': {'total': 397, 'active_days': 23, 'links': 0, 'media': 0},
 'Mag Amma': {'total': 130, 'active_days': 25, 'links': 0, 'media': 8},
 '+91 99401 34173': {'total': 2, 'active_days': 1, 'links': 0, 'media': 0},
 'Gokul': {'total': 1169, 'active_days': 252, 'links': 15, 'media': 125},
 'Magesh'