In [5]:
import re
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import emoji
import itertools 
from collections import Counter
import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by the code below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [24]:
def rawToDf(file, key):
    '''Converts a raw chat-export .txt file into a Data Frame.

    Every message in the export is expected to begin with a date-time stamp
    followed by " - ", e.g. "09/12/2021, 4:32 pm - Name: text". Lines that do
    not start with a stamp are treated as continuations of the previous
    message.

    Parameters
    ----------
    file : str or path-like
        Path of the UTF-8 encoded text file to read.
    key : str
        Time style used by the stamps: '12hr' or '24hr'. 'custom' is an empty
        placeholder; fill in both of its patterns below before using it.

    Returns
    -------
    pandas.DataFrame
        One row per stamped entry with the columns 'date_time' (parsed
        timestamp), 'user' (sender name, or "group_notification" when the
        entry has no "name: " prefix) and 'message' (the entry text).

    Raises
    ------
    KeyError
        If `key` is not one of the format names defined below.
    ValueError
        Raised by pandas when a matched stamp does not fit the day/month/year
        format associated with `key`.
    '''

    # Raw strings keep the regex escapes (\d, \s) out of Python's own
    # string-escape processing, which warns about unknown escapes.
    split_formats = {
        '12hr' : r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24hr' : r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
        'custom' : ''
    }
    datetime_formats = {
        '12hr' : '%d/%m/%Y, %I:%M %p - ',
        '24hr' : '%d/%m/%Y, %H:%M - ',
        'custom': ''
    }
    stamp_pattern = split_formats[key]
    stamp_format = datetime_formats[key]

    with open(file, 'r', encoding='utf-8') as raw_data:
        # Join the lines with spaces so a multi-line message stays in one piece.
        raw_string = ' '.join(raw_data.read().split('\n'))

    # Splitting at every stamp leaves the "user: message" bodies; whatever
    # precedes the first stamp is discarded by [1:], so both lists line up.
    user_msg = re.split(stamp_pattern, raw_string)[1:]
    date_time = re.findall(stamp_pattern, raw_string)

    df = pd.DataFrame({'date_time': date_time})
    # The format covers the whole matched stamp, including the trailing " - ".
    df['date_time'] = pd.to_datetime(df['date_time'], format=stamp_format)

    # Lazy match up to the first "name: " prefix. maxsplit=1 is essential:
    # without it a message that itself contains ": " is split again and the
    # text stored for it is truncated (often to an empty string).
    sender_pattern = re.compile(r'([\w\W]+?):\s')

    usernames = []
    msgs = []
    for entry in user_msg:
        parts = sender_pattern.split(entry, maxsplit=1)
        if len(parts) > 1: # user typed messages -> ['', sender, text]
            usernames.append(parts[1])
            msgs.append(parts[2])
        else: # other notifications (eg: someone was added, some left ...)
            usernames.append("group_notification")
            msgs.append(parts[0])

    df['user'] = usernames
    df['message'] = msgs

    return df



In [25]:
# Parse the exported chat (12-hour time stamps) and print a text preview.
df = rawToDf(file='Advika.txt', key='12hr')
print(df)

               date_time                user  \
0    2021-12-09 16:32:00  group_notification   
1    2021-12-09 16:32:00  group_notification   
2    2022-01-21 21:57:00  group_notification   
3    2022-01-27 15:07:00  group_notification   
4    2022-09-13 21:19:00              Aditya   
...                  ...                 ...   
2361 2023-04-06 16:41:00              Aditya   
2362 2023-04-06 16:41:00              Advika   
2363 2023-04-06 16:42:00              Aditya   
2364 2023-04-06 16:43:00              Aditya   
2365 2023-04-06 16:43:00              Aditya   

                                                message  
0     Messages and calls are end-to-end encrypted. N...  
1     Your security code with Advika changed. Tap to...  
2     Your security code with Advika changed. Tap to...  
3     Your security code with Advika changed. Tap to...  
4                                      <Media omitted>   
...                                                 ...  
2361             

In [26]:
# Summarise column dtypes and non-null counts to confirm the parse.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2366 entries, 0 to 2365
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date_time  2366 non-null   datetime64[ns]
 1   user       2366 non-null   object        
 2   message    2366 non-null   object        
dtypes: datetime64[ns](1), object(2)
memory usage: 55.6+ KB


In [27]:
# Spot-check ten random rows; the fixed seed keeps the selection identical
# across kernel restarts so the displayed output is reproducible.
df.sample(10, random_state=42)

Unnamed: 0,date_time,user,message
1500,2022-12-20 23:56:00,Aditya,No while printing i will do that
1081,2022-12-11 20:00:00,Aditya,I have the code for login
1695,2023-02-12 12:52:00,Advika,<Media omitted>
138,2022-09-28 22:47:00,Aditya,Should I help u with this page or design login...
762,2022-12-04 11:49:00,Advika,I'll do it in ja
1789,2023-02-14 21:50:00,Aditya,Yea ok 👍
1760,2023-02-14 21:14:00,Aditya,U made it??
2251,2023-03-21 21:13:00,Aditya,Now it's showing
2168,2023-03-20 14:13:00,Advika,in that?
1438,2022-12-20 18:40:00,Advika,So if any one is satisfied it'll display


In [28]:

# Number of rows whose message text is an empty string.
len(df.loc[df['message'] == ""])

2

In [29]:
# Derive calendar fields from the parsed timestamp column.
df['day'] = df['date_time'].dt.strftime('%a')    # abbreviated weekday name
df['month'] = df['date_time'].dt.strftime('%b')  # abbreviated month name
df['year'] = df['date_time'].dt.year
df['date'] = df['date_time'].dt.date             # calendar date without the time part

In [30]:
# Display the frame, now including the derived day/month/year/date columns.
df

Unnamed: 0,date_time,user,message,day,month,year,date
0,2021-12-09 16:32:00,group_notification,Messages and calls are end-to-end encrypted. N...,Thu,Dec,2021,2021-12-09
1,2021-12-09 16:32:00,group_notification,Your security code with Advika changed. Tap to...,Thu,Dec,2021,2021-12-09
2,2022-01-21 21:57:00,group_notification,Your security code with Advika changed. Tap to...,Fri,Jan,2022,2022-01-21
3,2022-01-27 15:07:00,group_notification,Your security code with Advika changed. Tap to...,Thu,Jan,2022,2022-01-27
4,2022-09-13 21:19:00,Aditya,<Media omitted>,Tue,Sep,2022,2022-09-13
...,...,...,...,...,...,...,...
2361,2023-04-06 16:41:00,Aditya,Yea it doesn't,Thu,Apr,2023,2023-04-06
2362,2023-04-06 16:41:00,Advika,Yea only NLP for sentiment analysis,Thu,Apr,2023,2023-04-06
2363,2023-04-06 16:42:00,Aditya,See it's doing few thing no like sentiment ana...,Thu,Apr,2023,2023-04-06
2364,2023-04-06 16:43:00,Aditya,So we can tell her we have done for like predi...,Thu,Apr,2023,2023-04-06
