In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
# !pip install emoji
import emoji
from collections import Counter



## Data Preprocessing

* Regex cheatsheet
    * https://www.rexegg.com/regex-quickstart.html
* Datetime format
    * http://strftime.org/
    
```
def rawToDf(file):
    with open(file, 'r') as raw_data:
        raw_string = ' '.join(raw_data.read().split('\n')) # converting the list split by newline char. as one whole string as there can be multi-line messages
        user_msg = re.split('\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s', raw_string) [1:] # splits at all the date-time pattern, resulting in list of all the messages with user names
        date_time = re.findall('\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s', raw_string) # finds all the date-time patterns
        
        df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg}) # exporting it to a df
   
    # converting date-time pattern which is of type String to type datetime, format is to be specified for the whole string where the placeholders are extracted by the method    
    try:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%y, %I:%M %p - ') #10/20/19, 10:24 pm - 
    except:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%Y, %I:%M %p - ') #20/10/2019, 10:24 pm -
    
    # split user and msg 
    usernames = []
    msgs = []
    for i in df['user_msg']:
        a = re.split('([\w\W]+?):\s', i) # lazy pattern match to first {user_name}: pattern and spliting it aka each msg from a user
        if(a[1:]): # user typed messages
            usernames.append(a[1])
            msgs.append(a[2])
        else: # other notifications in the group(eg: someone was added, some left ...)
            usernames.append("grp_notif")
            msgs.append(a[0])

    # creating new columns         
    df['user'] = usernames
    df['msg'] = msgs

    # dropping the old user_msg col.
    df.drop('user_msg', axis=1, inplace=True)
    
    return df
```

In [11]:
file=open('WhatsApp Chat with Bca 2017 batch.txt', encoding='utf8')
x1=file.read()


In [3]:
def rawToDf(file):
    with open(file, encoding='utf8') as raw_data:
        raw_string = ' '.join(raw_data.read().split('\n')) # converting the list split by newline char. as one whole string as there can be multi-line messages
        user_msg = re.split('\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s', raw_string) [1:] # splits at all the date-time pattern, resulting in list of all the messages with user names
        date_time = re.findall('\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s', raw_string) # finds all the date-time patterns

        df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg}) # exporting it to a df

    # converting date-time pattern which is of type String to type datetime, format is to be specified for the whole string where the placeholders are extracted by the method    
    try:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%m/%d/%y, %I:%M %p - ') #10/20/19, 10:24 pm - %H:%M %p
    except:
        df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%Y, %I:%M %p - ') #20/10/2019, 10:24 pm - %I:%M %p

    # split user and msg 
    usernames = []
    msgs = []
    for i in df['user_msg']:
        a = re.split('([\w\W]+?):\s', i) # lazy pattern match to first {user_name}: pattern and spliting it aka each msg from a user
        if(a[1:]): # user typed messages
            usernames.append(a[1])
            msgs.append(a[2])
        else: # other notifications in the group(eg: someone was added, some left ...)
            usernames.append("grp_notif")
            msgs.append(a[0])

    # creating new columns         
    df['user'] = usernames
    df['msg'] = msgs

    # dropping the old user_msg col.
    df.drop('user_msg', axis=1, inplace=True)

    return df

In [4]:
me = "ankit (ip)"

In [5]:
file='WhatsApp Chat with Bca 2017 batch.txt'
df = pd.DataFrame()
df = rawToDf(file)

In [6]:
# https://regex101.com/     .........we can use this to check our regular expression

In [9]:
df[:20]

Unnamed: 0,date_time,user,msg
0,2017-03-20 22:12:00,+91 97170 21451,😁😁
1,2017-03-21 12:01:00,+91 86838 58555,Abb konsa aane denge tjhe
2,2017-03-21 12:01:00,ankit (ip),@918586850740 ni aaya?
3,2017-03-21 12:01:00,+91 86838 58555,Extempore ho liya
4,2017-03-21 12:01:00,ankit (ip),Aane degge✌🏻
5,2017-03-21 12:02:00,ankit (ip),TC bhi kr lio
6,2017-03-21 21:21:00,ankit (ip),Bhai iss se bhi khtrnak hota h darr🤣 jb teri b...
7,2017-03-21 21:28:00,+91 86838 58555,Bc bhott mushkil se bcha tha mai
8,2017-03-21 21:28:00,+91 86838 58555,Saala ldko pe b gandi nazar rkhta h ye toh
9,2017-03-21 21:28:00,akshay (clg),


### Data

In [6]:
df.shape

(6025, 3)

### No. of Images, group notifications and dropping them

In [7]:
images = df[df['msg']=="<Media omitted> "] #no. of images, images are represented by <media omitted>
images.shape

(610, 3)

In [8]:
grp_notif = df[df['user']=="grp_notif"] #no. of grp notifications
grp_notif.shape

(107, 3)

In [9]:
df.drop(images.index, inplace=True) #removing images
df.drop(grp_notif.index, inplace=True) #removing grp_notif

In [10]:
df.reset_index(inplace=True, drop=True)
df.shape

(5308, 3)

In [11]:
df.head()

Unnamed: 0,date_time,user,msg
0,2017-03-20 22:12:00,+91 97170 21451,😁😁
1,2017-03-21 12:01:00,+91 86838 58555,Abb konsa aane denge tjhe
2,2017-03-21 12:01:00,ankit (ip),@918586850740 ni aaya?
3,2017-03-21 12:01:00,+91 86838 58555,Extempore ho liya
4,2017-03-21 12:01:00,ankit (ip),Aane degge✌🏻


## Q 1)
## Who is the most active member of the group. Who is the least active. Is it same on weekday weekend?

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.weekday.html

> 1. Get Birth Date

In [12]:
df[df.msg.str.contains('birthday')].head()

Unnamed: 0,date_time,user,msg
702,2017-04-10 12:07:00,+91 85878 51712,Happy birthday bhai... GBU..
704,2017-04-10 12:08:00,priyanka clg,Happy birthday Harman💐🎂
705,2017-04-10 12:21:00,+91 95400 96186,Happy birthday harman
707,2017-04-10 12:25:00,Abhishek clg new,Happy birthday Bhai...🎉🎊🎀🎏
711,2017-04-10 12:32:00,geetanshu (ip),Happy birthday paaji 😇🎂


> 2. longest inactive person

In [13]:
c = 0
for i in df['user']:
    if i !="geetanshu (ip)":
        c = c + 1
    else:
        print(c)
        c = 0
        
        
# that means the person is not active at that particular msg index

420
0
2
1
1
282
511
4
0
2
0
0
5
0
404
3
7
54
52
7
34
42
67
0
5
1
97
5
12
47
7
112
21
65
7
167
2
23
72
97
103
64
598
6
21
327
209
53
15
5
1
2
4
3
20
42
23
30
4
3
46
15
22
46
1
0
4
0
0
1
0
3
427
30
7
22
154
61
104
93
8
2
0
0
0
3


> Who is the most active member of the group. Who is the least active. Is it same on weekday weekend?

In [15]:
df.groupby("user")["msg"].count().sort_values(ascending=False)

user
ankit (ip)           987
+91 86838 58555      959
Arshpreet clg        593
Rainy (ip)           360
harsh (ip)           342
+91 97170 21451      212
Jasmeet clg          207
Aman sir             177
akshay (clg)         111
+91 98912 39561      106
Divyansh Clg         101
khim (ip)             97
anisha clg new        96
geetanshu (ip)        86
Abhishek clg new      81
Rachit Clg            81
priyanka clg          74
shubham bisht clg     67
+91 96503 88232       65
danish clg            56
Suraj clg             55
+91 97187 85832       55
jagpreet clg          54
Param Clg             39
+91 95400 96186       31
+91 85878 51712       26
Rahul (ip)            21
Sandeep Clg           21
aakash😉               18
+91 95821 09417       18
+91 78386 74714       18
Raman clg             14
+91 88268 81006       14
+91 85869 24394       10
+91 75036 90358       10
Durghesh Clg           8
Harman Clg             8
+91 99689 60593        7
Suraj Clg New          7
khim ip 2           

In [16]:
df['weekday'] = df['date_time'].apply(lambda x: x.day_name()) # can use day_name or weekday from datetime 

In [17]:
df.weekday.value_counts(sort=True)

Thursday     1047
Wednesday     893
Monday        863
Tuesday       734
Friday        711
Sunday        684
Saturday      376
Name: weekday, dtype: int64

In [18]:
df['is_weekend'] = df.weekday.isin(['Sunday', 'Saturday'])

In [19]:
df.head()

Unnamed: 0,date_time,user,msg,weekday,is_weekend
0,2017-03-20 22:12:00,+91 97170 21451,😁😁,Monday,False
1,2017-03-21 12:01:00,+91 86838 58555,Abb konsa aane denge tjhe,Tuesday,False
2,2017-03-21 12:01:00,ankit (ip),@918586850740 ni aaya?,Tuesday,False
3,2017-03-21 12:01:00,+91 86838 58555,Extempore ho liya,Tuesday,False
4,2017-03-21 12:01:00,ankit (ip),Aane degge✌🏻,Tuesday,False


## Q 2)
## Count of all the emoticons that i have used till date.

In [20]:
df["user"].unique()

array(['+91 97170 21451', '+91 86838 58555', 'ankit (ip)', 'akshay (clg)',
       'Arshpreet clg', 'anisha clg new', '+91 96503 88232',
       'Divyansh Clg', 'Jasmeet clg', 'harsh (ip)', '+91 97187 85832',
       'Rainy (ip)', 'danish clg', 'Sandeep Clg', 'Aman sir', 'Param Clg',
       '+91 88268 81006', 'Abhishek clg new', 'priyanka clg',
       '+91 98912 39561', '+91 99689 60593', '+91 75036 90358',
       'Rachit Clg', '+91 95821 09417', '+91 99686 47566',
       'shubham bisht clg', 'aakash😉', 'Rahul (ip)', 'geetanshu (ip)',
       'jagpreet clg', '+91 78386 74714', '+91 85878 51712',
       '+91 95400 96186', 'Harman Clg', 'khim (ip)', 'Suraj clg',
       'Raman clg', '+91 85869 24394', '+91 99996 81576', 'niranjan (ip)',
       'Durghesh Clg', 'khim ip 2', '+91 99993 37057', 'Suraj Clg New'],
      dtype=object)

In [21]:
me = "ankit (ip)"

In [22]:
emoji_ctr = Counter()
emojis_list = map(lambda x: ''.join(x.split()), emoji.UNICODE_EMOJI.keys())
r = re.compile('|'.join(re.escape(p) for p in emojis_list))
for idx, row in df.iterrows():
    if row["user"] == me:
        emojis_found = r.findall(row["msg"])
        for emoji_found in emojis_found:
            emoji_ctr[emoji_found] += 1

In [52]:
for item in emoji_ctr.most_common():    # we use emoji_ctr.most_common(27) ..no of emoji
    print(item[0] + " - " + str(item[1]))

🤣 - 479
😒 - 359
😂 - 293
🏻 - 119
♂ - 57
☺ - 45
🤦 - 43
😅 - 42
😌 - 39
😍 - 30
😏 - 28
😆 - 21
😄 - 20
😪 - 20
👊 - 19
🤪 - 17
🙂 - 16
🤭 - 14
😊 - 12
😎 - 11
😓 - 9
😶 - 9
🙋 - 9
😡 - 9
😼 - 9
😝 - 8
😕 - 8
😁 - 8
🙄 - 7
🙏 - 7
✌ - 6
👇 - 6
🙊 - 5
🚶 - 5
🤚 - 5
😭 - 5
🤘 - 4
🙁 - 4
😇 - 4
😋 - 3
😜 - 3
🤓 - 3
🤔 - 3
👍 - 3
😀 - 3
😖 - 3
🤷 - 3
😠 - 3
😴 - 2
😃 - 2
😣 - 2
🙈 - 2
😤 - 2
🙆 - 2
🖕 - 2
✋ - 2
🍾 - 2
🎂 - 2
😘 - 2
☹ - 1
📱 - 1
😛 - 1
😥 - 1
🏃 - 1
👆 - 1
👀 - 1
🥃 - 1
🥂 - 1
🍻 - 1
🍺 - 1
🤫 - 1
🙇 - 1
