## Import Libraries

In [1]:
import pandas as pd
import plotly.express as px
import plotly

## Load Data

In [2]:
with open('data/headchopper.txt') as f:
    data = f.read()

data = data.split('\n')
data = pd.DataFrame(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3968 entries, 0 to 3967
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3968 non-null   object
dtypes: object(1)
memory usage: 31.1+ KB


In [249]:
# Sample the last 500 rows
#data = data.sample(500)
data = data.iloc[-500:]
#data.head()

## Clean Data

In [4]:
cleaned_data = data.copy()

In [5]:
# If it starts with [U+200E], remove the row
ltr_mark = '\u200e'
location_str = f'{ltr_mark}Location'
voice_call_str = f'{ltr_mark}Voice call'
missed_call_str = f'{ltr_mark}Missed voice call'

filtered_data = data[
    ~data.iloc[:, 0].str.startswith(ltr_mark) &
    ~data.iloc[:, 0].str.contains(location_str) &
    ~data.iloc[:, 0].str.contains(voice_call_str) &
    ~data.iloc[:, 0].str.contains(missed_call_str)
]
cleaned_data = filtered_data.reset_index(drop=True)

In [6]:
# Aggregate messages that span multiple lines
aggregated_messages = []
current_message = ""

for i in range(len(data)):
    line = data.iloc[i].values[0]
    if line.startswith('['):  # Detects the start of a new message
        if current_message:
            aggregated_messages.append(current_message)
        current_message = line
    else:  # Continuation of the previous message
        current_message += ' ' + line

# Don't forget to append the last message
if current_message:
    aggregated_messages.append(current_message)

# Convert the list back to a DataFrame
cleaned_data = pd.DataFrame(aggregated_messages, columns=[data.columns[0]])


In [7]:
# truncate messages that are too long
for i in range(0, len(cleaned_data)):
    message = cleaned_data.iloc[i].values[0]
    if len(message) > 500:
        #print(message[:500])
        cleaned_data.iloc[i] = message[:500]

In [8]:
# total number of messages
len(cleaned_data)

3618

In [255]:
# Save as CSV
#cleaned_data.to_csv('cleaned_file.csv', index=False, header=False)

## Feature Engineering

### Dates

In [9]:
sp_data = cleaned_data.copy()
sp_data['date'] = sp_data[0].str.extract(r'\[(\d+/\d+/\d+, \d+:\d+:\d+)\]')
sp_data['date'] = pd.to_datetime(sp_data['date'], format="%m/%d/%y, %H:%M:%S")
sp_data['day'] = sp_data['date'].dt.date.astype('str')
sp_data['hour'] = sp_data['date'].dt.hour
sp_data['day_of_week'] = sp_data['date'].dt.day_name()

#sp_data.head()

### Messages

In [10]:
dt_data = sp_data.copy()
dt_data['sender'] = sp_data[0].str.extract(r'\[\d+/\d+/\d+, \d+:\d+:\d+\] ([a-zA-z]+):.*')
dt_data['message'] = sp_data[0].str.extract(r'\[\d+/\d+/\d+, \d+:\d+:\d+\] [a-zA-z]+:(.*)')
dt_data['sender'] = dt_data['sender'].str.strip()
dt_data['message'] = dt_data['message'].str.strip()
#dt_data.head(10)

## Data Store


In [11]:
ds_data = dt_data.drop(columns=0)
#dt_data.head(10)

# Data Exploration

In [12]:
ds_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3618 entries, 0 to 3617
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         3618 non-null   datetime64[ns]
 1   day          3618 non-null   object        
 2   hour         3618 non-null   int32         
 3   day_of_week  3618 non-null   object        
 4   sender       3618 non-null   object        
 5   message      3618 non-null   object        
dtypes: datetime64[ns](1), int32(1), object(4)
memory usage: 155.6+ KB


In [42]:
ds_data.head()
# First message is an automated one from WhatsApp, should be removed

Unnamed: 0,date,day,hour,day_of_week,sender,message
0,2023-09-03 00:30:58,2023-09-03,0,Sunday,Headchopper,‎Messages and calls are end-to-end encrypted. ...
1,2023-09-03 00:30:58,2023-09-03,0,Sunday,Headchopper,Hey Cesar! My event is wrapping up and I’m thi...
2,2023-09-03 00:32:43,2023-09-03,0,Sunday,Cesar,Heyy
3,2023-09-03 00:32:52,2023-09-03,0,Sunday,Cesar,Come
4,2023-09-03 00:32:55,2023-09-03,0,Sunday,Cesar,Diego de leon 40


In [44]:
ds_data = ds_data.drop(0,axis="index")

In [46]:
ds_data['sender'].value_counts()

sender
Headchopper    1864
Cesar          1753
Name: count, dtype: int64

In [47]:
top_10 = ds_data['sender'].value_counts()
#plot a pie chart
fig = px.pie(values=top_10, names=top_10.index, title='Number of messages sent by each user')
fig.show()


In [50]:
#plot the number of messages over time
fig = px.bar(ds_data['date'].dt.date.value_counts(), title='Number of messages over time')
fig.update_layout(xaxis_title='Date', yaxis_title='Number of messages',showlegend=False)
fig.show()


In [263]:
#mask = ds_data['day'] == '2023-12-15'
#ds_data[mask]

In [38]:
ds_data 


Unnamed: 0,date,day,hour,day_of_week,sender,message
0,2023-09-03 00:30:58,2023-09-03,0,Sunday,Headchopper,‎Messages and calls are end-to-end encrypted. ...
1,2023-09-03 00:30:58,2023-09-03,0,Sunday,Headchopper,Hey Cesar! My event is wrapping up and I’m thi...
2,2023-09-03 00:32:43,2023-09-03,0,Sunday,Cesar,Heyy
3,2023-09-03 00:32:52,2023-09-03,0,Sunday,Cesar,Come
4,2023-09-03 00:32:55,2023-09-03,0,Sunday,Cesar,Diego de leon 40
...,...,...,...,...,...,...
3613,2024-05-14 13:28:46,2024-05-14,13,Tuesday,Headchopper,Sad
3614,2024-05-14 13:28:52,2024-05-14,13,Tuesday,Headchopper,We can get a room?
3615,2024-05-14 13:28:56,2024-05-14,13,Tuesday,Cesar,I got a room
3616,2024-05-14 13:29:03,2024-05-14,13,Tuesday,Cesar,First floor


In [52]:
#plot the number of messages by day of week
fig = px.bar(ds_data['day_of_week'].value_counts(), title='Number of messages per weekday')
fig.update_layout(xaxis_title='Day', yaxis_title='Number of messages',showlegend=False)
fig.update_xaxes(categoryorder='array', categoryarray= ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
fig.show()

In [53]:
#plot the number of messages per hour
fig = px.bar(ds_data['hour'].value_counts(), title='Number of messages per hour')
fig.update_layout(xaxis_title='Hour', yaxis_title='Number of messages',showlegend=False)
fig.show()

# Sentiment Analysis

In [None]:
from transformers import pipeline

In [266]:
sent_pipeline = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [267]:
example = ds_data['message'].tolist()[1]
#print(example)
sent_pipeline(example)

[{'label': 'NEGATIVE', 'score': 0.9643456935882568}]

In [268]:
%%time
#print number of messages
total_messages = len(ds_data['message'])
print(f'Total of {total_messages} messages')

#Apply the sentiment analysis to all the messages
messages = dt_data['message'].tolist()
for index, message in enumerate(messages):
    if type(message) != str:
        print('Not a string')
        print(f"Index: {index}")
        
sentiments = sent_pipeline(messages)

#Add the sentiment to the dataframe
ds_data['sentiment'] = [sent['label'] for sent in sentiments]
ds_data['score'] = [sent['score'] for sent in sentiments]
#dt_data.head()

Total of 474 messages
CPU times: user 7.8 s, sys: 6.54 s, total: 14.3 s
Wall time: 13.6 s


In [274]:
# if sentiment is negative, then turn score to negative
ds_data['score'] = ds_data['score'].where(ds_data['sentiment'] == 'POSITIVE', -ds_data['score'])
#dt_data.head()

In [275]:
#Average sentiment score by sender
avg_sentiment = ds_data.groupby('sender')['score'].mean()
avg_sentiment = avg_sentiment.sort_values(ascending=False)
avg_sentiment

sender
Headchopper   -0.030730
Cesar         -0.209388
Name: score, dtype: float64

In [276]:
#Percentage of sentiment per sender
sentiment_counts = ds_data.groupby('sender')['sentiment'].value_counts(normalize=True)
sentiment_counts

sender       sentiment
Cesar        NEGATIVE     0.603604
             POSITIVE     0.396396
Headchopper  NEGATIVE     0.515873
             POSITIVE     0.484127
Name: proportion, dtype: float64