## Import Libraries

In [495]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

import emoji
import re

from transformers import pipeline
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/cesargarcia/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cesargarcia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cesargarcia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Data

In [496]:
with open('data/headchopper.txt') as f:
    data = f.read()

data = data.split('\n')
data = pd.DataFrame(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3968 entries, 0 to 3967
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3968 non-null   object
dtypes: object(1)
memory usage: 31.1+ KB


In [497]:
# Sample the last 500 rows
#data = data.sample(500)
#data = data.iloc[-500:]
#data.head()

## Clean Data

In [498]:
cleaned_data = data.copy()

In [499]:
# If it starts with [U+200E], remove the row
ltr_mark = '\u200e'
location_str = f'{ltr_mark}Location'
voice_call_str = f'{ltr_mark}Voice'
missed_call_str = f'{ltr_mark}Missed'
deleted_str = f'{ltr_mark}This message was deleted'

filtered_data = data[
    ~data.iloc[:, 0].str.startswith(ltr_mark) &
    ~data.iloc[:, 0].str.contains(location_str) &
    ~data.iloc[:, 0].str.contains(voice_call_str) &
    ~data.iloc[:, 0].str.contains(missed_call_str) &
    ~data.iloc[:, 0].str.contains(deleted_str)
]
cleaned_data = filtered_data.reset_index(drop=True)

In [500]:
# Aggregate messages that span multiple lines
aggregated_messages = []
current_message = ""

for i in range(len(cleaned_data)):
    line = cleaned_data.iloc[i].values[0]
    if line.startswith('['):  # Detects the start of a new message
        if current_message:
            aggregated_messages.append(current_message)
        current_message = line
    else:  # Continuation of the previous message
        current_message += ' ' + line

# Don't forget to append the last message
if current_message:
    aggregated_messages.append(current_message)

# Convert the list back to a DataFrame
cleaned_data = pd.DataFrame(aggregated_messages, columns=[data.columns[0]])


In [501]:
# truncate messages that are too long
for i in range(0, len(cleaned_data)):
    message = cleaned_data.iloc[i].values[0]
    if len(message) > 500:
        #print(message[:500])
        cleaned_data.iloc[i] = message[:500]

In [502]:
# total number of messages
len(cleaned_data)

3602

In [503]:
# Save as CSV
cleaned_data.to_csv('cleaned_file.csv', index=False, header=False)

## Feature Engineering

### Dates

In [504]:
sp_data = cleaned_data.copy()
sp_data['date'] = sp_data[0].str.extract(r'\[(\d+/\d+/\d+, \d+:\d+:\d+)\]')
sp_data['date'] = pd.to_datetime(sp_data['date'], format="%m/%d/%y, %H:%M:%S")
sp_data['day'] = sp_data['date'].dt.date.astype('str')
sp_data['hour'] = sp_data['date'].dt.hour
sp_data['day_of_week'] = sp_data['date'].dt.day_name()

#sp_data.head()

### Messages

In [505]:
dt_data = sp_data.copy()
dt_data['sender'] = sp_data[0].str.extract(r'\[\d+/\d+/\d+, \d+:\d+:\d+\] ([a-zA-z]+):.*')
dt_data['message'] = sp_data[0].str.extract(r'\[\d+/\d+/\d+, \d+:\d+:\d+\] [a-zA-z]+:(.*)')
dt_data['sender'] = dt_data['sender'].str.strip()
dt_data['message'] = dt_data['message'].str.strip()
#dt_data.head(10)

## Data Store


In [506]:
ds_data = dt_data.drop(columns=0)
#dt_data.head(10)

# Data Exploration

In [507]:
ds_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3602 entries, 0 to 3601
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         3602 non-null   datetime64[ns]
 1   day          3602 non-null   object        
 2   hour         3602 non-null   int32         
 3   day_of_week  3602 non-null   object        
 4   sender       3602 non-null   object        
 5   message      3602 non-null   object        
dtypes: datetime64[ns](1), int32(1), object(4)
memory usage: 154.9+ KB


In [508]:
#filter the data so that it is only one user
users = ds_data['sender'].unique()
user1 = users[0]
user2 = users[1]
user1_data = ds_data[ds_data['sender'] == user1]
user2_data = ds_data[ds_data['sender'] == user2]

### Messages

In [299]:
ds_data['sender'].value_counts()

sender
Headchopper    1865
Cesar          1753
Name: count, dtype: int64

In [300]:
top_10 = ds_data['sender'].value_counts()
#plot a pie chart
fig = px.pie(values=top_10, names=top_10.index, title='Number of messages sent by each user')
fig.show()


In [301]:
#plot the date and time of the messages per day
fig = px.bar(ds_data['date'].dt.date.value_counts(), title='Number of messages per day')
fig.update_layout(xaxis_title='Date', yaxis_title='Number of messages')
fig.show()


In [303]:
#plot the number of messages per hour
fig = px.bar(ds_data['hour'].value_counts(), title='Number of messages per hour')
fig.update_layout(xaxis_title='Hour', yaxis_title='Number of messages')
fig.show()

In [327]:
# Plot number of messages per day of the week
fig = px.bar(ds_data['day_of_week'].value_counts(), title='Number of messages per day of the week')
fig.update_layout(xaxis_title='Day of the week', yaxis_title='Number of messages')
fig.update_xaxes(categoryorder='array', categoryarray=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
fig.show()

### Emojis

In [467]:
# most used emojis per user
def get_emojis(text):
        emojis = emoji.distinct_emoji_list(text)
        return emojis

In [469]:
# Get the favorite emoji of each user
user1_data = user1_data.copy()
user2_data = user2_data.copy()
user1_data['emoji'] = user1_data['message'].apply(get_emojis)
user2_data['emoji'] = user2_data['message'].apply(get_emojis)

user1_emojis = sum(user1_data['emoji'].tolist(), [])
user2_emojis = sum(user2_data['emoji'].tolist(), [])

user1_emojis = pd.Series(user1_emojis).value_counts()
user2_emojis = pd.Series(user2_emojis).value_counts()

# get the most used emoji
print(f'{user1} most used emoji is {user1_emojis.idxmax()}')
print(f'{user2} most used emoji is {user2_emojis.idxmax()}')



Headchopper most used emoji is 😂
Cesar most used emoji is 😉


### Words

In [516]:
# remove symbols and emojis from the messages
def remove_symbols(text):
    text = emoji.replace_emoji(text, replace='')
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

def tokenize(text):
    return nltk.word_tokenize(text)

stop_words = set(stopwords.words('english'))
extra_stop_words = {'im', 'u', 'get', 'dont', 'thats', 'ill'}
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words and word not in extra_stop_words]

def clean_words(df):
    df['message'] = df['message'].apply(remove_symbols)
    df['message'] = df['message'].str.lower()
    df['tokens'] = df['message'].apply(tokenize)
    df['tokens'] = df['tokens'].apply(remove_stopwords)
    return df



In [None]:
# This takes too long
'''
from spellchecker import SpellChecker

def correct_spelling(tokens):
    spell = SpellChecker()
    corrected_tokens = [spell.correction(word) for word in tokens]
    return corrected_tokens

wd_data['tokens'] = wd_data['tokens'].apply(correct_spelling)
'''

In [431]:
# This will not work properly. You need to specify if the word is a noun, verb, etc.
'''
from nltk.stem import WordNetLemmatizer

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

wd_data['tokens'] = wd_data['tokens'].apply(lemmatize_text)
'''

In [517]:
# count then number of unique words
user1_data = clean_words(user1_data.copy())
user2_data = clean_words(user2_data.copy())

def number_of_unique_words(df):
    unique_words = set()
    for tokens in df['tokens']:
        unique_words.update(tokens)
    return len(unique_words)

print(f'{user1} has {number_of_unique_words(user1_data)} unique words')
print(f'{user2} has {number_of_unique_words(user2_data)} unique words')

Headchopper has 2123 unique words
Cesar has 1926 unique words


In [518]:
# top 5 favorite words
user1_words = sum(user1_data['tokens'].tolist(), [])
user2_words = sum(user2_data['tokens'].tolist(), [])

user1_words = pd.Series(user1_words).value_counts()
user2_words = pd.Series(user2_words).value_counts()

print(f'{user1} favorite words are {user1_words.index[:10].tolist()}')
print(f'{user2} favorite words are {user2_words.index[:10].tolist()}')

Headchopper favorite words are ['ok', 'like', 'lol', 'go', 'na', 'ur', 'yea', 'good', 'think', 'oh']
Cesar favorite words are ['like', 'want', 'going', 'know', 'go', 'right', 'time', 'got', 'one', 'think']


# Sentiment Analysis

### NLTK Vader

In [None]:
sid = SentimentIntensityAnalyzer()

In [None]:
%%time
# Calculate sentiment scores
vd_data = ds_data.copy()
vd_data['sentiment_2'] = vd_data['message'].apply(lambda x: sid.polarity_scores(x))
vd_data['compound'] = vd_data['sentiment_2'].apply(lambda x: x['compound'])
vd_data['neg'] = vd_data['sentiment_2'].apply(lambda x: x['neg'])
vd_data['neu'] = vd_data['sentiment_2'].apply(lambda x: x['neu'])
vd_data['pos'] = vd_data['sentiment_2'].apply(lambda x: x['pos'])
vd_data['sentiment_3'] = vd_data['compound'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))
vd_data.head()

### Huggingface Pipeline

In [304]:
sent_pipeline = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [305]:
example = ds_data['message'].tolist()[1]
#print(example)
sent_pipeline(example)

[{'label': 'POSITIVE', 'score': 0.9977076053619385}]

In [306]:
%%time
#print number of messages
total_messages = len(ds_data['message'])
print(f'Total of {total_messages} messages')

#Apply the sentiment analysis to all the messages
messages = dt_data['message'].tolist()
for index, message in enumerate(messages):
    if type(message) != str:
        print('Not a string')
        print(f"Index: {index}")
        
sentiments = sent_pipeline(messages)

#Add the sentiment to the dataframe
ds_data['sentiment'] = [sent['label'] for sent in sentiments]
ds_data['score'] = [sent['score'] for sent in sentiments]
#dt_data.head()

Total of 3618 messages
CPU times: user 58.8 s, sys: 49 s, total: 1min 47s
Wall time: 1min 40s


In [317]:
# if sentiment is negative, then turn score to negative
ds_data['score'] = ds_data['score'].where(ds_data['sentiment'] == 'POSITIVE', -ds_data['score'])
#ds_data.head()

Unnamed: 0,date,day,hour,day_of_week,sender,message,sentiment,score
0,2023-09-03 00:30:58,2023-09-03,0,Sunday,Headchopper,‎Messages and calls are end-to-end encrypted. ...,NEGATIVE,-0.996267
1,2023-09-03 00:30:58,2023-09-03,0,Sunday,Headchopper,Hey Cesar! My event is wrapping up and I’m thi...,POSITIVE,0.997708
2,2023-09-03 00:32:43,2023-09-03,0,Sunday,Cesar,Heyy,POSITIVE,0.996839
3,2023-09-03 00:32:52,2023-09-03,0,Sunday,Cesar,Come,POSITIVE,0.99841
4,2023-09-03 00:32:55,2023-09-03,0,Sunday,Cesar,Diego de leon 40,POSITIVE,0.991599


### Data Visualization

In [318]:
#Average sentiment score by sender
avg_sentiment = ds_data.groupby('sender')['score'].mean()
avg_sentiment = avg_sentiment.sort_values(ascending=False)
avg_sentiment

sender
Headchopper   -0.098224
Cesar         -0.126383
Name: score, dtype: float64

In [319]:
#Percentage of sentiment per sender
sentiment_counts = ds_data.groupby('sender')['sentiment'].value_counts(normalize=True)
sentiment_counts

sender       sentiment
Cesar        NEGATIVE     0.565317
             POSITIVE     0.434683
Headchopper  NEGATIVE     0.551206
             POSITIVE     0.448794
Name: proportion, dtype: float64

In [320]:
# Sample DataFrame structure
# df = pd.read_csv('your_file.csv', parse_dates=['date'])

# Separate data by sender
person1_data = ds_data[ds_data['sender'] == 'Cesar']
person2_data = ds_data[ds_data['sender'] == 'Headchopper']

# Calculate average sentiment scores
avg_sentiment_person1 = person1_data['score'].mean()
avg_sentiment_person2 = person2_data['score'].mean()

print(f"Average sentiment score for Cesar: {avg_sentiment_person1}")
print(f"Average sentiment score for Headchopper: {avg_sentiment_person2}")

# Create a rolling average to smooth out the trend
person1_data['rolling_avg'] = person1_data['score'].rolling(window=20).mean()
person2_data['rolling_avg'] = person2_data['score'].rolling(window=20).mean()

# Create a Plotly figure
fig = go.Figure()

# Add rolling average sentiment for Cesar
fig.add_trace(go.Scatter(
    x=person1_data['date'], y=person1_data['rolling_avg'],
    mode='lines+markers', name='Cesar (Rolling Avg)',
    line=dict(color='blue'), marker=dict(symbol='circle')
))

# Add rolling average sentiment for Headchopper
fig.add_trace(go.Scatter(
    x=person2_data['date'], y=person2_data['rolling_avg'],
    mode='lines+markers', name='Headchopper (Rolling Avg)',
    line=dict(color='red'), marker=dict(symbol='x')
))

# Customize the layout
fig.update_layout(
    title='Sentiment Scores Over Time (Rolling Average)',
    xaxis_title='Date',
    yaxis_title='Sentiment Score (Rolling Average)',
    legend_title='Sender',
    template='plotly_white'
)

# Show the figure
fig.show()


Average sentiment score for Cesar: -0.12638314638692721
Average sentiment score for Headchopper: -0.09822446190959327




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [323]:
# Convert the date column to date only (no time)
ds_data['date'] = pd.to_datetime(ds_data['date']).dt.date

# Group by date and sender, then calculate the average sentiment score for each group
daily_sentiment = ds_data.groupby(['date', 'sender'])['score'].mean().reset_index()

# Separate data by sender
person1_data = daily_sentiment[daily_sentiment['sender'] == 'Cesar']
person2_data = daily_sentiment[daily_sentiment['sender'] == 'Headchopper']

# Create a Plotly figure
fig = go.Figure()

# Add daily average sentiment for Cesar
fig.add_trace(go.Scatter(
    x=person1_data['date'], y=person1_data['score'],
    mode='lines+markers', name='Cesar (Daily Avg)',
    line=dict(color='blue'), marker=dict(symbol='circle')
))

# Add daily average sentiment for Headchopper
fig.add_trace(go.Scatter(
    x=person2_data['date'], y=person2_data['score'],
    mode='lines+markers', name='Headchopper (Daily Avg)',
    line=dict(color='red'), marker=dict(symbol='x')
))

# Customize the layout
fig.update_layout(
    title='Daily Average Sentiment Scores Over Time',
    xaxis_title='Date',
    yaxis_title='Average Sentiment Score',
    legend_title='Sender',
    template='plotly_white'
)

# Show the figure
fig.show()