First test in notebook for correlation

In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger
import warnings
from textblob import TextBlob

# Suppress every FutureWarning raised from here on, so the cell outputs below
# are not cluttered with such notices.
warnings.filterwarnings("ignore", category=FutureWarning)

In [17]:
import tomllib

# Parse the TOML configuration that sits one directory above this notebook.
configfile = Path("../config.toml").resolve()
with configfile.open("rb") as f:
    config = tomllib.load(f)

# Build the dataset path from the "processed" and "current" config entries,
# relative to the parent directory.
datafile = (Path("..") / config["processed"] / config["current"]).resolve()
if not datafile.exists():
    # Fail fast. Previously the cell only logged this hint and then fell through
    # to pd.read_parquet, which failed on the missing file without mentioning
    # the preprocessing step. The exception type is the one read_parquet raises
    # for a missing path, so the failure mode is unchanged -- only clearer.
    missing_msg = (
        "Datafile does not exist. First run src/preprocess.py, and check the timestamp!"
    )
    logger.warning(missing_msg)
    raise FileNotFoundError(f"{missing_msg} (looked for: {datafile})")

# Load the dataset and preview the first rows as the cell output.
df = pd.read_parquet(datafile)
df.head()

Unnamed: 0,timestamp,message,living_in_city,has_emoji,word_count,react_time_sec,react_time_min,react_time_hr,is_image,is_empty_message,is_removed_message,sentiment_polarity,sentiment_category,tech_background,author
0,2022-09-16 08:21:00+00:00,Wachten op dit bericht,0,False,4.0,,,,0,1,0,0.0,Neutral,0,eye-catching-pelican
1,2022-09-16 08:23:00+00:00,Wachten op dit bericht,0,False,4.0,120.0,2.0,0.033333,0,1,0,0.0,Neutral,0,piebald-coyote
2,2022-09-16 09:08:00+00:00,Wachten op dit bericht,0,False,4.0,2700.0,45.0,0.75,0,1,0,0.0,Neutral,0,chuckling-ibis
3,2022-09-16 09:23:00+00:00,Wachten op dit bericht,0,False,4.0,900.0,15.0,0.25,0,1,0,0.0,Neutral,0,roguish-shark
4,2022-09-16 09:24:00+00:00,Wachten op dit bericht,0,False,4.0,60.0,1.0,0.016667,0,1,0,0.0,Neutral,0,roguish-shark


options:
- word_count and has_emoji (maybe a negative correlation)
- word_count and react_time (short messages can be answered more quickly) - straightforward
- react_time and user activity (negative correlation)
- Sentiment between users

Sentiment correlation between users

In [10]:
# Count messages per (author, sentiment label) pair and pivot the labels into
# columns; combinations that never occur are filled with 0.
sentiment_counts = (
    df.groupby(["author", "sentiment_category"])
    .size()
    .unstack(fill_value=0)
)

# Row-wise sum over the sentiment columns gives each author's message total.
sentiment_counts["Total Messages"] = sentiment_counts.sum(axis="columns")

# Present the columns in a fixed order; a label missing from the data shows up
# as an all-zero column instead of being dropped.
sentiment_order = ["Positive", "Neutral", "Negative", "Total Messages"]
result_df = sentiment_counts.reindex(columns=sentiment_order, fill_value=0)

print(result_df)

sentiment_category  Positive  Neutral  Negative  Total Messages
author                                                         
giggly-xenops            112     1061        22            1195
goofy-chimpanzee         130     1027        25            1182
goofy-wombat              68     1219        13            1300
hilarious-human           75      745        17             837
jubilant-goshawk         190     1406        31            1627
quirky-pony              130      914        18            1062
radiant-bee              128     1100        53            1281
rubbery-butterfly         91     1028        31            1150
whimsical-gorilla         33      459         9             501


The sentiment counts per author do not give really useful insights. Let's look at other correlations.

In [14]:
# Select the rows whose sentiment label is exactly "Negative".
negative_messages = df.loc[df["sentiment_category"].eq("Negative")]

# Show the author and sentiment label next to the message text.
print(negative_messages.loc[:, ["author", "sentiment_category", "message"]])

                  author sentiment_category  \
10           quirky-pony           Negative   
54      goofy-chimpanzee           Negative   
141          quirky-pony           Negative   
348    rubbery-butterfly           Negative   
405      hilarious-human           Negative   
...                  ...                ...   
9856        goofy-wombat           Negative   
9953         radiant-bee           Negative   
9954        goofy-wombat           Negative   
10083   jubilant-goshawk           Negative   
10125  rubbery-butterfly           Negative   

                                                 message  
10                  Sorry man ik heb feestje van familie  
54                                         Nee man sorry  
141        Hoe the fuck krijg je brobbey tegen de vlakte  
348           Tegen half 9 - 9 uur zou ik nog wel kunnen  
405                                      Half 9 is prima  
...                                                  ...  
9856                  

Word count per user

In [18]:
# Summary statistics of the word_count column for every author, written as a
# named aggregation: each key is the output column label, each value is the
# reducer applied to word_count within the group.
word_count_stats = df.groupby("author")["word_count"].agg(
    **{
        "Min Word Count": "min",
        "Median Word Count": "median",
        "Average Word Count": "mean",
        "Max Word Count": "max",
    }
)

print(word_count_stats)

                               Min Word Count  Median Word Count  \
author                                                             
carbonated-red-eyed tree frog             1.0                5.0   
chuckling-ibis                            1.0                4.0   
eye-catching-pelican                      1.0                4.0   
frothy-barracuda                          1.0                5.0   
hypnotic-stinkbug                         1.0                3.0   
piebald-coyote                            1.0                5.0   
roguish-shark                             1.0                5.0   
silky-jellyfish                           1.0                4.0   
whimsical-human                           1.0                5.0   

                               Average Word Count  Max Word Count  
author                                                             
carbonated-red-eyed tree frog            6.688172           161.0  
chuckling-ibis                           5.4269

Split into groups with a technical and a non-technical background

In [None]:
# Summary statistics of the word_count column, grouped by the value of the
# tech_background column instead of by author. Named aggregation: each key is
# the output column label, each value is the reducer applied within the group.
tech_word_count_stats = df.groupby("tech_background")["word_count"].agg(
    **{
        "Min Word Count": "min",
        "Median Word Count": "median",
        "Average Word Count": "mean",
        "Max Word Count": "max",
    }
)

print(tech_word_count_stats)