First test in notebook for correlation

In [39]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from loguru import logger
import warnings
from textblob import TextBlob

warnings.filterwarnings("ignore", category=FutureWarning)

In [40]:
import tomllib

# Parse the TOML configuration; the path is relative to the current working directory.
configfile = Path("../config.toml").resolve()
config = tomllib.loads(configfile.read_bytes().decode())

# The data file location is assembled from the "processed" and "current" config entries.
datafile = Path("..", config["processed"], config["current"]).resolve()
if not datafile.exists():
    logger.warning(
        "Datafile does not exist. First run src/preprocess.py, and check the timestamp!"
    )

# Load the parquet file into the notebook-wide frame and preview the first rows.
df = pd.read_parquet(datafile)
df.head()

Unnamed: 0,timestamp,message,living_in_city,tech_background,author,has_emoji,word_count,react_time_sec,react_time_min,react_time_hr,is_image,is_empty_message,is_removed_message,sentiment_polarity,sentiment_category
0,2022-09-16 07:31:00+00:00,Wachten op dit bericht,1,1,spattered-duck,False,4.0,,,,0,1,0,0.0,Neutral
1,2022-09-16 07:41:00+00:00,Wachten op dit bericht,0,1,riotous-dingo,False,4.0,600.0,10.0,0.166667,0,1,0,0.0,Neutral
2,2022-09-16 08:21:00+00:00,Wachten op dit bericht,0,0,translucent-dog,False,4.0,2400.0,40.0,0.666667,0,1,0,0.0,Neutral
3,2022-09-16 08:23:00+00:00,Wachten op dit bericht,0,0,hypnotic-rabbit,False,4.0,120.0,2.0,0.033333,0,1,0,0.0,Neutral
4,2022-09-16 09:08:00+00:00,Wachten op dit bericht,0,1,crystalline-uakari,False,4.0,2700.0,45.0,0.75,0,1,0,0.0,Neutral


options:
- word_count and has_emoji (maybe a negative correlation)
- word_count and react_time (short messages can be responded to more quickly) - straightforward
- react_time and user activity (negative correlation)
- Sentiment between users

Sentiment correlation between users

In [10]:
# Count messages per (author, sentiment category) pair, then pivot the
# categories into columns; combinations that never occur become 0.
messages_per_pair = df.groupby(['author', 'sentiment_category']).size()
sentiment_counts = messages_per_pair.unstack(fill_value=0)

# Row total across all sentiment columns.
sentiment_counts['Total Messages'] = sentiment_counts.sum(axis=1)

# Fixed column order for display; a category missing from the data is added as a column of zeros.
sentiment_order = ['Positive', 'Neutral', 'Negative', 'Total Messages']
result_df = sentiment_counts.reindex(columns=sentiment_order, fill_value=0)

print(result_df)

sentiment_category  Positive  Neutral  Negative  Total Messages
author                                                         
giggly-xenops            112     1061        22            1195
goofy-chimpanzee         130     1027        25            1182
goofy-wombat              68     1219        13            1300
hilarious-human           75      745        17             837
jubilant-goshawk         190     1406        31            1627
quirky-pony              130      914        18            1062
radiant-bee              128     1100        53            1281
rubbery-butterfly         91     1028        31            1150
whimsical-gorilla         33      459         9             501


The sentiment counts do not give really useful insights. Let's look at other correlations.

In [14]:
# Select the rows whose sentiment category is 'Negative'.
is_negative = df['sentiment_category'].eq('Negative')
negative_messages = df[is_negative]

# Show who wrote each negative message together with its text.
columns_to_show = ['author', 'sentiment_category', 'message']
print(negative_messages[columns_to_show])

                  author sentiment_category  \
10           quirky-pony           Negative   
54      goofy-chimpanzee           Negative   
141          quirky-pony           Negative   
348    rubbery-butterfly           Negative   
405      hilarious-human           Negative   
...                  ...                ...   
9856        goofy-wombat           Negative   
9953         radiant-bee           Negative   
9954        goofy-wombat           Negative   
10083   jubilant-goshawk           Negative   
10125  rubbery-butterfly           Negative   

                                                 message  
10                  Sorry man ik heb feestje van familie  
54                                         Nee man sorry  
141        Hoe the fuck krijg je brobbey tegen de vlakte  
348           Tegen half 9 - 9 uur zou ik nog wel kunnen  
405                                      Half 9 is prima  
...                                                  ...  
9856                  

Word count per user

In [18]:
# Output column label -> aggregation applied to each author's word counts.
word_count_aggregations = {
    'Min Word Count': 'min',
    'Median Word Count': 'median',
    'Average Word Count': 'mean',
    'Max Word Count': 'max',
}

# One row per author, one column per labelled aggregation.
word_count_stats = (
    df.groupby('author')['word_count']
    .agg(list(word_count_aggregations.items()))
)

print(word_count_stats)

                               Min Word Count  Median Word Count  \
author                                                             
carbonated-red-eyed tree frog             1.0                5.0   
chuckling-ibis                            1.0                4.0   
eye-catching-pelican                      1.0                4.0   
frothy-barracuda                          1.0                5.0   
hypnotic-stinkbug                         1.0                3.0   
piebald-coyote                            1.0                5.0   
roguish-shark                             1.0                5.0   
silky-jellyfish                           1.0                4.0   
whimsical-human                           1.0                5.0   

                               Average Word Count  Max Word Count  
author                                                             
carbonated-red-eyed tree frog            6.688172           161.0  
chuckling-ibis                           5.4269

Split into groups with a technical and a non-technical background

In [29]:
# Output column label -> aggregation applied to the word counts of each group.
labelled_aggregations = [
    ('Min Word Count', 'min'),
    ('Median Word Count', 'median'),
    ('Average Word Count', 'mean'),
    ('Max Word Count', 'max'),
]

# One row per distinct value of the tech_background flag.
word_counts_by_background = df.groupby('tech_background')['word_count']
tech_word_count_stats = word_counts_by_background.agg(labelled_aggregations)

print(tech_word_count_stats)

                 Min Word Count  Median Word Count  Average Word Count  \
tech_background                                                          
0                           1.0                5.0            6.932708   
1                           1.0                4.0            5.531656   

                 Max Word Count  
tech_background                  
0                         238.0  
1                          95.0  


Calculate the correlation between two attributes. Check data types

In [41]:
# Print the dtype of each of the two columns that go into the correlation.
for column in ('tech_background', 'word_count'):
    print(df[column].dtype)

int64
float64


In [42]:
from scipy import stats

# Point-biserial correlation between the tech_background flag and the message
# word count, computed on the unfiltered frame.
# NOTE: the recorded run printed nan for both values; the notebook later reports
# missing values in 'word_count', and the result is finite once they are dropped.
background_flag = df['tech_background']
message_length = df['word_count']
correlation, p_value = stats.pointbiserialr(background_flag, message_length)

print(f"Point-Biserial Correlation: {correlation:.4f}")
print(f"P-value: {p_value:.4f}")

Point-Biserial Correlation: nan
P-value: nan


Test if variance might be 0 so correlation cannot be calculated

In [33]:
# A standard deviation of 0 in either column would leave the correlation
# undefined, so print both. The label is built from the real column name so the
# printed text cannot drift from the column that is actually measured.
for column in ('tech_background', 'word_count'):
    print(f"Standard deviation of '{column}': {df[column].std()}")

Standard deviation of 'technical_background': 0.498668555488144
Standard deviation of 'word_count': 8.195553198526575


Check for missing values

In [43]:
# Count missing values in both correlation inputs. The label is built from the
# real column name so the printed text matches the column that is inspected.
for column in ('tech_background', 'word_count'):
    print(f"Missing values in '{column}': {df[column].isnull().sum()}")

Missing values in 'technical_background': 0
Missing values in 'word_count': 6


In [44]:
import pandas as pd
from scipy import stats
import numpy as np  # not used in this cell; kept in case other cells rely on `np`

# Build the cleaned frame without writing back into the shared `df`, so this
# cell can be re-run safely and other cells keep seeing the loaded data.
# The dtype coercions are defensive: the dtype check earlier in the notebook
# already printed int64 and float64 for these two columns.
df_cleaned = df.assign(
    tech_background=df['tech_background'].astype(int),
    word_count=pd.to_numeric(df['word_count'], errors='coerce'),
).dropna(subset=['word_count'])  # rows without a word count cannot be correlated

# Point-biserial correlation between the tech_background flag and word count,
# computed on the rows that have a word count.
correlation, p_value = stats.pointbiserialr(df_cleaned['tech_background'], df_cleaned['word_count'])

print(f"Point-Biserial Correlation: {correlation:.4f}")
# Significant-digit formatting: fixed-point output would show a very small
# p-value as 0.0000, which hides its magnitude.
print(f"P-value: {p_value:.4g}")

Point-Biserial Correlation: -0.0852
P-value: 0.0000


In [46]:
import pandas as pd
from scipy import stats
import numpy as np  # not used in this cell; kept in case other cells rely on `np`

# Build the cleaned frame without writing back into the shared `df`, so this
# cell can be re-run safely and other cells keep seeing the loaded data.
# The dtype coercions are defensive: the dtype check earlier in the notebook
# already printed int64 and float64 for these two columns.
df_cleaned = df.assign(
    tech_background=df['tech_background'].astype(int),
    word_count=pd.to_numeric(df['word_count'], errors='coerce'),
).dropna(subset=['word_count'])  # rows without a word count cannot be correlated

# Keep only the rows where the image, empty-message and removed-message flags
# are all 0.
content_flags = ['is_image', 'is_empty_message', 'is_removed_message']
df_filtered = df_cleaned[df_cleaned[content_flags].eq(0).all(axis=1)]

# Point-biserial correlation between the tech_background flag and word count,
# computed on the filtered frame (not on df_cleaned).
correlation, p_value = stats.pointbiserialr(df_filtered['tech_background'], df_filtered['word_count'])

print(f"Point-Biserial Correlation: {correlation:.4f}")
# Significant-digit formatting: fixed-point output would show a very small
# p-value as 0.0000, which hides its magnitude.
print(f"P-value: {p_value:.4g}")

Point-Biserial Correlation: -0.0827
P-value: 0.0000


Correlation between word count and react_time

In [51]:
# Print the dtype of each of the two columns used in the next correlation.
for column in ('react_time_min', 'word_count'):
    print(df[column].dtype)

float64
float64


In [55]:
# Count NaN values in all columns
nan_counts = df.isnull().sum()
print("NaN counts per column:")
print(nan_counts)

NaN counts per column:
timestamp             0
message               6
living_in_city        0
tech_background       0
author                0
has_emoji             0
word_count            6
react_time_sec        1
react_time_min        1
react_time_hr         1
is_image              0
is_empty_message      0
is_removed_message    0
sentiment_polarity    0
sentiment_category    0
dtype: int64


In [None]:
from scipy.stats import pearsonr

# Keep only the rows where BOTH 'word_count' and 'react_time_min' are present.
paired_columns = ['word_count', 'react_time_min']
df_cleaned = df.dropna(subset=paired_columns)

# Pearson correlation coefficient and p-value for the two columns.
pearson_result = pearsonr(df_cleaned['word_count'], df_cleaned['react_time_min'])
correlation, p_value = pearson_result

print(f"The Pearson correlation coefficient is: {correlation:.2f}")
print(f"The p-value is: {p_value:.4f}")

The Pearson correlation coefficient is: nan
The p-value is: nan
