In [220]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from datetime import date
import datetime
from datetime import timedelta
import numpy as np
import re
from emoji import UNICODE_EMOJI
# Silence pandas' SettingWithCopyWarning for the whole notebook: several cells
# below add helper columns to filtered slices of `data`.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Load the message export, keeping only rows that have text content and were
# not authored by 'Groovy'.
data = pd.read_csv("Combined.csv", index_col=0)
data = data[data['content'].notna()]
data = data[data['author'] != 'Groovy']
users = np.unique(data['author'])
data['time'] = pd.to_datetime(data['time'])
data = data.sort_values('time')
data = data.reset_index(drop=True)
data = data.drop('attachment', axis=1)
# Collapse the CR CR LF sequences in the export into single spaces.
data = data.replace(regex={r'\r\r\n': ' '})

# A message made up solely of these plain-text characters cannot hold an emote
# or emoji, so it is skipped by the scans below.  The pattern has to cover the
# WHOLE message (fullmatch): a prefix-only match would also discard messages
# such as "lol <:pog:1>" that merely start with a plain character.
PLAIN_TEXT_PATTERN = r'[A-Za-z\s.,!?"\'#*:()0-9|^]+'
moddata = data[~data['content'].str.fullmatch(PLAIN_TEXT_PATTERN)]

# NOTE: `contains_emoji` is defined in a later cell of this notebook; that cell
# has to be run before this one.
# The mask is computed once and reused for both splits.
has_emoji = moddata['content'].apply(contains_emoji).astype(bool)
emojidata = moddata[has_emoji]
notemoji = moddata[~has_emoji]

In [None]:
def count_emote(emote_dict, msg):
    """Add the custom emotes found in `msg` to the running tally `emote_dict`.

    A whitespace-separated token counts as an emote when it is wrapped in
    '<' ... '>', contains no '@' and has exactly two ':' characters.  The key
    stored is the token text between the leading '<' and the last ':'
    (inclusive), e.g. '<:pog:123>' -> ':pog:'.  Each distinct emote is counted
    at most once per message.  `emote_dict` is updated in place; nothing is
    returned.
    """
    seen = set()
    for word in msg.split():
        wrapped = word.startswith('<') and word.endswith('>')
        if not wrapped or '@' in word or word.count(':') != 2:
            continue
        name = word[1:word.rfind(':') + 1]
        if name in seen:
            # Already tallied for this message.
            continue
        seen.add(name)
        emote_dict[name] = emote_dict.get(name, 0) + 1

# Tally emote usage over the messages in `notemoji`, then export the totals
# ordered from most to least used.
emote_dict = {}
for msg in notemoji['content']:
    count_emote(emote_dict, msg)

# Stable sort on the negated count: descending by usage, ties keep the order in
# which the emotes were first seen.
emote_list = list(emote_dict.items())
emote_list.sort(key=lambda pair: -pair[1])

emote_df = pd.DataFrame(data=emote_list, columns=['Emote', 'Count'])
emote_df.to_csv('Emote Usage.csv')

In [None]:
def count_emoji(emoji_dict, msg):
    """Add the emoji characters found in `msg` to the running tally `emoji_dict`.

    A character counts as an emoji when it is a key of UNICODE_EMOJI['en'].
    Each distinct character is counted at most once per message.  `emoji_dict`
    is updated in place; nothing is returned.
    """
    seen = set()
    for symbol in msg:
        if symbol in seen or symbol not in UNICODE_EMOJI['en']:
            continue
        seen.add(symbol)
        emoji_dict[symbol] = emoji_dict.get(symbol, 0) + 1

# Tally emoji usage over the messages in `emojidata`, then export the totals
# ordered from most to least used.
emoji_dict = {}
for msg in emojidata['content']:
    count_emoji(emoji_dict, msg)

# Stable sort on the negated count: descending by usage, ties keep the order in
# which the emoji were first seen.
emoji_list = list(emoji_dict.items())
emoji_list.sort(key=lambda pair: -pair[1])

emoji_df = pd.DataFrame(data=emoji_list, columns=['Emoji', 'Count'])
emoji_df.to_csv('Emoji Usage.csv')

In [44]:
# Reload the export for the per-user statistics.  Unlike the emote/emoji cells,
# this version of `data` has emotes, links and mentions collapsed into
# one-character placeholders.
data = pd.read_csv("Combined.csv", index_col=0)
data = data[data['content'].notna()]
data = data[data['author'] != 'Groovy']
users = np.unique(data['author'])

# Placeholders: '§' = emote token, '¤' = link, '@' = mention.
# The '<...>' patterns are bounded with [^<>\n] so one match can never run from
# the first '<' to the last '>' of a line; an unbounded '.+' would turn
# "<:a:1> some text <:b:2>" into a single '§' and lose the text in between.
data = data.replace(regex={
    r'<[^<>\n]*:[^<>\n]+:[^<>\n]+>': '§',
    r'http[^\s]+': '¤',
    r'<@[^<>\n]+>': '@',
})
# Drop messages that consisted of nothing but a link.
data = data[data['content'] != '¤']
# Collapse the CR CR LF sequences in the export into single spaces.
data = data.replace(regex={r'\r\r\n': ' '})
data['time'] = pd.to_datetime(data['time'])
# Later cells rely on `data` being in chronological order.
data = data.sort_values('time')

In [285]:
# Per-user results table: one row per author, one initially empty column per
# statistic that the following cells fill in.
output_df = pd.DataFrame(
    index=users,
    columns=[
        'Total msgs',
        'Spoilers',
        'Mean msg length',
        'Median msg length',
        'Msg length standard dev',
        'Short msgs',
        'Long msgs',
        'Msgs with emotes',
        'Emote-only msgs',
        'Longest msg chain',
        'Longest chain day',
        'Longest msg chain (not #voice-chat)',
        'Longest chain day (not #voice-chat)',
    ],
)
# Also most active day in general?

In [286]:
# Total msgs: how many rows of `data` each user authored
for user in users:
    output_df.loc[user, 'Total msgs'] = int((data['author'] == user).sum())

In [287]:
# Most spoiler tags: number of messages per user that contain '||'.
# The marker is matched as a literal substring (regex=False), so it needs no
# regex escaping.
spoilerdata = data[data['content'].str.contains('||', regex=False)]
for user in users:
    user_spoilerdata = spoilerdata[spoilerdata['author'] == user]
    output_df.loc[user, 'Spoilers'] = len(user_spoilerdata)

In [288]:
# Mean, median, standard dev of message length (only messages containing non-link letters are counted)
for user in users:
    user_data = data[data['author'] == user]
    # Character counts of this user's messages that contain at least one
    # ASCII letter.
    has_letters = user_data['content'].str.contains('[A-Za-z]')
    lengths = user_data.loc[has_letters, 'content'].str.len()
    output_df.loc[user, 'Mean msg length'] = np.mean(lengths)
    output_df.loc[user, 'Median msg length'] = np.median(lengths)
    output_df.loc[user, 'Msg length standard dev'] = np.std(lengths)

In [289]:
# Short (including emote-only) and long messages (longer than average mean msg length + average msg length std)
# A message is "short" when it has fewer than this many runs of whitespace.
SHORT_MESSAGE_MAX_WHITESPACE_RUNS = 6
# A message is "long" when its character count exceeds the across-user average
# of the per-user mean length plus the across-user average of the per-user std.
LONG_MESSAGE_THRESHOLD = np.mean(output_df['Msg length standard dev']) + np.mean(output_df['Mean msg length'])
for user in users:
    user_data = data[data['author'] == user]
    msg_len = user_data['content'].str.len()
    # Vectorized count of whitespace runs per message.
    spaces = user_data['content'].str.count(r'\s+')
    output_df.loc[user, 'Short msgs'] = int((spaces < SHORT_MESSAGE_MAX_WHITESPACE_RUNS).sum())
    output_df.loc[user, 'Long msgs'] = int((msg_len > LONG_MESSAGE_THRESHOLD).sum())

In [290]:
def contains_emoji(msg):
    """Return True if at least one character of `msg` is a key of UNICODE_EMOJI['en']."""
    return any(symbol in UNICODE_EMOJI['en'] for symbol in msg)

def only_emojis(msg):
    """Return True if `msg` holds at least one emote/emoji and nothing else of substance.

    Whitespace is ignored.  '§' (the emote placeholder) and any key of
    UNICODE_EMOJI['en'] count as emoji; '?', '.', ',' and '!' are tolerated.
    Any other character makes the result False, as does a message with no
    emoji at all.
    """
    tolerated = ('?', '.', ',', '!')
    seen_emoji = False
    for symbol in "".join(msg.split()):
        if symbol == '§' or symbol in UNICODE_EMOJI['en']:
            seen_emoji = True
            continue
        if symbol not in tolerated:
            return False
    return seen_emoji

In [291]:
# Messages with emotes/emojis and messages with only emotes/emojis and punctuation
for user in users:
    user_data = data[data['author'] == user]
    contents = user_data['content']
    # A message "has an emoji" if it holds a Unicode emoji or the '§' placeholder.
    with_emoji = contents.apply(contains_emoji) | contents.apply(lambda text: '§' in text)
    emoji_only = contents.apply(only_emojis)
    output_df.loc[user, 'Msgs with emotes'] = int(with_emoji.sum())
    output_df.loc[user, 'Emote-only msgs'] = int(emoji_only.sum())

In [294]:
# The most consecutive messages a user sent with less than a 20 minute pause
# between neighbouring messages, outside of #voice-chat.
# This cell only fills the two '(not #voice-chat)' columns of `output_df`.
MAX_CHAIN_GAP = timedelta(minutes=20)


def longest_chain(times, max_gap=MAX_CHAIN_GAP):
    """Find the longest run of messages whose neighbouring gaps are all < `max_gap`.

    Parameters
    ----------
    times : pd.Series of timestamps, already in chronological order.
    max_gap : timedelta; a gap of this size or more ends the current chain.

    Returns
    -------
    (length, start) where `length` is the number of messages in the longest
    chain and `start` is the timestamp of its first message, or None when
    `times` is empty.  If several chains share the maximum length, the
    earliest one is returned.
    """
    if times.empty:
        return None
    # A message opens a new chain when it is the first one (gap is NaT, so the
    # comparison is False) or when the pause before it is not below `max_gap`.
    opens_chain = ~(times.diff() < max_gap)
    # Running count of chain openings = chain label for every message.
    # Grouping by a plain array keeps this positional, independent of the index.
    chains = times.groupby(opens_chain.cumsum().to_numpy())
    sizes = chains.size()
    best = sizes.idxmax()
    return int(sizes.loc[best]), chains.first().loc[best]


for user in users:
    user_data = data[(data['author'] == user) & (data['channel'] != '#voice-chat')]
    chain = longest_chain(user_data['time'])
    if chain is None:
        # No messages outside #voice-chat: leave this user's cells empty.
        continue
    chain_len, chain_start = chain
    output_df.loc[user, 'Longest msg chain (not #voice-chat)'] = chain_len
    output_df.loc[user, 'Longest chain day (not #voice-chat)'] = chain_start.date()

In [303]:
# Day with the most messages
# Work on a copy whose 'time' column holds calendar dates instead of timestamps.
busy_data = data.assign(time=data['time'].dt.date)
busiest_date = busy_data['time'].mode()[0]
int((busy_data['time'] == busiest_date).sum())


2020-08-15


6043