In [4]:
# Cleans up a csv file of discord chat logs
# Removes all messages that contain a link or non ascii characters and punctuations in a message

import os
import csv
import re
import sys
import random
import urllib.request

import json

MESSAGE_FILES = 3

messages_json = []

for i in range(1, MESSAGE_FILES + 1):
    with open(f"Training Data/crackhome_2887990084562599/message_{i}.json") as f:
        messages_json.append(json.load(f))

print(messages_json[0]["participants"])

messages = []
authors = []

for message_json in messages_json:
    for msg in message_json["messages"]:
        if "content" in msg:
            messages.append(msg["content"])
            authors.append(msg["sender_name"])

print(len(messages))
print(len(authors))

[{'name': 'Instagram User'}, {'name': 'jeremy'}, {'name': 'bn'}, {'name': 'Simon'}, {'name': 'matt_m_h'}, {'name': 'Parvish Ravindran'}, {'name': 'Rue'}, {'name': 'Dylan .T.'}, {'name': 's.onnyngo'}, {'name': 'Jess *â\x9c§ï½¥ï¾\x9f'}, {'name': 'Shrimpy Raccoon'}, {'name': 'Dimitri .T.'}, {'name': 'heathh.xtraa'}]
25910
25910


In [5]:
CLEANED_FILEPATH = "ch_cleaned.csv"

# Load the stop words
#STOP_WORDS = urllib.request.urlopen("https://raw.githubusercontent.com/igorbrigadir/stopwords/master/en/atire_puurula.txt").read().decode("utf-8").split("\n")
STOP_WORDS = urllib.request.urlopen("https://github.com/igorbrigadir/stopwords/blob/master/en/postgresql.txt").read().decode("utf-8").split("\n")

def clean_message(message):
    message = message.replace(".", "")
    message = message.replace(",", "")
    message = message.replace(";", "")
    message = message.replace("!", "")
    message = message.replace("?", "")
    message = message.replace("(", "")
    message = message.replace(")", "")
    message = message.replace("\\", "")
    message = message.replace("\"", "")

    # remove discord effects
    message = message.replace("*", "")
    message = message.replace("_", "")
    message = message.replace("~", "")
    message = message.replace("`", "")
    message = message.replace(">", "")
    message = message.replace("<", "")
    message = message.replace("||", "")
    message = message.replace("```", "")
    message = message.replace("~~", "")
    message = message.replace(":", "")
    message = message.replace("#", "")
    message = message.replace("@", "")


    # remove stopwords
    message = message.lower()
    message = ' '.join([word for word in message.split() if word not in STOP_WORDS])

    return message

participating_users = {}
same_user = {}

try:
    # We are using a users.json file to map the user names to the actual names
    # and to keep the participating users anonymous
    with open("users.json") as f:
        users = json.load(f)
        participating_users = users["participants"]
        same_user = users["participants_aliases"]
except FileNotFoundError:
    print("users.json not found")
    sys.exit(1)

not_message = 0
empty_message = 0
reaction = 0
unknown_user = 0
link = 0
image = 0
not_message_after_stop = 0
code = 0
skipped = 0
non_ascii = 0

messsage_author_pairs = []

for i in range(len(messages)):
    message = messages[i]
    author = authors[i]

    if not message:
        empty_message += 1
        skipped += 1
        continue

    if not all(ord(char) < 128 for char in message):
        non_ascii += 1
        skipped += 1
        continue

    if message.strip() == "":
        empty_message += 1
        skipped += 1
        continue

    if message == "This message was deleted.":
        empty_message += 1
        skipped += 1
        continue

    if "sent an attachment." in message:
        image += 1
        skipped += 1
        continue

    if "liked a message" in message:
        reaction += 1
        skipped += 1
        continue

    if "added a reaction." in message:
        reaction += 1
        skipped += 1
        continue

    if "to your message" in message:
        reaction += 1
        skipped += 1
        continue

    # Check if the message contains a link
    if "http" in message:
        link += 1
        skipped += 1
        continue

    if author in same_user:
        author = same_user[author]

    if author not in participating_users:
        unknown_user += 1
        skipped += 1
        continue

    message = clean_message(message)

    # Check again if the message is empty after removing the stop words and punctuations
    if not message:
        not_message_after_stop += 1
        skipped += 1
        continue

    messsage_author_pairs.append([message, participating_users[author]])


print(messsage_author_pairs[0])
print(f"Skipped {skipped} messages")
print(f"Empty messages: {empty_message}")
print(f"Non ascii messages: {non_ascii}")
print(f"Reactions: {reaction}")
print(f"Unknown user: {unknown_user}")
print(f"Messages with links: {link}")
print(f"Images: {image}")
print(f"Messages that are empty after removing stop words: {not_message_after_stop}")
print(f"Total messages: {len(messsage_author_pairs)}")
print("\n")
print(f"Shuffling messages...")
random.shuffle(messsage_author_pairs)

# sort the messages by author
print(f"Sorting messages by author...")
messsage_author_pairs.sort(key=lambda x: x[1])

print(f"Writing to file...")
with open(CLEANED_FILEPATH, "w") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["message", "author"])
    for msg_and_auth in messsage_author_pairs:
        writer.writerow(msg_and_auth)

print("Done!")



['the end of half life alyx is essentially half life 3 confirmed', 0]
Skipped 11110 messages
Empty messages: 0
Non ascii messages: 1694
Reactions: 529
Unknown user: 8796
Messages with links: 0
Images: 64
Messages that are empty after removing stop words: 27
Total messages: 14800


Shuffling messages...
Sorting messages by author...
Writing to file...
Done!
