In [1]:
import requests
import json
import pandas as pd
import re
import configur
import time

In [131]:
class TextCleaner:
    def __init__(self):
        self.emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F700-\U0001F77F"  # alchemical symbols
            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
            "\U0001FA00-\U0001FA6F"  # Chess Symbols
            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
            "\U00002702-\U000027B0"  # Dingbats
            "\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE
        )

    def remove_emojis(self, message):
        return self.emoji_pattern.sub(r'', message)

    def clean_message(self, message):
        # Centralized message cleaning
        for substring in ['!', '.', ':', ',', '\r', '\n', '*', '|','_','$',
                          '@everyone','NEW SIGNAL',"Bishop's Ideas"]:
            message = message.replace(substring, '')
        for pattern in [r"<@&\d+>",r'https?//\S+']:
            message = re.sub(pattern, "", message)
        final = self.remove_emojis(message).strip()
        if re.match(r'\s\s+', final):
            return ''
        return final


In [132]:
# Uses Requests package to get the last 50 messages of a Discord channel
def retrieve_messages(channel_id, last_message_id=None, message_set = None,cleaner = None):
    auth_str = str(configur.thepass['disc_api'])
    headers = {'authorization': auth_str}
    url = f'https://discord.com/api/v9/channels/{channel_id}/messages'
    params = {'limit': 50}
    if last_message_id:
        params['before'] = last_message_id  # Fetch messages before this ID

    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        messages = response.json()
        if cleaner is None:
            cleaner = TextCleaner() # Custom class to get rid of emojis, common patterns, etc.
        if message_set is None:
            message_set = set()
        for message in messages:
            try:
                content = cleaner.clean_message(message.get('content', ''))
                if len(content.split()) <= 1: # Checks if content is small
                    embeds = message.get('embeds', [])
                    if embeds and 'title' in embeds[0]:
                        title_content = cleaner.clean_message(embeds[0]['title'])
                        if len(title_content.split()) <= 1: # No meaningful content in this message
                            continue
                        content = ' '.join(title_content.split()[:20]) if (len(title_content.split()) > 20) else title_content
                elif len(content.split()) > 20:
                    content = ' '.join(content.split()[:20])
            except Exception as e:
                print(f"Error: {str(e)}")
                continue
            
            if len(content.replace(' ','')) > 0: 
                username = message['author'].get('username', 'Unknown')
                timestamp = message.get('timestamp', '')
                message_set.add((username,content,timestamp))

        # If any received messages, recursively calls to see if there are more
        if messages:
            last_id = messages[-1].get('id')  # Gets the ID of the last message in this batch
            if last_id:
                retrieve_messages(channel_id, last_message_id=last_id, message_set=message_set)
    return message_set

In [133]:
def main():
    all_messages_set = set()
    cleaner = TextCleaner()
    # These are all from the discord rooms that we want messages from
    message_room_ID_list = ['1226712190548180992','1135983105665159178','1130040044388884610','1128313244797382696',
                        '1154858987460775946','1090776258071240745','1219035439625928744','1217864641527550073',
                        '1133173656109993984','682259216861626378','1120826825313112114']
    
    for channel_id in message_room_ID_list:
        all_messages_set = retrieve_messages(channel_id, message_set=all_messages_set,cleaner=cleaner)
    
    # After all discord rooms have been processed, saves to df and CSV
    if all_messages_set:
        msg_df = pd.DataFrame(list(all_messages_set), columns=['Username', 'Message', 'Timestamp'])
        display(msg_df)
        msg_df.to_csv(r"C:\Users\amoog\Desktop\Project_X\Project_X\disc_msg_df.csv")
main()

Unnamed: 0,Username,Message,Timestamp
0,Waxui Alerts 🍭,AMD Day Trade idea Bit of an early Lotto idea ...,2024-01-24T22:46:10.450000+00:00
1,Waxui Alerts 🍭,Closed SNOW hereSolid catch,2023-07-14T14:05:13.101000+00:00
2,Waxui Alerts 🍭,Chip Chop Sitting back,2023-12-15T17:20:50.715000+00:00
3,Market Bishop,I'm entering,2024-03-22T16:23:53.904000+00:00
4,waysotheraccount,watch jd test that high of day,2023-07-13T15:51:16.112000+00:00
...,...,...,...
5884,waxui,BULLPRINTER Planned Plays are posted above^ Pe...,2023-12-04T00:51:05.771000+00:00
5885,waysotheraccount,taking some some tlry 2 c lottos at 17/18 for ...,2023-07-27T14:54:50.002000+00:00
5886,MEE6,Good work <@395362132432257025> you just advan...,2023-08-24T23:38:40.328000+00:00
5887,Waxui Alerts 🍭,COST Swing idea Of course still keeping en eye...,2023-06-11T20:35:23.205000+00:00
