In [327]:
from pymongo import MongoClient
import pprint

import os, sys
# Add parent directory to path to import modules from src
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

from src.loader import SlackDataLoader
import src.utils as utils
import json
import pandas as pd

data_loader = SlackDataLoader("../data")


In [328]:
class DBWithSchema:
    def __init__(self) -> None:
        self.client = MongoClient("mongodb://localhost:27017/")
        self.db = self.client["SlackDb"]

        self.user_validator = {
            "$jsonSchema": {
                "bsonType": "object",
                "required": ["user_id", "real_name"],
                "properties": {
                    "user_id": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    },
                    "real_name": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    }
                }
            }
        }

        self.channel_validator = {
            "$jsonSchema": {
                "bsonType": "object",
                "required": ["channel_id", "name"],
                "properties": {
                    "channel_id": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    },
                    "name": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    }
                }
            }
        }

        self.message_validator = {
            "$jsonSchema": {
                "bsonType": "object",
                "required": ["ts", "msg_id", "text", "user_id", "channel_id", "mentions", "reactions", "replies"],
                "properties": {
                    "ts": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    },

                    "msg_id": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    },
                    "text": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    },
                    "user_id": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    },
                    "channel_id": {
                        "bsonType": "string",
                        "description": "must be a string and is required"
                    },
                    "mentions": {
                        'bsonType': ['array', 'null'],

                        "items": {
                            "bsonType": "string"
                        },
                        "description": "must be an array of strings"
                    },
                    "reactions": {
                        'bsonType': ['array', 'null'],

                        "items": {
                            "bsonType": "object",
                            "required": ["name", "users", "count"],
                            "properties": {
                                "name": {
                                    "bsonType": "string",
                                    "description": "must be a string and is required"
                                },
                                "users": {
                                    "bsonType": "array",
                                    "items": {
                                        "bsonType": "string"
                                    },
                                    "description": "must be an array of strings"
                                },
                                "count": {
                                    "bsonType": "number",
                                    "description": "must be a number and is required"
                                }
                            }
                        }
                    },
                    "replies": {
                        'bsonType': ['array', 'null'],

                        "items": {
                            "bsonType": "object",
                            "required": ["user", "ts"],
                            "properties": {
                                "user": {
                                    "bsonType": "string",
                                    "description": "must be a string and is required"
                                },
                                "ts": {
                                    "bsonType": "string",
                                    "description": "must be a string and is required"
                                }
                            }
                        }
                    }
                }
            }
        }


        try:
            self.db.create_collection("users")
            self.db.create_collection("channels")
            self.db.create_collection("messages")
        except Exception as e:
            print("error: ", e)

        self.db.command("collMod", "users", validator=self.user_validator)
        self.db.command("collMod", "channels", validator=self.channel_validator)
        self.db.command("collMod", "messages", validator=self.message_validator)



    def list_collections(self):
        return self.db.list_collection_names()
    
    def get_validation(self, collection_name: str) -> dict:
        self.check_if_collection_exist(collection_name)
        return self.db.get_collection(collection_name).options()
    
    def check_if_collection_exist(self, collection_name: str):
        if not self.list_collections().__contains__(collection_name):
            raise Exception(f"Collection, {collection_name} not found.")

    def insert_to_collection(self, collection_name, data):
        self.check_if_collection_exist(collection_name)
        collection = self.db[collection_name]
        return collection.insert_one(data)

    def insert_many_to_collection(self, collection_name, data):
        self.check_if_collection_exist(collection_name)
        result = self.db[collection_name].insert_many(data)
        return result.inserted_ids



    def find_all(self, collection_name):
        self.check_if_collection_exist(collection_name)
        return self.db[collection_name].find()

    def find(self, collection_name, key, value):
        self.check_if_collection_exist(collection_name)
        return self.db[collection_name].find({key: value})
    
    def find_by_id(self, collection_name, _id):
        self.check_if_collection_exist(collection_name)
        return self.db[collection_name].find

    def find_one(self, collection_name, key, value):
        self.check_if_collection_exist(collection_name)
        return self.db[collection_name].find_one({key: value})
    


In [329]:
db = DBWithSchema()

error:  collection users already exists


## Insert all users into the mongo database

In [330]:
all_users = []
for user in data_loader.users:
    cur_user = {"user_id": user["id"], "real_name": user["real_name"]}
    all_users.append(cur_user)
    
print("total users: ", len(all_users))
print(all_users[:10])

total users:  71
[{'user_id': 'U03T89ACUUW', 'real_name': 'Carlos Gross'}, {'user_id': 'U03TEPYRM2P', 'real_name': 'Garrett Bell'}, {'user_id': 'U03TNP8Q8CT', 'real_name': 'Bethany George'}, {'user_id': 'U03TT5KEYCF', 'real_name': 'Luis Ingram'}, {'user_id': 'U03TX2VN6H5', 'real_name': 'Nicole Kim'}, {'user_id': 'U03U1FNPEUX', 'real_name': 'Joshua Rhodes'}, {'user_id': 'U03U1FQKEMV', 'real_name': 'Steven Garcia'}, {'user_id': 'U03U1GHT39V', 'real_name': 'Joseph Diaz'}, {'user_id': 'U03U1HAG9TR', 'real_name': 'Robert Carter'}, {'user_id': 'U03U1J51VFZ', 'real_name': 'Cheryl Hudson'}]


In [331]:

inserted_records = db.insert_many_to_collection('users', all_users)
print(inserted_records)

[ObjectId('656780b6cd32dd8246eb8e1c'), ObjectId('656780b6cd32dd8246eb8e1d'), ObjectId('656780b6cd32dd8246eb8e1e'), ObjectId('656780b6cd32dd8246eb8e1f'), ObjectId('656780b6cd32dd8246eb8e20'), ObjectId('656780b6cd32dd8246eb8e21'), ObjectId('656780b6cd32dd8246eb8e22'), ObjectId('656780b6cd32dd8246eb8e23'), ObjectId('656780b6cd32dd8246eb8e24'), ObjectId('656780b6cd32dd8246eb8e25'), ObjectId('656780b6cd32dd8246eb8e26'), ObjectId('656780b6cd32dd8246eb8e27'), ObjectId('656780b6cd32dd8246eb8e28'), ObjectId('656780b6cd32dd8246eb8e29'), ObjectId('656780b6cd32dd8246eb8e2a'), ObjectId('656780b6cd32dd8246eb8e2b'), ObjectId('656780b6cd32dd8246eb8e2c'), ObjectId('656780b6cd32dd8246eb8e2d'), ObjectId('656780b6cd32dd8246eb8e2e'), ObjectId('656780b6cd32dd8246eb8e2f'), ObjectId('656780b6cd32dd8246eb8e30'), ObjectId('656780b6cd32dd8246eb8e31'), ObjectId('656780b6cd32dd8246eb8e32'), ObjectId('656780b6cd32dd8246eb8e33'), ObjectId('656780b6cd32dd8246eb8e34'), ObjectId('656780b6cd32dd8246eb8e35'), ObjectId('6

In [332]:

records = db.find_all('users')
for record in records:
    pprint.pprint(record)

{'_id': ObjectId('656765d0309ab36f1dbec82b'),
 'real_name': 'Carlos Gross',
 'user_id': 'U03T89ACUUW'}
{'_id': ObjectId('656765d0309ab36f1dbec82c'),
 'real_name': 'Garrett Bell',
 'user_id': 'U03TEPYRM2P'}
{'_id': ObjectId('656765d0309ab36f1dbec82d'),
 'real_name': 'Bethany George',
 'user_id': 'U03TNP8Q8CT'}
{'_id': ObjectId('656765d0309ab36f1dbec82e'),
 'real_name': 'Luis Ingram',
 'user_id': 'U03TT5KEYCF'}
{'_id': ObjectId('656765d0309ab36f1dbec82f'),
 'real_name': 'Nicole Kim',
 'user_id': 'U03TX2VN6H5'}
{'_id': ObjectId('656765d0309ab36f1dbec830'),
 'real_name': 'Joshua Rhodes',
 'user_id': 'U03U1FNPEUX'}
{'_id': ObjectId('656765d0309ab36f1dbec831'),
 'real_name': 'Steven Garcia',
 'user_id': 'U03U1FQKEMV'}
{'_id': ObjectId('656765d0309ab36f1dbec832'),
 'real_name': 'Joseph Diaz',
 'user_id': 'U03U1GHT39V'}
{'_id': ObjectId('656765d0309ab36f1dbec833'),
 'real_name': 'Robert Carter',
 'user_id': 'U03U1HAG9TR'}
{'_id': ObjectId('656765d0309ab36f1dbec834'),
 'real_name': 'Cheryl Huds

In [333]:
def get_messages_dict(msgs):
    msg_list = {
            "msg_id":[],
            "text":[],
            "user":[],
            "mentions":[],
            "reactions":[],
            "replies":[],
            "replies_to":[],
            "ts":[],
            }


    for msg in msgs:
        if "subtype" not in msg:
            try:
                msg_list["msg_id"].append(msg["client_msg_id"])
            except:
                msg_list["msg_id"].append(None)
            msg_list["text"].append(msg["text"])

            msg_list["user"].append(msg["user"])
            msg_list["ts"].append(msg["ts"])

            
            if "reactions" in msg:
                msg_list["reactions"].append(msg["reactions"])
            else:

                msg_list["reactions"].append(None)

            if "parent_user_id" in msg:
                msg_list["replies_to"].append(msg["ts"])
            else:
                msg_list["replies_to"].append(None)

            if "thread_ts" in msg and "reply_users" in msg:
                msg_list["replies"].append(msg["replies"])
            else:
                msg_list["replies"].append(None)
            
            if "blocks" in msg:
                mention_list = []
                
                for blk in msg["blocks"]:
                    if "elements" in blk:
                        for elm in blk["elements"]:
                            if "elements" in elm:
                                for elm_ in elm["elements"]:
                                    
                                    if "type" in elm_:
                                      
                                        if elm_["type"] == "user":
                                            mention_list.append(elm_["user_id"])
                                       


                msg_list["mentions"].append(mention_list)
            else:
                msg_list["mentions"].append(None)
    
    return msg_list


In [334]:
def get_messages_from_channel(channel_path):
    '''
    get all the messages from a channel        
    
    '''
    json_files = [
        f"{channel_path}/{pos_json}" 
        for pos_json in os.listdir(channel_path) 
        if pos_json.endswith('.json')
    ]    
    combined = []

    for json_file in json_files:
        with open(json_file, 'r', encoding="utf8") as slack_data:
            json_content = json.load(slack_data)
            combined.extend(json_content)
        
    msg_list = get_messages_dict(combined)
    df = pd.DataFrame(msg_list)
    
    return df
    

## Insert all channels into the mongo database

In [335]:
all_channels = []
for channel in data_loader.channels:
    cur_channel = {"channel_id": channel["id"], "name": channel["name"]}
    all_channels.append(cur_channel)
    
print("total channels: ", len(all_channels))
print(all_channels[:10])

total channels:  39
[{'channel_id': 'C03T0APHX63', 'name': 'all-community-building'}, {'channel_id': 'C03T0AX4K6K', 'name': 'all-technical-support'}, {'channel_id': 'C03T89KDGA2', 'name': 'all-career-exercises'}, {'channel_id': 'C03T89PMJKG', 'name': 'all-resources'}, {'channel_id': 'C03TBUCU4UD', 'name': 'random'}, {'channel_id': 'C03TEQM38HH', 'name': 'all-ideas'}, {'channel_id': 'C03TEQQS9NF', 'name': 'all-week1'}, {'channel_id': 'C03U4J8J4LQ', 'name': 'all-broadcast'}, {'channel_id': 'C03UG4LHM8A', 'name': 'tenx-bot'}, {'channel_id': 'C03V3LK61QX', 'name': 'team-10'}]


In [336]:

inserted_channels = db.insert_many_to_collection('channels', all_channels)
print(inserted_channels)

[ObjectId('656780b8cd32dd8246eb8e63'), ObjectId('656780b8cd32dd8246eb8e64'), ObjectId('656780b8cd32dd8246eb8e65'), ObjectId('656780b8cd32dd8246eb8e66'), ObjectId('656780b8cd32dd8246eb8e67'), ObjectId('656780b8cd32dd8246eb8e68'), ObjectId('656780b8cd32dd8246eb8e69'), ObjectId('656780b8cd32dd8246eb8e6a'), ObjectId('656780b8cd32dd8246eb8e6b'), ObjectId('656780b8cd32dd8246eb8e6c'), ObjectId('656780b8cd32dd8246eb8e6d'), ObjectId('656780b8cd32dd8246eb8e6e'), ObjectId('656780b8cd32dd8246eb8e6f'), ObjectId('656780b8cd32dd8246eb8e70'), ObjectId('656780b8cd32dd8246eb8e71'), ObjectId('656780b8cd32dd8246eb8e72'), ObjectId('656780b8cd32dd8246eb8e73'), ObjectId('656780b8cd32dd8246eb8e74'), ObjectId('656780b8cd32dd8246eb8e75'), ObjectId('656780b8cd32dd8246eb8e76'), ObjectId('656780b8cd32dd8246eb8e77'), ObjectId('656780b8cd32dd8246eb8e78'), ObjectId('656780b8cd32dd8246eb8e79'), ObjectId('656780b8cd32dd8246eb8e7a'), ObjectId('656780b8cd32dd8246eb8e7b'), ObjectId('656780b8cd32dd8246eb8e7c'), ObjectId('6

In [337]:

records = db.find_all('channels')
for record in records:
    pprint.pprint(record)

{'_id': ObjectId('65676695309ab36f1dbec872'),
 'channel_id': 'C03T0APHX63',
 'name': 'all-community-building'}
{'_id': ObjectId('65676695309ab36f1dbec873'),
 'channel_id': 'C03T0AX4K6K',
 'name': 'all-technical-support'}
{'_id': ObjectId('65676695309ab36f1dbec874'),
 'channel_id': 'C03T89KDGA2',
 'name': 'all-career-exercises'}
{'_id': ObjectId('65676695309ab36f1dbec875'),
 'channel_id': 'C03T89PMJKG',
 'name': 'all-resources'}
{'_id': ObjectId('65676695309ab36f1dbec876'),
 'channel_id': 'C03TBUCU4UD',
 'name': 'random'}
{'_id': ObjectId('65676695309ab36f1dbec877'),
 'channel_id': 'C03TEQM38HH',
 'name': 'all-ideas'}
{'_id': ObjectId('65676695309ab36f1dbec878'),
 'channel_id': 'C03TEQQS9NF',
 'name': 'all-week1'}
{'_id': ObjectId('65676695309ab36f1dbec879'),
 'channel_id': 'C03U4J8J4LQ',
 'name': 'all-broadcast'}
{'_id': ObjectId('65676695309ab36f1dbec87a'),
 'channel_id': 'C03UG4LHM8A',
 'name': 'tenx-bot'}
{'_id': ObjectId('65676695309ab36f1dbec87b'),
 'channel_id': 'C03V3LK61QX',
 '

## Insert all messages into the mongo database

In [338]:
import uuid

all_messages = []
for channel in  data_loader.channels:

    channel_messages = get_messages_from_channel(f"../data/{channel["name"]}")


    formatted_messages = channel_messages.apply(lambda msg: {
        "msg_id": msg["msg_id"] if msg["msg_id"] else str(uuid.uuid4()),
        "text": msg["text"],
        "user_id": msg["user"],
        "mentions": msg["mentions"] if msg["mentions"] else [],
        "reactions": msg["reactions"] if msg["reactions"] else [],
        "replies": msg["replies"] if msg["replies"] else [],
        "ts": msg["ts"],
        "channel_id": channel["id"]
    }, axis=1)

    all_messages.extend(formatted_messages.to_list())


In [339]:

print("total messages: ", len(all_messages))
print(all_messages[:3])

total messages:  18944
[{'msg_id': '16f68d4e-0ceb-448a-b660-d5ef2eb05305', 'text': '*HOTSEAT ANNOUNCEMENT*', 'user_id': 'U03V1AM5TFA', 'mentions': [], 'reactions': [], 'replies': [], 'ts': '1662620680.298449', 'channel_id': 'C03T0APHX63'}, {'msg_id': '7c641275-2e52-4074-9894-744f049d5377', 'text': "*<!here>* Good morning Community! We are very happy and excited to announce that today's hot seat will be hosting our very own <@U03U1GHT39V>  :wink::clap::skin-tone-2::confetti_ball:   Let's prep our burning questions for him!", 'user_id': 'U03V1AM5TFA', 'mentions': ['U03U1GHT39V'], 'reactions': [{'name': 'fire', 'users': ['U03U9FWPNCE'], 'count': 1}], 'replies': [], 'ts': '1662620806.359419', 'channel_id': 'C03T0APHX63'}, {'msg_id': '245ecc4d-2c1b-4bee-b280-a1fd5ab7fee3', 'text': '*<!here> Community Building Session REMINDER!*:timer_clock:\n*Please note that CBS is on in the next 10min*\n• *Session:* Scavenger hunt\n• *Time*: From 12:00 PM - 12:30 PM UTC\n• *Platform: Gmeet &amp; Slack*\n'

In [340]:
def remove_duplicates(json_list, key):
    seen = set()
    unique_list = []

    for item in json_list:
        item_key = item.get(key)
        if item_key not in seen:
            unique_list.append(item)
            seen.add(item_key)

    return unique_list


# Remove duplicates based on 'msg_id'
unique_json_list = remove_duplicates(all_messages, key="msg_id")


In [341]:
for i in range(len(unique_json_list)):
    db.insert_to_collection("messages", unique_json_list[i])

In [342]:
records = db.find_all('messages')

# Print the first 5 records
for i, record in enumerate(records):
    if i < 5:
        pprint.pprint(record)
    else:
        break


{'_id': ObjectId('65676f5e309ab36f1dbf5e56'),
 'channel_id': 'C03T0APHX63',
 'mentions': [],
 'msg_id': '16f68d4e-0ceb-448a-b660-d5ef2eb05305',
 'reactions': None,
 'replies': None,
 'text': '*HOTSEAT ANNOUNCEMENT*',
 'ts': '1662620680.298449',
 'user_id': 'U03V1AM5TFA'}
{'_id': ObjectId('65676f5e309ab36f1dbf5e57'),
 'channel_id': 'C03T0APHX63',
 'mentions': ['U03U1GHT39V'],
 'msg_id': '7c641275-2e52-4074-9894-744f049d5377',
 'reactions': [{'count': 1, 'name': 'fire', 'users': ['U03U9FWPNCE']}],
 'replies': None,
 'text': '*<!here>* Good morning Community! We are very happy and excited to '
         "announce that today's hot seat will be hosting our very own "
         "<@U03U1GHT39V>  :wink::clap::skin-tone-2::confetti_ball:   Let's "
         'prep our burning questions for him!',
 'ts': '1662620806.359419',
 'user_id': 'U03V1AM5TFA'}
{'_id': ObjectId('65676f5e309ab36f1dbf5e58'),
 'channel_id': 'C03T0APHX63',
 'mentions': [],
 'msg_id': '245ecc4d-2c1b-4bee-b280-a1fd5ab7fee3',
 'reac