# Converting Q&A Slack messages into text format
### Alice Yu, 09/02/23

In [1]:
import os
import pandas as pd
import numpy as np
import pandas.io.json as pd_json

In [2]:
# Function for cleaning message, replacing chars, etc.

def cleanMsg(msg, replace_dict):
    
    # Replace username ID with usernames
    for keys in replace_dict:
        msg = msg.replace(keys, replace_dict[keys])
        
    # Remove slack formatting characters
    msg = msg.replace("&gt; ", "") 
    msg = msg.strip()
    msg = msg.capitalize()
    msg = msg.replace("&amp;", "&")
    msg = msg.replace("<", "")
    msg = msg.replace(">", "")
    
    return(msg)

# Function that converts data frame into dictionary that stores timestamped messages
# key is timestamp
# value is user, text, and reply list

def convertDFtoDict(df_curr):
    msg_dict = {}
    for i in range(0,df_curr.shape[0]):

        # Saving variables
        timestamp = str(df_curr["ts"].iloc[i,])
        username = users_dict[df_curr["user"].iloc[i,]]
        msg_text = str(df_curr["text"].iloc[i,])
        
        # Appending time stamps that are associated with each thread
        replies_ts = []
        try:
            for reply in df_curr["replies"].iloc[i,]:
                replies_ts.append(reply["ts"])
        except TypeError:
            var = 0
                        
        # Creating timestamp message entry
        msg_dict[timestamp] = {"user_id":username, 
                               "msg_text":msg_text, 
                               "reply_count":replies_ts}
        
    return(msg_dict)

# USER VARIABLES

In [3]:
# Change variables based on your analysis

# OPTIONAL: setting path to folder where files are located
filepath = "/Users/aliceyu/Documents/BitsInBio/"

# Path to json files with Slack messages
jsonfiles = os.listdir(filepath + "q-and-a/")

outputfolder = "converted-q-and-a/"

interview_company = "Poly" #name of company. this is used for the output file
qanda_dates = ["2022-01-20", "2022-01-21", "2022-01-27"] # dates that the interview spanned across



### Creating a data frame with all the dates

In [4]:
df_list = []
for i in range(0, len(jsonfiles)):
    if ".json" in jsonfiles[i]:
        df_curr = pd.read_json(filepath + "q-and-a/" + jsonfiles[i]) #, dtype = "object", encoding="latin-1")
        df_curr["date"] = jsonfiles[i].replace(".json", "")
        df_list.append(df_curr)
    
df = pd.concat(df_list)

# Filter out default messages that are not relevant to the Q-and-A
df = df[df["text"].str.contains("has joined the channel") == False]
df = df[df["text"].str.contains("Set the channel topic") == False]

In [5]:
df.head(2)

Unnamed: 0,client_msg_id,type,text,user,ts,team,user_team,source_team,user_profile,attachments,...,subtype,root,bot_id,bot_profile,files,upload,display_as_bot,purpose,inviter,topic
0,5de6bc88-704b-4941-81de-2b06abc4d2ae,message,"Hey,\n\nI would kindly invite you to join our ...",U03RA8H1RMG,1661327000.0,T02RKFYQV5L,T02RKFYQV5L,T02RKFYQV5L,"{'avatar_hash': 'aa45bbd49037', 'image_72': 'h...","[{'from_url': 'http://scverse.zulipchat.com/',...",...,,,,,,,,,,
1,ed247811-15b0-43ae-92a8-362cbb1f9018,message,"This is great, thanks a lot!",U03URSK2PJR,1661329000.0,T02RKFYQV5L,T02RKFYQV5L,T02RKFYQV5L,"{'avatar_hash': '077a2bba0f9c', 'image_72': 'h...",,...,,,,,,,,,,


### Creating a user dictionary that matches ID to name

In [6]:
users_dict = {}
for i in range(0,df.shape[0]):
    user_id = str(df["user"].iloc[i,])

    if user_id not in users_dict:
        try:
            username = str(df["user_profile"].iloc[i,]["first_name"]).capitalize()
            users_dict[user_id] = username
        except TypeError:
            users_dict[user_id] = "None"


### Pull out q-and-a dates of interest from df 

In [7]:
df_qanda = df[df["date"].isin(qanda_dates)]
qandamsg_dict = convertDFtoDict(df_qanda)
allmsg_dict = convertDFtoDict(df)

### Parse through messages and output Q&A text 

In [8]:
output_file = open(filepath + 
                   outputfolder + 
                   interview_company + 
                   "_interview.txt", "w")

visited_ts = []
question_ticker = 1

# Sort through dictionary where the messages are stored
for ts in dict(sorted(qandamsg_dict.items())):
    if ts not in visited_ts:
        msg_text = cleanMsg(qandamsg_dict[ts]["msg_text"], users_dict)
        
        # Designate when the next question within the main thread was asked
        #if "?" in msg_text:
        output_file.write("\n")
        output_file.write("### Main Thread Question " + str(question_ticker) + ":\n\n")
        question_ticker = question_ticker + 1
                        
        output_file.write(str(qandamsg_dict[ts]["user_id"]) + 
                          ":\t" + msg_text + "\n\n")
        visited_ts.append(ts)
        
        if len(qandamsg_dict[ts]["reply_count"]) > 0:
            for ts_reply in qandamsg_dict[ts]["reply_count"]:
                try:
                    reply_msg_text = cleanMsg(allmsg_dict[ts_reply]["msg_text"], users_dict)
                    output_file.write(str(allmsg_dict[ts_reply]["user_id"]) + 
                                      ":\t" + reply_msg_text + "\n\n")
                except KeyError:
                    output_file.write("Message Outside of Time Window Provided\n\n")
                    
                visited_ts.append(ts_reply)