Preliminary code to extract the check-in messages in the Slack channel "think-biver-sunday-checkins".

Many of the steps/functions used in "AG_slack-export-data-compilation.ipynb" could be used to further clean this data (PENDING).

The main objective so far was to separate the text into the expected categories: 'project_name', 'working_on', 'progress_and_roadblocks', 'plans_for_following_week', 'meetings'.

    1) When parsing the text, it is assumed that each category starts with the category_name followed by a colon (":"). This works in most cases, but there are exceptions where another symbol, or no symbol at all, is used. PENDING to generalize the first separation of the categories.

    2) There are entries that do not correspond to a real check-in, most of these entries were dropped. They can be SlackBot messages, or messages sent multiple times as a reminder of the expected format for the check-ins.

    3) Some check-in messages contain more than one project. For these cases, each project is assigned to a different row in the final dataframe (preserving all relevant info such as user, msg_id, ...). PENDING to include some edge cases.

    4) Some messages split "progress" from "roadblocks". These cases were combined, keeping the format:
        
        Progress: aaaaaaaaa.
        
        (new_line)
        
        Roadblocks: bbbbbbb.
    
    5) Preliminary stage. Some rows have been added to the dataframe for developing/debugging purposes.

In [None]:
import pandas as pd 
import numpy as np

from os import listdir
from os.path import getmtime, exists, isdir, isfile
from pathlib import Path
import re
import sys

In [None]:
##-- Global variables:
# Placeholder string written wherever a value is missing; it is the value passed
# to handle_missing_values and compared against when combining categories.
missing_value = 'n/d'

# Directory that holds the exported json files of the Slack channel.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
source_path = "/home/agds/Documents/RebeccaEverleneTrust/RebeccaEverlene_Slack_export/think-biver-sunday-checkins"

In [None]:
##-- Accepted spellings (aliases) of the keyword that opens each section of a report.
##-- Aliases are matched case-insensitively and ignoring spaces by identify_categories.
##-- 'progress' and 'roadblocks' are kept as categories of their own (in addition to
##-- 'progress_and_roadblocks') because some messages report them separately.
keywords_dictionary = dict(
    header=['weekly report', 'report', "week's report"],
    project_name=['project name', 'project'],
    working_on=['working on', 'working'],
    progress_and_roadblocks=['progress and roadblocks', 'progress and roadblock'],
    progress=['progress'],
    roadblocks=['roadblocks', 'roadblock'],
    plans_for_following_week=[
        'plans for the following week',
        'plans for next week',
        'plans',
        'following week',
        'next week',
    ],
    meetings=['meetings', 'meet', 'met'],
)

##-- Categories that become columns of the parsed dataframe: every key of the
##-- dictionary except the report header, in the order in which they are declared.
all_keywords = [category for category in keywords_dictionary if category != 'header']

In [None]:
##-- Extract messages from the Slack channel "think-biver-sunday-checkins":

def check_format_of_json_names(list_names):
    """Return the file names that have exactly the format 'yyyy-mm-dd.json'.

    Args:
        list_names: iterable with the names found in a channel's directory
            (for instance the output of ``listdir``).

    Returns:
        A list, in the same order as the input, with the names made of four
        digits, a dash, two digits, a dash, two digits and the '.json' suffix.
        Only the shape of the name is checked, not whether the date is valid.
    """
    # The dot is escaped and the whole name must match: with an unescaped '.'
    # and no end anchor, names such as '2021-03-07xjson' or
    # '2021-03-07.json.bak' would be accepted as well.
    date_json_pattern = re.compile(r'\d{4}-\d{2}-\d{2}\.json')
    return [name for name in list_names if date_json_pattern.fullmatch(name)]

##-- Collect the names of the daily export files ('yyyy-mm-dd.json'):
##-- NOTE(review): listdir returns the names in arbitrary order, so the row order of
##-- checkins_df follows that order; sort json_names if chronological order is needed.
json_names = check_format_of_json_names(listdir(source_path))
if not json_names:
    raise FileNotFoundError(f"No 'yyyy-mm-dd.json' files found in {source_path}")

##-- Read every json file and tag each message with the file it came from:
daily_frames = []
for file in json_names:
    file_df = pd.read_json(Path(source_path) / file)
    file_df['json_name'] = file
    daily_frames.append(file_df)

##-- Concatenate once (concatenating inside the loop copies all previous rows each time):
checkins_df = pd.concat(daily_frames, axis=0, ignore_index=True)

##-- Keep relevant columns:
checkins_df = checkins_df[['user', 'client_msg_id', 'ts', 'json_name', 'text']]

##-- Set dtypes (one dtype per kept column, in the same order as the columns):
checkins_column_names = list(checkins_df.columns)
checkins_column_dtypes = ['string','string','float64','string','string']
checkins_df = checkins_df.astype(dict(zip(checkins_column_names, checkins_column_dtypes)))

##-- Record the resulting dtype of each column:
checkins_column_types = [checkins_df[feature].dtypes for feature in list(checkins_df.columns)]

checkins_df.info()

In [None]:
def handle_missing_values(df, missing_value):
    """Return a new dataframe in which missing entries hold ``missing_value``.

    The input dataframe is not modified. NaT is replaced first, then NaN, and a
    final ``fillna`` covers any remaining missing marker.
    """
    without_nat = df.replace(pd.NaT, missing_value)
    without_nan = without_nat.replace(np.nan, missing_value)
    return without_nan.fillna(missing_value)
    
def get_indices_with_repeated_text(df):
    """Return the index labels of the rows whose text is repeated further down.

    For every group of rows with exactly the same 'text', the last occurrence
    is treated as the one to keep; the labels of all earlier occurrences are
    returned.

    Args:
        df: dataframe with a 'text' column.

    Returns:
        numpy array with the index labels, in the order of ``df.index``
        (empty when no text is repeated).
    """
    kept_indices = set(df[['text']].drop_duplicates(subset=['text'], keep='last').index)
    # Set membership keeps this linear in the number of rows (instead of
    # scanning the list of kept labels once per row).
    return np.array([label for label in df.index if label not in kept_indices])

##-- Check for messages that have repeated text:
indices_same_text = get_indices_with_repeated_text(checkins_df)
print('indices_same_text: ', np.array(indices_same_text), '\n')

##-- Messages explaining how the format of the checkins should be:
##-- NOTE(review): row 10 is assumed to hold one copy of the reminder message; this is
##-- specific to the current export and to the order in which the files were read.
sample_format_msg_text = checkins_df.at[10,'text']
sample_format_msg_indices = checkins_df[checkins_df['text']==sample_format_msg_text].index
print('sample_format_msg_indices: ', np.array(sample_format_msg_indices), '\n')

##-- Messages from USLACKBOT:
bot_indices = checkins_df[checkins_df['user']=='USLACKBOT'].index
print('bot_indices: ', np.array(bot_indices), '\n')

##-- Joined-the-channel messages (plain substring search; rows without text never match):
joined_channel_mask = checkins_df['text'].str.contains('has joined the channel', regex=False, na=False)
joined_channel_indices = checkins_df.index[joined_channel_mask].tolist()
print('joined_channel_indices: ', joined_channel_indices, '\n')

##-- Drop from dataframe:
##-- errors='ignore' because the groups may overlap (e.g. a reminder posted by the bot):
##-- a row removed with an earlier group must not make the next drop fail.
for msg_type_indices in [sample_format_msg_indices, bot_indices, joined_channel_indices]:
    checkins_df = checkins_df.drop(msg_type_indices, axis=0, errors='ignore')

##-- Remaining messages:
#indices_same_text_remaining = get_indices_with_repeated_text(checkins_df)
#print('indices_same_text_remaining: ', np.array(indices_same_text_remaining), '\n')
#checkins_df.loc[indices_same_text_remaining]

##-- Handle missing values:
checkins_df = handle_missing_values(checkins_df, missing_value)

##-- Reset indices so that the rows are labelled 0..n-1 again:
checkins_df = checkins_df.reset_index(drop=True)
checkins_df.info()

In [None]:
def split_report_into_blocks(report_text):
    """Split the raw text of a check-in into blocks, one per line that contains ":".

    The category of each block is identified later; here it is only assumed
    that every keyword is followed by a colon. The first line always opens the
    first block. Each following line that contains ":" opens a new block, and
    each line without ":" is appended (after a newline) to the current block,
    so the text that follows the colon can span several lines.

    Returns the list of blocks, or the global ``missing_value`` string when the
    text is empty.
    """
    if report_text == '':
        return missing_value

    first_line, *following_lines = report_text.splitlines()
    blocks = [first_line]
    for current_line in following_lines:
        if ":" in current_line:
            blocks.append(current_line)
        else:
            blocks[-1] = blocks[-1] + '\n' + current_line
    return blocks
        

def identify_categories(report_by_blocks):
    """Match each block of a check-in to a report category.

    Args:
        report_by_blocks: iterable of strings as produced by
            split_report_into_blocks; each block is expected to look like
            "<keyword>: <text>".

    Returns:
        A dataframe with one column per entry of ``all_keywords`` and one row
        per project, labelled 0, 1, 2, ... A new row starts every time a
        'project_name' block is found. Blocks whose keyword is not recognised
        are ignored, and cells without a matching block are left empty.
        The dataframe has no rows when nothing is recognised.

    (Needs lots of improvement: keywords are not autocorrected -- PENDING.)
    """
    # Normalise the accepted aliases once: lower case, no spaces.
    normalised_aliases = {
        category: {alias.lower().replace(' ', '') for alias in keywords_dictionary[category]}
        for category in all_keywords
    }

    df = pd.DataFrame(columns=all_keywords)
    project = -1
    for item in report_by_blocks:
        raw_key, _, raw_text = item.partition(":")
        # Strip list numbering and markdown decoration from the keyword. The
        # characters are listed explicitly ('-' escaped) so that none is
        # interpreted as a range.
        item_key = re.sub(r'[0-9*+,\-./]', '', raw_key).lstrip()
        item_text = raw_text.replace('*', '').lstrip()
        lookup_key = item_key.lower().replace(" ", "")
        ##-- Autocorrect key if necessary: (PENDING)
        ##-- Compares the block's keyword to expected keywords:
        for category in all_keywords:
            if lookup_key in normalised_aliases[category]:
                if category == 'project_name':
                    project += 1
                # A category found before any project name belongs to the first
                # row; writing it to row -1 would create a stray row that does
                # not line up with the original message later on.
                df.at[max(project, 0), category] = item_text.rstrip()
                break
    return df


def print_test(df):
    """Debugging helper: print every parsed row and report inconsistent ones.

    For each row position the original text and the parsed categories are
    printed, separated by rules, so that the parsing and the separation of
    multi-project check-ins into individual rows can be checked by eye.

    Returns the list of row positions where 'index' and 'index_' differ.
    """
    thin_rule = '\n -------------------------'
    thick_rule = '\n ======================================'
    columns_read = ['text', 'project_name', 'working_on', 'progress_and_roadblocks',
                    'plans_for_following_week', 'meetings', 'index', 'index_']
    missmatch = []
    for row in range(len(df)):
        # Read every cell first, so nothing is printed for a row that cannot be read.
        cells = [df.at[row, column] for column in columns_read]
        *sections, original_id, parsed_id = cells
        last_section = sections.pop()
        print(row, original_id, parsed_id, thin_rule)
        for section in sections:
            print(section, thin_rule)
        print(last_section, thick_rule)
        if original_id != parsed_id:
            missmatch.append(row)
    return missmatch


def get_indices_progress_roadblocks(df, missing_value):
    """Collect the row positions that use the separate 'progress'/'roadblocks' labels.

    Rows are visited by position (0 .. len(df)-1), so the dataframe is expected
    to carry the default 0..n-1 index. A cell counts as "set" when it differs
    from ``missing_value``.

    Returns a list with five lists of row positions, in this order:
        progress: rows where only 'progress' is set.
        roadblocks: rows where only 'roadblocks' is set.
        progress_roadblocks: rows where both 'progress' and 'roadblocks' are set.
        progress_and_roadblocks_true: rows where 'progress_and_roadblocks' is
            NOT set and 'progress' is set, followed by the same test for
            'roadblocks' (a row with both set is therefore listed twice).
        progress_and_roadblocks_other: rows where 'progress_and_roadblocks' is
            NOT set and both 'progress' and 'roadblocks' are set.

    NOTE(review): the name 'progress_and_roadblocks_true' suggests rows that DO
    carry the 'progress_and_roadblocks' label, which is the opposite of what is
    collected; the conditions are left as they are until the intent is confirmed.
    """
    progress = []
    roadblocks = []
    progress_roadblocks = []
    progress_and_roadblocks_true = []
    progress_and_roadblocks_other = []
    for i in range(len(df)):
        try:
            if df.at[i, 'progress'] != missing_value and df.at[i, 'roadblocks'] != missing_value:
                progress_roadblocks.append(i)
            else:
                if df.at[i, 'progress'] != missing_value :
                    progress.append(i)
                if df.at[i, 'roadblocks'] != missing_value :
                    roadblocks.append(i)
            if df.at[i, 'progress_and_roadblocks'] == missing_value and df.at[i, 'progress'] != missing_value:
                progress_and_roadblocks_true.append(i)
            if df.at[i, 'progress_and_roadblocks'] == missing_value and df.at[i, 'roadblocks'] != missing_value:
                progress_and_roadblocks_true.append(i)
            if df.at[i, 'progress_and_roadblocks'] == missing_value and df.at[i, 'progress'] != missing_value and df.at[i, 'roadblocks'] != missing_value:
                progress_and_roadblocks_other.append(i)
        except Exception:
            # Best effort: a row that cannot be read (missing column or label,
            # non-comparable cell) is skipped. Unlike a bare ``except``, this
            # does not swallow KeyboardInterrupt or SystemExit.
            continue
    return [progress, roadblocks, progress_roadblocks, progress_and_roadblocks_true, progress_and_roadblocks_other]


def combine_progress_and_roadblocks(df, missing_value):
    """Merge 'progress' and 'roadblocks' with 'progress_and_roadblocks'.

    A new column 'progress_and_roadblocks_combined' is added to ``df`` (the
    dataframe is modified in place and also returned). For every row the
    available pieces are joined with a newline, in this order:

        <text of 'progress_and_roadblocks'>
        Progress: <text of 'progress'>
        Roadblocks: <text of 'roadblocks'>

    Pieces equal to ``missing_value`` are left out, so a row with only
    'progress' yields "Progress: ...", and a row with the combined label plus a
    separate 'progress' keeps both texts. When none of the three is available
    the combined value is ``missing_value``. The three columns are expected to
    hold strings.

    An alternative is to split 'progress_and_roadblocks', although it is much
    more complicated.

    Args:
        df: dataframe with the columns 'progress_and_roadblocks', 'progress'
            and 'roadblocks'.
        missing_value: placeholder that marks an empty cell.

    Returns:
        ``df`` with the additional column.
    """
    combined_texts = []
    # Iterating over the column values (instead of positional labels) works for
    # any index and for an empty dataframe.
    for both_text, progress_text, roadblocks_text in zip(
        df['progress_and_roadblocks'], df['progress'], df['roadblocks']
    ):
        parts = []
        if both_text != missing_value:
            parts.append(both_text)
        if progress_text != missing_value:
            parts.append('Progress: ' + progress_text)
        if roadblocks_text != missing_value:
            parts.append('Roadblocks: ' + roadblocks_text)
        combined_texts.append('\n'.join(parts) if parts else missing_value)

    df['progress_and_roadblocks_combined'] = combined_texts
    return df


In [None]:
##-- MAIN ANALYSIS:

##-- Parse every check-in; one small dataframe per message is collected here and
##-- concatenated once at the end (concatenating inside the loop copies all the
##-- previous rows on every iteration):
parsed_frames = []

for i in range(len(checkins_df)):
    ##-- Dataframe with the parsed checkin message, with as many rows as projects in the message: 
    text = checkins_df.at[i,'text']
    report_by_blocks = split_report_into_blocks(text)
    df_i_blocks = identify_categories(report_by_blocks)
    ##-- Relabel the rows 0..n-1: the column-wise concat below aligns on row labels,
    ##-- so any other labelling would produce extra, half-empty rows:
    df_i_blocks = df_i_blocks.reset_index(drop=True)
    df_i_blocks['n_projects'] = len(df_i_blocks)
    df_i_blocks['index_'] = i
    if len(df_i_blocks) == 0:
        ##-- Nothing was recognised: keep the message as a single row of missing values
        ##-- (note that 'n_projects' is then also the missing value, not 0):
        df_i_blocks.loc[0] = [missing_value]*len(df_i_blocks.columns)
        df_i_blocks['index_'] = i
    df_i_blocks = handle_missing_values(df_i_blocks, missing_value)
    
    ##-- Dataframe with the original text. Rows are duplicated as many times as projects in the checkin:
    df_i_text = pd.DataFrame([list(checkins_df.loc[i].values)]*len(df_i_blocks))
    df_i_text.columns = checkins_df.columns
    df_i_text['index'] = i
    df_i_text = handle_missing_values(df_i_text, missing_value)

    ##-- Concatenate df_i_text and df_i_blocks for i-th message:
    df_i_all = pd.concat([df_i_text, df_i_blocks], axis=1, ignore_index=True)
    df_i_all.columns = list(df_i_text.columns) + list(df_i_blocks.columns)
    df_i_all = handle_missing_values(df_i_all, missing_value)

    parsed_frames.append(df_i_all)

##-- Dataframe with the original and parsed information. Column order: original
##-- columns, parsed categories, 'n_projects', 'index_' and finally 'index':
parsed_columns = list(checkins_df)+list(all_keywords)+['n_projects','index_']
if parsed_frames:
    checkins_parsed_df = pd.concat(parsed_frames, axis=0, ignore_index=True)
    checkins_parsed_df = checkins_parsed_df[parsed_columns+['index']]
else:
    ##-- No messages at all: empty dataframe with the expected columns.
    checkins_parsed_df = pd.DataFrame(columns=parsed_columns)

##-- Combine "Progress" and "Roadblocks":
checkins_parsed_df = combine_progress_and_roadblocks(checkins_parsed_df, missing_value)
checkins_parsed_df = handle_missing_values(checkins_parsed_df, missing_value)

checkins_parsed_df

In [None]:
##-- Print every parsed row for visual inspection and collect the rows whose 'index' and 'index_' differ:
missmatch = print_test(checkins_parsed_df)

In [None]:
##-- Display the positions of the inconsistent rows (empty when every row is consistent):
np.array(missmatch)