In [None]:
import pandas as pd 
import numpy as np

from os import listdir
from os.path import getmtime, exists, isdir, isfile
from pathlib import Path
import re

In [None]:
##-- Accepted spellings (keywords) for each category of a weekly report.
##-- 'progress' and 'roadblocks' are also categories of their own, next to the combined one.
keywords_dictionary = dict(
    header=['weekly report', 'report', "week's report"],
    project_name=['project name', 'project'],
    working_on=['working on', 'working'],
    progress_and_roadblocks=['progress and roadblocks', 'progress and roadblock'],
    progress=['progress'],
    roadblocks=['roadblocks', 'roadblock'],
    plans_for_following_week=[
        'plans for the following week',
        'plans for next week',
        'plans',
        'following week',
        'next week',
    ],
    meetings=['meetings', 'meet', 'met'],
)

##-- Categories that are extracted from a report: every key above except 'header',
##-- in the order in which the dictionary declares them.
all_keywords = [category for category in keywords_dictionary if category != 'header']

In [None]:
def check_format_of_json_names(list_names):
    """Keep only the file names that are exactly of the form 'yyyy-mm-dd.json'.

    Parameters
    ----------
    list_names : iterable of str
        Candidate file names (for example the output of ``os.listdir``).

    Returns
    -------
    list of str
        The names that match, in the order in which they were given.
    """
    ##-- fullmatch() rejects trailing characters such as '.json.bak', and the escaped
    ##-- dot no longer accepts an arbitrary character in front of 'json':
    date_file_pattern = re.compile(r'\d{4}-\d{2}-\d{2}\.json')
    return [name for name in list_names if date_file_pattern.fullmatch(name)]
    

source_path = "/home/agds/Documents/RebeccaEverleneTrust/RebeccaEverlene_Slack_export/think-biver-sunday-checkins"

##-- listdir() returns the names in arbitrary order; sorting the 'yyyy-mm-dd.json' names
##-- makes the row order (and therefore the row labels used below) chronological and reproducible:
json_names = sorted(check_format_of_json_names(listdir(source_path)))
if not json_names:
    raise FileNotFoundError(f"No 'yyyy-mm-dd.json' files found in {source_path}")

##-- Read every daily export, remember which file each row came from, and concatenate once
##-- (concatenating inside the loop copies the growing frame on every iteration):
daily_frames = []
for file in json_names:
    file_df = pd.read_json(source_path+'/'+file)
    file_df['json_name'] = file
    daily_frames.append(file_df)
checkins_df = pd.concat(daily_frames, axis=0, ignore_index=True)

##-- Keep only the columns used by the rest of the notebook:
checkins_df = checkins_df[['user', 'ts', 'json_name', 'text']]
##-- Number of messages before any row is removed:
initial_length = len(checkins_df)

In [None]:
missing_value = 'n/d'

def split_report_into_blocks(report_text):
    """Cut a report into blocks, before any category is assigned to them.

    The first line always opens the first block. After that, every line that contains
    a colon (":") opens a new block, and every line without one is appended, preceded
    by a newline, to the block that is currently open. The text after a keyword can
    therefore span several lines.

    Returns the list of blocks, or `missing_value` when `report_text` is the empty string.
    """
    if report_text == '':
        return missing_value

    first_line, *following_lines = report_text.splitlines()
    report_by_blocks = [first_line]
    for current_line in following_lines:
        if ":" in current_line:
            ##-- A colon marks the start of a new block:
            report_by_blocks.append(current_line)
        else:
            ##-- Continuation of the block that is currently open:
            report_by_blocks[-1] += '\n' + current_line
    return report_by_blocks
        
def identify_categories(report_by_blocks):
    """Assign the blocks produced by split_report_into_blocks to report categories.

    For each block, the text in front of the first ":" is the candidate keyword and the
    text behind it is the content; asterisks are removed from both. The candidate is
    compared, ignoring case and spaces, with the keywords of every category in
    `all_keywords`, and the first category that matches receives the content.

    Returns a dictionary {category: [content, content, ...]}. If no block matches any
    category, the dictionary is {'review': 1} instead.
    """
    dict_out = {}
    for block in report_by_blocks:
        raw_key, _, raw_text = block.partition(":")
        item_key = raw_key.replace('*', '').lstrip()
        item_text = raw_text.replace('*', '').lstrip()
        ##-- Autocorrect key if necessary: (PENDING)
        normalised_key = item_key.lower().replace(" ", "")
        ##-- Compare the block's keyword with the expected keywords, category by category:
        for category in all_keywords:
            accepted_keys = {keyword.lower().replace(' ', '') for keyword in keywords_dictionary[category]}
            if normalised_key not in accepted_keys:
                continue
            dict_out.setdefault(category, []).append(item_text.rstrip())
            break

    ##-- Nothing was recognised: flag the report for manual review.
    if not dict_out:
        dict_out['review'] = 1

    return dict_out



In [None]:
joined_channel = []   ## row labels of the "has joined the channel" messages removed below
review_text = []      ## row labels of the messages in which no category keyword was recognised

##-- Create every category column up front with object dtype, so that `.at` can store a
##-- list in a single cell and so that later cells can rely on all the columns being present:
for category in all_keywords:
    if category not in checkins_df.columns:
        checkins_df[category] = pd.Series(np.nan, index=checkins_df.index, dtype=object)
    else:
        checkins_df[category] = checkins_df[category].astype(object)

for i in list(checkins_df.index):
    text = checkins_df.at[i, 'text']
    ##-- Messages without usable text (NaN, the 0 placeholder after a re-run, or '') are skipped
    ##-- explicitly instead of relying on a catch-all except:
    if not isinstance(text, str) or text == '':
        continue
    ##-- For now let's drop the "has joined the channel" messages to debug the rest:
    if 'has joined the channel' in text:
        joined_channel.append(i)
        continue

    report_by_blocks = split_report_into_blocks(text)
    report_dict = identify_categories(report_by_blocks)
    for key, values in report_dict.items():
        if values == 1:
            ##-- identify_categories found no keyword: remember the row for manual review
            review_text.append(i)
        elif len(values) == 1:
            ##-- The category appears once: store the plain text
            checkins_df.at[i, key] = values[0]
        elif len(values) > 1:
            ##-- The category appears several times: store all the texts as a list
            checkins_df.at[i, key] = values

##-- Remove the join notifications in one go; the remaining rows keep their original labels:
checkins_df = checkins_df.drop(index=joined_channel)

## Fill missing values with 0:
checkins_df = checkins_df.fillna(0).replace('nan', 0)

checkins_df.info()

In [None]:
##-- Separate checkins into as many rows as projects are in a single text:
def split_projects(df):
    """Duplicate the check-in rows that report on more than one project.

    A row counts as multi-project when at least one of the category columns listed in
    `all_keywords` holds a list in that row. Such a row is appended twice to the output
    and both copies get `multiple_projects = 1`; every other row is appended once.
    The first row of `df` is copied as is and is never checked.

    Parameters
    ----------
    df : pandas.DataFrame
        Check-ins with (some of) the category columns of `all_keywords`.

    Returns
    -------
    (df_out, counter)
        df_out : new DataFrame with a fresh 0..n-1 index.
        counter : labels (in `df`) of the rows that were duplicated.
    """
    ##-- Initialize df_out with first element of df:
    df_out = df[:1].copy()
    counter = []
    ##-- A category column that is absent from df cannot hold a list, so it is simply not checked:
    present_features = [feature for feature in all_keywords if feature in df.columns]

    for index in list(df.index)[1:]:
        ##-- Check if the row needs to be splitted:
        split_row = any(isinstance(df.at[index, feature], list) for feature in present_features)
        df_i = pd.DataFrame(df.loc[index].copy()).T

        ##-- Always extend the complete df_out: slicing it with the row label (df_out[:index+1])
        ##-- is positional and cuts off rows once more rows were duplicated than were dropped.
        if not split_row:
            df_out = pd.concat([df_out, df_i], axis=0, ignore_index=True)
            continue

        ##-- If so, duplicate the row:
        df_out = pd.concat([df_out, df_i, df_i], axis=0, ignore_index=True)
        counter.append(index)
        df_out.at[int(df_out.index[-1]), 'multiple_projects'] = 1
        df_out.at[int(df_out.index[-2]), 'multiple_projects'] = 1

        n_projects = 2

        ##-- NOTE(review): the extraction below is kept exactly as it was. np.arange(1, n_projects)
        ##-- yields only 1, so only the LAST copy is filled; project_number advances once per
        ##-- feature (not once per copy); and plain strings are indexed too, which stores a single
        ##-- character in '<feature>_1'. Confirm the intended layout before relying on these columns.
        project_new_indices = [int(df_out.index[-i]) for i in np.arange(1, n_projects, 1)]
        project_number = 0
        for new_index in project_new_indices:
            for feature in all_keywords:
                try:
                    df_out.at[new_index, f"{feature}_1"] = df_out.at[new_index, feature][project_number]
                except (KeyError, TypeError, IndexError):
                    ##-- Column absent, value not indexable (e.g. 0), or fewer entries than project_number
                    continue
                project_number += 1

    return df_out, counter

In [None]:
##-- Run the splitter on the parsed check-ins and preview the outcome:
test, counter = split_projects(checkins_df)
print(counter)   ## labels of the rows that were duplicated
test.head(10)

In [None]:
##-- Inspecting scenarios for progress_and_roadblocks:
progress = []
roadblocks = []
progress_roadblocks = []
progress_and_roadblocks_true = []
progress_and_roadblocks_other = []
for i in range(initial_length):
    try:
        if checkins_df.at[i, 'progress'] != 0 and checkins_df.at[i, 'roadblocks'] != 0:
            progress_roadblocks.append(i)
        else:
            if checkins_df.at[i, 'progress'] != 0 :
                progress.append(i)
            if checkins_df.at[i, 'roadblocks'] != 0 :
                roadblocks.append(i)
        if checkins_df.at[i, 'progress_and_roadblocks'] == 0 and checkins_df.at[i, 'progress'] != 0:
            progress_and_roadblocks_true.append(i)
        if checkins_df.at[i, 'progress_and_roadblocks'] == 0 and checkins_df.at[i, 'roadblocks'] != 0:
            progress_and_roadblocks_true.append(i)
        if checkins_df.at[i, 'progress_and_roadblocks'] == 0 and checkins_df.at[i, 'progress'] != 0 and checkins_df.at[i, 'roadblocks'] != 0:
            progress_and_roadblocks_other.append(i)
    except:
        continue

print(len(progress_roadblocks), progress_roadblocks)          ## both 'progress' and 'roadblocks' are filled
print(len(progress), progress)                                ## additional rows where only 'progress' in filled
print(len(roadblocks), roadblocks)                            ## additional rows where only 'roadblocks' in filled
print(len(progress_and_roadblocks_true), progress_and_roadblocks_true) ## rows where 'progress_and_roadblocks' is filled
#14+18+21

#print('\n', progress_and_roadblocks_other)

##-- Notes:
##-- When properly identified, the distinction between 'progress_and_roadblocks', 'progress' and 'roadblocks' works.
##-- The messages in 'progress_and_roadblocks_other' are empty for some other reason that still needs to be debugged.

In [None]:
##-- Formating 'progress_and_roadblocks':
def combine_progress_and_roadblocks(df):
    """Write a combined progress/roadblocks text into 'progress_and_roadblocks_combined'.

    For every row whose 'progress_and_roadblocks' is empty (0), the new column becomes:
        "Progress: progress_text
         new_line
         Roadblocks: roadblocks_text"
    where each part is only present if the corresponding column is filled (not 0).
    Rows whose 'progress_and_roadblocks' is already filled get the empty string.
    A column that does not exist in `df` is treated as empty for every row.
    Rows in which a needed value is not a string (for instance a list coming from a
    multi-project report) cannot be concatenated and are left as NaN.
    An alternative is to split 'progress_and_roadblocks' although it is much more complicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place: the column 'progress_and_roadblocks_combined' is added.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    def _cell(row_label, column):
        ##-- A column that was never created means that no report used that keyword
        return df.at[row_label, column] if column in df.columns else 0

    combined = []
    for i in df.index:
        pr_text = ''
        if _cell(i, 'progress_and_roadblocks') == 0:
            progress_value = _cell(i, 'progress')
            roadblocks_value = _cell(i, 'roadblocks')
            needed = [value for value in (progress_value, roadblocks_value) if value != 0]
            if any(not isinstance(value, str) for value in needed):
                ##-- Not concatenable with the labels: leave the row unset
                combined.append(np.nan)
                continue
            if progress_value != 0:
                pr_text += 'Progress: ' + progress_value + '\n'
            if roadblocks_value != 0:
                pr_text += 'Roadblocks: ' + roadblocks_value
        combined.append(pr_text)

    df['progress_and_roadblocks_combined'] = combined
    return df

##-- Work on a copy, so that checkins_df itself does not get the extra column:
df = combine_progress_and_roadblocks(checkins_df.copy())
df.columns