In [None]:
from nltk.corpus import wordnet

def get_word_synonyms_from_sent(word, sent):
    word_synonyms = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemma_names():
#             if lemma in sent and lemma != word:
            if lemma in sent:
                word_synonyms.append(lemma)
    return word_synonyms

word = "patient"
sent = "incorrect patient in the room".split( )
word_synonyms = set(get_word_synonyms_from_sent(word, sent))
print ("WORD:", word)
print ("SENTENCE:", sent)
print ("SYNONYMS FOR '" + word.upper() + "' FOUND IN THE SENTENCE: " + ", ".join(word_synonyms))

In [None]:
fm1 = "At first fraction wrong patient".lower().split( )

fm2 = "Incorrect patient in the room".lower().split( )

def get_word_synonyms_from_sent(fm1, fm2):
    word_synonyms = []
    for word in fm1:
        for synset in wordnet.synsets(word):
            for lemma in synset.lemma_names():
                if lemma in fm2:
                    word_synonyms.append(lemma)
    return word_synonyms

word_synonyms = set(get_word_synonyms_from_sent(fm1, fm2))

print ("SYNONYMS FOR '" + " ".join(fm1) + "' FOUND IN " + " ".join(fm2) + ": " + ", ".join(word_synonyms))

In [None]:
synonyms = []

for syn in wordnet.synsets("misinterpretation"):
    for l in syn.lemmas():
        synonyms.append(l.name())

print(set(synonyms))

In [1]:
import pandas as pd
import os
import numpy as np
import openpyxl
from word_forms.word_forms import get_word_forms
import inflect
p = inflect.engine()

In [2]:
folder = 'data'
input_file = 'iSART process map V3.xlsx'
output_file_separate = 'duplicates_separate.xlsx'
output_file = 'duplicates.xlsx'
sheet_name = 'FMs_count'
rows = 585

In [3]:
input_path = os.path.join(folder, input_file)
output_separate_path = os.path.join(folder, output_file_separate)
output_path = os.path.join(folder, output_file)

In [4]:
df_excel = pd.DataFrame(pd.read_excel(input_path, sheet_name, nrows=rows))

### Extract subprocess id

In [5]:
df_excel['subprocess_id'] = df_excel['Subprocess'].str.strip().str.split('.').str[0]
df_excel.loc[df_excel['subprocess_id'].str.contains('A', na=False), 'subprocess_id'] = -1 # A.
df_excel['subprocess_id'] = df_excel['subprocess_id'].fillna(0) # unknown 

### Extract step id

In [6]:
df_excel['step_id'] = df_excel['Step'].str.strip().str.split(' ').str[0].str.split('.').str[1]
df_excel.loc[df_excel['step_id'].str.contains('A', na=False), 'step_id'] = -1 # A
df_excel.loc[df_excel['step_id'].str.contains('\?', na=False), 'step_id'] = 0 # unknown
df_excel['step_id'] = df_excel['step_id'].fillna(0) # unknown 

### Generate process id

In [7]:
df_excel['process_id'] = df_excel.\
apply(lambda r: (r['subprocess_id'] + '-' + r['step_id']) if r['subprocess_id'] != -1 and r['subprocess_id'] != 0 and r['step_id'] != 0 and r['step_id'] != -1 else np.NaN, axis=1)

## Find duplicates based on keywords

### Group dataframes by process_id

In [8]:
df_group_dict = {k: df for k, df in df_excel.groupby('process_id')}

In [9]:
keywords_list = [
    {
        'process_id': '6-4', 
        'keywords': [
            'Wrong, incorrect, poor, imperfect', 
            'Contouring, delineation, outline, exams, PTV, CTV, GTV, target, organs at risk'
        ]
    },
    {
        'process_id': '6-5', 
        'keywords': [
            'Wrong, incorrect', 
            'Prescription'
        ]
    },
    {
        'process_id': '8-1', 
        'keywords': [
            'Wrong, mistaken, incorrect,  lack, missed', 
            'Patient, identify, identification, treatment data'
        ]
    },
    {
        'process_id': '8-3', 
        'keywords': [
            'fail, lack', 
        ]
    },
    {
        'process_id': '8-4', 
        'keywords': [
            'Wrong, incorrect, not applied, lack, different, no use', 
            'Immobilization device, immobilization aids, treatment accessories'
        ]
    }, 
    {
        'process_id': '8-5', 
        'keywords': [
            'Wrong match, mismatch, misinterpretation'
        ]
    },
    {
        'process_id': '8-8', 
        'keywords': [
            'Collision, monitor',
        ]
    },   
]

### Change word form

In [10]:
def change_word_form(keywords):
    new_keywords = [keyword for keyword in keywords]
    for keyword in keywords:
        if ' ' in keyword:
            new_keywords.append(p.plural(keyword))
        else:
            for value_set in get_word_forms(keyword).values():
                if len(value_set) != 0:
                    new_keywords.extend(value_set)
    return list(set(new_keywords))

def keywords_to_list(keywords_list):
    for step_keywords in keywords_list:
        new_keywords = []
        for group in step_keywords['keywords']:
            keywords = [elem.strip() for elem in group.lower().split(', ')]
            keywords = change_word_form(keywords)
            new_keywords.append(keywords)
        
        step_keywords['keywords'] = new_keywords
        
keywords_to_list(keywords_list)

In [11]:
keywords_duplicates_dict = {}

def find_duplicates_via_keywords(df_group_dict, keywords_list):
    for fm_keywords in keywords_list:
        for process_id, df in df_group_dict.items():
            if fm_keywords['process_id'] == process_id:
                duplicates = []
                for index, row in df.iterrows():
                    fm_list = row['F_m'].strip().lower().split( )
                    count = 0
                    for group in fm_keywords['keywords']: 
                        for word in fm_list:
                            # remove apostrophe
                            word = word.replace('’s', '')
                            if word in group:
                                count += 1
                                break
                    if count == len(fm_keywords['keywords']):
                        duplicates.append(row['ID'])
                keywords_duplicates_dict[process_id] = duplicates
                                
find_duplicates_via_keywords(df_group_dict, keywords_list)

## Find duplicates based on separator (i.e., :)

### Filter dataframes only from source 05AR03

In [35]:
df_source = df_excel[(df_excel['Source'] == '05AR03')]
# df_source = df_excel[(df_excel['Source'] == '05AR03') & (df_excel['subprocess_id'].isin(['6', '8']))]
# df_source = df_excel[(df_excel['Source'] == '05AR03') & (df_excel['process_id'] == '8-4')]
df_group_source_dict = {k: df for k, df in df_source.groupby('process_id')}

In [36]:
source_duplicates_dict = {}

def find_duplicates_via_colon(df_group_source_dict):
    temp = {}
    for process_id, df in df_group_source_dict.items():
        for index, row in df.iterrows():
            if ':' in row['F_m']:
                general_fm = row['F_m'].split(':')[0].strip().lower()
                if process_id not in temp:
                    temp[process_id] = {general_fm: [row['ID']]}
                else:
                    if general_fm not in temp[process_id].keys():
                        temp[process_id][general_fm] = [row['ID']]
                    else:
                        temp[process_id][general_fm].append(row['ID'])
                        
    for process_id, fm_obj in temp.items():
        source_duplicates_dict[process_id] = [id_list for general_fm, id_list in fm_obj.items()]

find_duplicates_via_colon(df_group_source_dict)

## Find duplicates using the exact same wording

In [14]:
df_same = df_excel[df_excel['F_m_count'] != 1]

In [15]:
exact_same_duplicates_dict = {}

def find_duplicates_via_same(df):
    temp = {}
    for index, row in df.iterrows():
        if row['process_id'] not in temp:
            temp[row['process_id']] = {row['F_m']: [row['ID']]}
        else:
            if row['F_m'] not in temp[row['process_id']].keys():
                temp[row['process_id']][row['F_m']] = [row['ID']]
            else:
                temp[row['process_id']][row['F_m']].append(row['ID'])
                
    for process_id, fm_obj in temp.items():
        exact_same_duplicates_dict[process_id] = [id_list for general_fm, id_list in fm_obj.items()]

find_duplicates_via_same(df_same)

## Export to Excel

In [16]:
df_drop = df_excel.drop(['subprocess_id', 'step_id', 'process_id'], axis=1)

In [37]:
duplicate_IDs = []

def export_to_separate_sheet(ditc_list, output_path):
    with pd.ExcelWriter(output_path) as writer: 
        for duplicates_dict, filename in ditc_list:
            if filename == 'keywords_':
                duplicates = []
                for process_id, ids in duplicates_dict.items():
                    duplicates.extend(ids)
                    df_drop.loc[df_drop['ID'].isin(ids)].to_excel(writer, filename + process_id, index=False)
                duplicate_IDs.append(duplicates)
            else:
                duplicates = []
                for process_id, id_list in duplicates_dict.items():
                    new_ids = []
                    for ids in id_list:
                        if len(ids) > 1:
                            duplicates.extend(ids)
                            new_ids.extend(ids)
                    df_drop.loc[df_drop['ID'].isin(new_ids)].to_excel(writer, filename + process_id, index=False)
                duplicate_IDs.append(duplicates)
            
export_to_separate_sheet([(keywords_duplicates_dict, 'keywords_'), 
                 (source_duplicates_dict, '05AR03_'), 
                 (exact_same_duplicates_dict, 'same_')], output_separate_path)

In [18]:
def export_to_one_sheet(ditc_list, output_path):
    with pd.ExcelWriter(output_path) as writer: 
        for duplicates_dict, filename in ditc_list:
            id_collection = []
            if filename == 'keywords':
                for process_id, ids in duplicates_dict.items():
                    id_collection.extend(ids)
            else:
                for process_id, id_list in duplicates_dict.items():
                    for ids in id_list:
                        if len(ids) > 1:
                            id_collection.extend(ids)
            df_drop.loc[df_drop['ID'].isin(id_collection)].to_excel(writer, filename, index=False)
            
export_to_one_sheet([(keywords_duplicates_dict, 'keywords'), 
                 (source_duplicates_dict, '05AR03'), 
                 (exact_same_duplicates_dict, 'same')], output_path)

In [19]:
unique_ids = []

def export_without_duplicates(id_list, output_path, sheet_name):
    unique_ids = list(set([item for sublist in duplicate_IDs for item in sublist]))
    with pd.ExcelWriter(output_path, engine='openpyxl', mode='a') as writer: 
         df_drop.loc[df_drop['ID'].isin(unique_ids)].sort_values(by=['Subprocess', 'Step'], ascending=[True, True]).to_excel(writer, sheet_name, index=False) 
            
export_without_duplicates(duplicate_IDs, output_path, 'no_duplicates')

In [28]:
def count_duplicates(ids):
    df = df_excel.loc[df_excel['ID'].isin(ids)]
    df_dict = {k: df for k, df in df.groupby('process_id')}
    count_dict = {k: len(df) for k, df in df.groupby('process_id')}
    return df_dict, count_dict

In [30]:
id_dict_list = []
for ids in duplicate_IDs:
    df_dict, count_dict = count_duplicates(ids)
    id_dict_list.append(count_dict)
id_dict_list

[{'6-4': 15, '6-5': 2, '8-1': 7, '8-3': 4, '8-5': 2, '8-8': 7},
 {'4-11': 2,
  '4-8': 4,
  '5-1': 5,
  '6-4': 6,
  '6-5': 2,
  '6-6': 39,
  '6-7': 6,
  '6-8': 6,
  '6-9': 32,
  '8-2': 5,
  '8-4': 9,
  '8-5': 2,
  '8-8': 12},
 {'10-1': 2, '6-6': 18, '6-9': 27, '8-2': 3, '8-4': 3, '8-6': 4}]

In [32]:
unique_ids = list(set([item for sublist in duplicate_IDs for item in sublist]))
df_dict, count_dict = count_duplicates(unique_ids)
count_dict

{'10-1': 2,
 '4-11': 2,
 '4-8': 4,
 '5-1': 5,
 '6-4': 17,
 '6-5': 4,
 '6-6': 39,
 '6-7': 6,
 '6-8': 6,
 '6-9': 32,
 '8-1': 7,
 '8-2': 5,
 '8-3': 4,
 '8-4': 9,
 '8-5': 3,
 '8-6': 4,
 '8-8': 17}

In [12]:
characters = []
for index, row in df_excel.iterrows():
    fm = row['F_m'].replace(' ', '')
    for c in fm:
        if not c.isalnum():
            characters.append(c)

"  ".join(list(set(characters)))

'(  +  “  –  #  >  ,  ®  ?  \n  °  ”  ≤  <  *  -  ;  )  &  .  :  /  ’'

In [None]:
test = 'Missed patient’s photo'.replace(' ', '')
# for c in test:
#     if not c.isalnum():
#         characters.append(c)
characters = [c for c in test if not c.isalnum()]

list(set(characters))