In [67]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff

%matplotlib inline
sns.set()

## Automatic error annotation

In [165]:
# Load the OntoBiotope paths file into a two-column frame (V1, V2).
# Intended layout once the cell has run: V1 = term ID, V2 = term path, V3 = term.
# V3 does not exist yet; it is derived from the raw V2 text further down.
path = pd.read_fwf(
    'OntoBiotope_BioNLP-OST-2019_paths.txt',
    header=None,
    names=['V1', 'V2'],
)

def term_path(x):
    """Return the part of ``x`` before its first tab (all of ``x`` if it has no tab)."""
    before_tab, _sep, _rest = x.partition('\t')
    return before_tab

def get_term_from_path(x):
    """Return the second tab-separated field of ``x``.

    Raises
    ------
    IndexError
        If ``x`` contains no tab, so there is no second field.
    """
    fields = x.split('\t')
    return fields[1]

# Split the raw V2 text on its tab: the field after the tab becomes the term (V3),
# the field before it stays in V2 as the term path.
# V3 has to be extracted first because the second assignment overwrites V2.
path['V3'] = path['V2'].apply(get_term_from_path)
path['V2'] = path['V2'].apply(term_path)


# path : from '/'-separated string to list of path components
# (the piece before the first '/' is dropped).
# The split is written inline instead of calling str2list: that helper is only
# defined further down in this cell, so it is not bound yet on a fresh kernel.
path['V2'] = path['V2'].apply(lambda raw_path: raw_path.split('/')[1:])


# Read the prediction results, keeping only the columns used below.
# The ID / name information is extracted from 'reference' and 'predict' afterwards.
df = pd.read_csv(
    'contes_500_dev_annotated.csv',
    sep=';',
    usecols=['name', 'match', 'score', 'type', 'word', 'reference', 'predict'],
)

def extract_id(x):
    """Return every ``OBT:<digits>`` identifier found in ``x``, in order of appearance.

    The result is an empty list when ``x`` contains no such identifier.
    """
    return [hit.group(0) for hit in re.finditer("OBT:\\d+", x)]

# tokenize name string by ' ' (names separated by '-' are not split yet, e.g. "gram-negative")
def extract_name(x):
    """Return the space-separated tokens of the last parenthesised name in ``x``.

    Parameters
    ----------
    x : str
        Text such as ``'OBT:003146 (fermented milk)'``.

    Returns
    -------
    list of str
        Tokens of the last ``(...)`` group, split on single spaces.
        When ``x`` holds several groups only the last one is used.
        When ``x`` holds no group, an empty list is returned.
    """
    names = re.findall(r'\((.*?)\)', x)
    if not names:
        # No "(name)" part: return an empty token list instead of failing on an
        # unassigned local variable.
        return []
    return names[-1].split(' ')

def list2str(x):
    """Concatenate the strings in ``x`` with no separator between them."""
    return "".join(x)

def str2list(x):
    """Split ``x`` on '/' and return every piece except the first one."""
    _first, *components = x.split('/')
    return components

def word2list(x):
    """Split ``x`` on runs of whitespace and return the resulting tokens."""
    tokens = x.split()
    return tokens


# Pull the OBT IDs, then the parenthesised names, out of the raw
# 'reference' / 'predict' text.
for id_column, text_column in (('reference_id', 'reference'), ('predict_id', 'predict')):
    df[id_column] = df[text_column].apply(extract_id)

for name_column, text_column in (('reference_name', 'reference'), ('predict_name', 'predict')):
    df[name_column] = df[text_column].apply(extract_name)


# list to string: join the predicted IDs of each row into a single string
df['predict_id'] = df['predict_id'].apply(list2str)

# string to list: split 'word' on whitespace
df['word'] = df['word'].apply(word2list)


# Keep only the rows that carry exactly one reference ID, then renumber the
# rows from 0. A boolean mask filters the frame in one pass instead of dropping
# rows one at a time inside a Python loop.
has_single_reference = df['reference_id'].apply(len) == 1
df = df[has_single_reference].reset_index(drop=True)

# list to string: every remaining reference_id is a one-element list
df['reference_id'] = df['reference_id'].apply(list2str)

# initiate error_type & word_len with one placeholder per row (the row number)
row_numbers = range(len(df))
# error_type is later filled with text labels, so its placeholders are stored in
# an object column rather than an int64 one (no dtype upcast on assignment).
df['error_type'] = pd.Series(row_numbers, index=df.index, dtype=object)
df['word_len'] = row_numbers

# initiate 2nd level reference column; it is later filled with term IDs (text),
# hence object dtype here as well
df['sous_class'] = pd.Series(row_numbers, index=df.index, dtype=object)

In [166]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,name,match,score,type,word,reference,predict,reference_id,predict_id,reference_name,predict_name,error_type,word_len,sous_class
0,BB-norm-20580604,MM,0.0313,Habitat,[['bottle']],OBT:003146 (fermented milk),OBT:001456 (bottle),OBT:003146,OBT:001456,"[fermented, milk]",[bottle],0,0,0
1,BB-norm-25114119,MM,0.0369,Habitat,"[['human',, 'macrophages']]",OBT:002995 (macrophage),OBT:002488 (human),OBT:002995,OBT:002488,[macrophage],[human],1,1,1
2,BB-norm-11989773,MM,0.0445,Habitat,[['chicken']],OBT:002394 (chicken meat),OBT:003300 (poultry),OBT:002394,OBT:003300,"[chicken, meat]",[poultry],2,2,2
3,BB-norm-11989773,MM,0.0445,Habitat,[['chicken']],OBT:002394 (chicken meat),OBT:003300 (poultry),OBT:002394,OBT:003300,"[chicken, meat]",[poultry],3,3,3
4,BB-norm-10496597,MM,0.045,Habitat,"[['monoclonal',, 'B',, 'cells']]",OBT:001623 (lymphocyte),OBT:001480 (cheese),OBT:001623,OBT:001480,[lymphocyte],[cheese],4,4,4


## Defining Error type 

In [167]:
def _classify_pair(score, ref_path, pred_path):
    """Label one (reference path, predicted path) pair with an error type.

    Parameters
    ----------
    score : number
        The row's ``score`` value; a score equal to 1 is labelled 'Correct'.
    ref_path, pred_path : list
        Ontology paths (lists of term IDs) of the reference and predicted term.

    Returns
    -------
    str
        'Correct', 'Precise', 'General1', 'General2', 'General3', 'Too General',
        'General', 'Wrong type', 'Wrong2', 'Wrong3', 'Wrong4' or 'Wrong class'.
    """
    if score == 1:
        return 'Correct'

    # Every term of the reference path also appears in the predicted path.
    if set(ref_path).issubset(pred_path):
        return 'Precise'

    # Every term of the predicted path also appears in the reference path.
    if set(pred_path).issubset(ref_path):
        # general class classification : level 1/2/3 + too general
        level = len(ref_path) - len(pred_path)
        general_labels = {1: 'General1', 2: 'General2', 3: 'General3'}
        if level in general_labels:
            return general_labels[level]
        if len(pred_path) < 3:
            return 'Too General'
        return 'General'

    # The two paths differ at index 1.
    if pred_path[1] != ref_path[1]:
        return 'Wrong type'

    # wrong class classification : wrong2/3/4 + the rest of wrong class
    # wrong_level = number of terms that are not shared, counted on both paths.
    common_len = len(set(ref_path).intersection(pred_path))
    wrong_level = (len(pred_path) - common_len) + (len(ref_path) - common_len)
    wrong_labels = {2: 'Wrong2', 3: 'Wrong3', 4: 'Wrong4'}
    return wrong_labels.get(wrong_level, 'Wrong class')


# Text labels are written into these columns below; holding them as object
# columns avoids relying on an int -> str dtype upcast during assignment.
for label_column in ('error_type', 'sous_class'):
    df[label_column] = df[label_column].astype(object)

for i in range(len(df)):
    # All ontology paths registered for the reference / predicted term ID.
    path_ref = path.loc[path['V1'] == df.at[i, 'reference_id'], 'V2'].values
    path_pred = path.loc[path['V1'] == df.at[i, 'predict_id'], 'V2'].values

    # annotate sous-class with the term at index 2 of the first reference path
    # (df.at writes into df itself; chained df[col][i] = ... may hit a temporary copy)
    df.at[i, 'sous_class'] = path_ref[0][2]

    df.at[i, 'word_len'] = len(df.at[i, 'word'])

    # annotate error type: every (reference path, predicted path) pair is
    # classified and the label of the last pair is the one that is kept.
    # Rows without any pair keep their placeholder value.
    for ref_path in path_ref:
        for pred_path in path_pred:
            df.at[i, 'error_type'] = _classify_pair(df.at[i, 'score'], ref_path, pred_path)

In [168]:
# replace sous_class ID by terms
SOUS_CLASS_TERMS = {
    'OBT:000004': 'animal husbandry and agricultural habitat',
    'OBT:000006': 'artificial environment',
    'OBT:000007': 'experimental medium',
    'OBT:000008': 'food',
    'OBT:000009': 'habitat wrt chemico-physical property',
    'OBT:000010': 'living organism',
    'OBT:000011': 'medical environment',
    'OBT:000012': 'microorganism associated habitat',
    'OBT:000013': 'natural environment habitat',
    'OBT:000014': 'part of living organism',
    'OBT:000015': 'phenotype wrt adhesion',
    'OBT:000017': 'phenotype wrt environment',
    'OBT:000019': 'phenotype wrt metabolic activity',
    'OBT:000020': 'phenotype wrt morphology',
    'OBT:000021': 'phenotype wrt motility',
    'OBT:000023': 'phenotype wrt stress',
    'OBT:000024': 'phenotype wrt genetic',
    'OBT:000025': 'physiological phenotype',
}
# Only the sous_class column is translated, so reference_id / predict_id keep
# their OBT IDs even when they equal one of the IDs above.
# IDs that are not in the mapping are left unchanged.
df = df.assign(sous_class=df['sous_class'].replace(SOUS_CLASS_TERMS))
df.head()

Unnamed: 0,name,match,score,type,word,reference,predict,reference_id,predict_id,reference_name,predict_name,error_type,word_len,sous_class
0,BB-norm-20580604,MM,0.0313,Habitat,[['bottle']],OBT:003146 (fermented milk),OBT:001456 (bottle),OBT:003146,OBT:001456,"[fermented, milk]",[bottle],Wrong class,1,food
1,BB-norm-25114119,MM,0.0369,Habitat,"[['human',, 'macrophages']]",OBT:002995 (macrophage),OBT:002488 (human),OBT:002995,OBT:002488,[macrophage],[human],Wrong class,2,part of living organism
2,BB-norm-11989773,MM,0.0445,Habitat,[['chicken']],OBT:002394 (chicken meat),OBT:003300 (poultry),OBT:002394,OBT:003300,"[chicken, meat]",[poultry],Wrong class,1,food
3,BB-norm-11989773,MM,0.0445,Habitat,[['chicken']],OBT:002394 (chicken meat),OBT:003300 (poultry),OBT:002394,OBT:003300,"[chicken, meat]",[poultry],Wrong class,1,food
4,BB-norm-10496597,MM,0.045,Habitat,"[['monoclonal',, 'B',, 'cells']]",OBT:001623 (lymphocyte),OBT:001480 (cheese),OBT:001623,OBT:001480,[lymphocyte],[cheese],Wrong class,3,part of living organism


In [169]:
# Write the annotated frame to a ';'-separated, UTF-8 encoded CSV file.
# NOTE(review): this is the same file name that the read_csv call near the top
# of the notebook loads, so a re-run reads back its own output — confirm that
# overwriting the input file is intended.
df.to_csv("contes_500_dev_annotated.csv", encoding= 'utf-8', sep=';')

## Error type Visualization 

In [191]:
# Non-null counts of every column per error type; the 'match' count is then
# replaced by its share of the total 'match' count, rounded to 3 decimals.
error_type = df.groupby('error_type').count().reset_index()
match_counts = error_type['match']
error_type['match'] = (match_counts / match_counts.sum()).round(3)
error_type

Unnamed: 0,error_type,name,match,score,type,word,reference,predict,reference_id,predict_id,reference_name,predict_name,word_len,sous_class
0,Correct,206,0.291,206,206,206,206,206,206,206,206,206,206,206
1,General,21,0.03,21,21,21,21,21,21,21,21,21,21,21
2,General1,138,0.195,138,138,138,138,138,138,138,138,138,138,138
3,General2,70,0.099,70,70,70,70,70,70,70,70,70,70,70
4,General3,37,0.052,37,37,37,37,37,37,37,37,37,37,37
5,Precise,14,0.02,14,14,14,14,14,14,14,14,14,14,14
6,Too General,27,0.038,27,27,27,27,27,27,27,27,27,27,27
7,Wrong class,147,0.208,147,147,147,147,147,147,147,147,147,147,147
8,Wrong type,25,0.035,25,25,25,25,25,25,25,25,25,25,25
9,Wrong2,2,0.003,2,2,2,2,2,2,2,2,2,2,2


In [192]:
def _rows_with_error_type(label):
    """Return the rows of ``df`` whose ``error_type`` equals ``label``."""
    return df.loc[df['error_type'] == label]


# One sub-frame per error type.
general = _rows_with_error_type('General')
general1 = _rows_with_error_type('General1')
general2 = _rows_with_error_type('General2')
general3 = _rows_with_error_type('General3')
too_general = _rows_with_error_type('Too General')
wrong_class = _rows_with_error_type('Wrong class')
wrong_type = _rows_with_error_type('Wrong type')
wrong2 = _rows_with_error_type('Wrong2')
wrong3 = _rows_with_error_type('Wrong3')
wrong4 = _rows_with_error_type('Wrong4')
precise = _rows_with_error_type('Precise')
correct = _rows_with_error_type('Correct')

In [193]:
# Pie chart of the error types; slice sizes are taken from the 'word' count column.
fig = px.pie(
    error_type,
    names='error_type',
    values='word',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

In [194]:
# Per-error-type summary of 'score': minimum, maximum, mean and standard deviation,
# computed in a single grouped aggregation.
score = df.groupby('error_type')['score'].agg(['min', 'max', 'mean', 'std'])
score

Unnamed: 0_level_0,min,max,mean,std
error_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Correct,1.0,1.0,1.0,0.0
General,0.4504,0.6903,0.582838,0.070421
General1,0.6987,0.8915,0.816835,0.044807
General2,0.5872,0.7898,0.67094,0.058714
General3,0.5037,0.7319,0.645086,0.069742
Precise,0.6084,0.8134,0.731536,0.055111
Too General,0.2842,0.4531,0.3777,0.040051
Wrong class,0.0313,0.6425,0.303066,0.206622
Wrong type,0.0161,0.2313,0.096644,0.057081
Wrong2,0.5175,0.6315,0.5745,0.08061


In [195]:
# display kde plot and rug plot (one dimensional scatter plot)
score_groups = [
    ('total', df['score']),
    ('wrong_type', wrong_type['score']),
    ('wrong_class', wrong_class['score']),
    ('general', general['score']),
    ('general1', general1['score']),
    ('general2', general2['score']),
    ('general3', general3['score']),
    ('precise', precise['score']),
    ('too general', too_general['score']),
    ('wrong2', wrong2['score']),
    ('wrong3', wrong3['score']),
    ('wrong4', wrong4['score']),
]
# A density curve cannot be estimated for a group with fewer than two distinct
# scores (for instance an error type with no rows in this run), so such groups
# are left out instead of making the whole plot fail.
plottable_groups = [
    (label, scores) for label, scores in score_groups if scores.nunique() >= 2
]
hist_data = [scores for _label, scores in plottable_groups]
group_labels = [label for label, _scores in plottable_groups]
fig = ff.create_distplot(hist_data, group_labels, bin_size=0)
fig.show()

In [175]:
# Average word_len per error type.
df.groupby('error_type')['word_len'].mean()

error_type
Correct        1.436893
General        2.190476
General1       2.152174
General2       1.928571
General3       2.675676
Precise        1.142857
Too General    4.333333
Wrong class    2.700680
Wrong type     2.320000
Wrong2         2.500000
Wrong3         1.333333
Wrong4         2.600000
Name: word_len, dtype: float64

In [176]:
# Non-null counts of every column for each (error_type, sous_class) combination.
df.groupby(['error_type','sous_class']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,name,match,score,type,word,reference,predict,reference_id,predict_id,reference_name,predict_name,word_len
error_type,sous_class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Correct,artificial environment,1,1,1,1,1,1,1,1,1,1,1,1
Correct,experimental medium,7,7,7,7,7,7,7,7,7,7,7,7
Correct,food,26,26,26,26,26,26,26,26,26,26,26,26
Correct,hysiological phenotype,1,1,1,1,1,1,1,1,1,1,1,1
Correct,living organism,66,66,66,66,66,66,66,66,66,66,66,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wrong4,experimental medium,1,1,1,1,1,1,1,1,1,1,1,1
Wrong4,food,2,2,2,2,2,2,2,2,2,2,2,2
Wrong4,living organism,3,3,3,3,3,3,3,3,3,3,3,3
Wrong4,natural environment habitat,1,1,1,1,1,1,1,1,1,1,1,1


In [197]:
def _count_per_sous_class(rows):
    """Return the non-null counts of every column of ``rows``, grouped by ``sous_class``."""
    return rows.groupby('sous_class').count()


# One count table per error type.
sous_class_correct = _count_per_sous_class(correct)
sous_class_general = _count_per_sous_class(general)
sous_class_general1 = _count_per_sous_class(general1)
sous_class_general2 = _count_per_sous_class(general2)
sous_class_general3 = _count_per_sous_class(general3)
sous_class_too_general = _count_per_sous_class(too_general)
sous_class_precise = _count_per_sous_class(precise)
sous_class_wrong_type = _count_per_sous_class(wrong_type)
sous_class_wrong_class = _count_per_sous_class(wrong_class)
sous_class_wrong2 = _count_per_sous_class(wrong2)
sous_class_wrong3 = _count_per_sous_class(wrong3)
sous_class_wrong4 = _count_per_sous_class(wrong4)

In [178]:
def plot_sous_class_pie(counts, title):
    """Show a pie chart of the per-sous_class counts of one error type.

    Parameters
    ----------
    counts : pandas.DataFrame
        Count table indexed by sous_class; slice sizes come from its 'name' column.
    title : str
        Figure title, so that the otherwise identical pies can be told apart.

    Returns
    -------
    The plotly figure that was displayed.
    """
    pie = px.pie(
        counts,
        values='name',
        names=counts.index,
        title=title,
        color_discrete_sequence=px.colors.sequential.RdBu,
    )
    pie.show()
    return pie


# One pie per error type, in the order of the former one-cell-per-figure layout.
for pie_title, sous_class_counts in [
    ('Correct', sous_class_correct),
    ('Precise', sous_class_precise),
    ('General', sous_class_general),
    ('General1', sous_class_general1),
    ('General2', sous_class_general2),
    ('General3', sous_class_general3),
    ('Too General', sous_class_too_general),
    ('Wrong type', sous_class_wrong_type),
    ('Wrong class', sous_class_wrong_class),
    ('Wrong2', sous_class_wrong2),
    ('Wrong3', sous_class_wrong3),
    ('Wrong4', sous_class_wrong4),
]:
    fig = plot_sous_class_pie(sous_class_counts, pie_title)