# Analysing SNOMED annotations from the MedCAT output


In [None]:
# Import packages
import json
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from medcat.cdb import CDB
import os
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
import statsmodels
from IPython.display import display, HTML
from datetime import datetime

# Initialise plotly's notebook mode so that plotly figures can be rendered inside this notebook
py.offline.init_notebook_mode(connected=True)

# IPython magic: draw matplotlib figures inline in the notebook output
%matplotlib inline

## Load Concept database (CDB) used for the project

In [15]:
# Location of the saved SNOMED concept database
snomed_cdb_path = os.path.join("/Volumes/NO NAME/", "snomed.dat")  # path to concept database

# Start from an empty CDB and fill it from the file above
cdb = CDB()
cdb.load_dict(snomed_cdb_path)

## Load MedCAT output

In [16]:
file_path = r"/Volumes/NO NAME/"  # Add file path
file = r"Epilepsy_MedCAT_Export_With_Text_2020-01-28_12_20_08.json"  # Add file name

# Build the path with os.path.join so it is valid whether or not file_path ends with a
# separator, and read the JSON export as UTF-8 explicitly instead of relying on the
# platform default encoding (which is not UTF-8 on every system, e.g. Windows).
with open(os.path.join(file_path, file), encoding="utf-8") as f:
    data = json.load(f)

print("The number of projects is:", len(data['projects']))  # Number of projects

The number of projects is: 1


In [17]:
# Flatten the documents of every project into one list, then into doc_df (one row per document)
all_documents = [document for project in data['projects'] for document in project['documents']]
doc_df = pd.DataFrame(all_documents)
print("The number of documents is", len(doc_df['id']))  # number of documents
doc_df['last_modified'] = pd.to_datetime(doc_df['last_modified'])

# Flatten the annotations of every document into ann_df (one row per annotation)
all_annotations = [annotation for document in all_documents for annotation in document['annotations']]
ann_df = pd.DataFrame(all_annotations)
ann_df['last_modified'] = pd.to_datetime(ann_df['last_modified'])

The number of documents is 114


### Write to CSV

In [None]:
# Write document DF to CSV: the user supplies the file name (without the ".csv" extension)
file_name = input("Enter file name here: ")
file_path = r"C:\Users\k1767582\Documents\GitHub\Epilepsy-project/"
doc_csv_target = "{}{}.csv".format(file_path, file_name)
doc_df.to_csv(doc_csv_target)

In [None]:
# Write annotation DF to CSV: the user supplies the file name (without the ".csv" extension)
file_name = input("Enter file name here: ")
file_path = r"C:\Users\k1767582\Documents\GitHub\Epilepsy-project/"
ann_csv_target = "{}{}.csv".format(file_path, file_name)
ann_df.to_csv(ann_csv_target)

## User Statistics
-------


In [18]:
# Per-annotation flags used to break the annotations down by outcome
deleted_mask = ann_df['deleted']
alternative_mask = ann_df['alternative']
manual_mask = ann_df['manually_created']

# Counted as correct: neither deleted, nor replaced by an alternative, nor manually created
untouched_mask = ~deleted_mask & ~alternative_mask & ~manual_mask

print("The number of correct annotations is", len(ann_df[untouched_mask]))  # Correct
print("The number of deleted annotations is", len(ann_df[deleted_mask]))  # Deleted
print("The number of alternative concepts are", len(ann_df[alternative_mask]))  # Alternatives
print("The number of annotations added", len(ann_df[manual_mask]))  # Add annotation

# Annotator performance in the exercise: one row per user
work_per_user = ann_df.groupby('user').agg(
    {'validated':'count', 'correct': 'sum', 'alternative': 'sum', 'manually_created': 'sum', 'deleted':'sum', 'killed':'sum'}
)
display("The work each user has done is as follows:", work_per_user)

The number of correct annotations is 1876
The number of deleted annotations is 106
The number of alternative concepts are 74
The number of annotations added 882


'The work each user has done is as follows:'

Unnamed: 0_level_0,validated,correct,alternative,manually_created,deleted,killed
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Anthony,416,199.0,22.0,154.0,9.0,47.0
Eabela,1,0.0,0.0,1.0,0.0,0.0
Ebruno,1303,890.0,21.0,304.0,48.0,42.0
Pviana,1200,630.0,31.0,423.0,49.0,71.0


## Required Functions

In [19]:
def concept_count(df, concepts_freq=10):
    """
    Plot how often each SNOMED concept (CUI) occurs in the MedCAT output as a bar chart.

    The annotations are grouped by 'cui', counted through the non-null entries of the
    'acc' column, sorted in descending order and restricted to concepts whose count is
    at least ``concepts_freq``. The chart is written to a file named
    "<YYYYmmddHHMMSS>_Concept_count.html" (plotly is asked to open it via auto_open=True).

    Note: a later cell of this notebook defines ``concept_count`` again with a more
    elaborate layout; whichever cell was executed last is the definition in effect.

    :param df: The DataFrame containing the medcat output to analyse, Use the ann_df.
    :param concepts_freq: Only concepts mentioned at least this many times are plotted (default 10).
    :return: None. The plot is saved as an HTML file rather than returned.
    """
    # Count the annotations of each CUI, most frequent first
    counts = df.groupby('cui').count()
    counts = counts.sort_values(by='acc', ascending=False).reset_index()

    # Filter df by top concept frequency
    counts = counts[counts['acc'] >= concepts_freq].copy()

    # Convert cui to pretty name. A CUI that is missing from the CDB keeps its raw
    # identifier as label instead of aborting the whole plot with a KeyError.
    cui_to_name = cdb.cui2pretty_name
    counts["Concept_name"] = [cui_to_name.get(cui, cui) for cui in counts["cui"]]

    # Plot bar plot of snomed concept frequency
    layout = go.Layout(
        title="Count of SNOMED concepts >= {}".format(concepts_freq),
        yaxis=dict(
            title='Total Concept Count'
        ),
    )
    fig = go.Figure(data=[go.Bar(x=counts["Concept_name"], y=counts["acc"])], layout=layout)

    # Filename with datetime, so repeated runs do not overwrite earlier figures
    filename = datetime.now().strftime("%Y%m%d%H%M%S") + "_Concept_count.html"

    # Save figure
    py.offline.plot(fig, filename=filename, auto_open=True)  # Saves figure to current_time + "_Concept_count.html"
    return

In [40]:
def concept_count(df, concepts_freq=10):
    """
    Plot how often each SNOMED concept (CUI) occurs in the MedCAT output as a bar chart.

    The annotations are grouped by 'cui', counted through the non-null entries of the
    'acc' column, sorted in descending order and restricted to concepts whose count is
    at least ``concepts_freq``. The chart is written to a file named
    "<YYYYmmddHHMMSS>_Concept_count.html" (plotly is asked to open it via auto_open=True).

    :param df: The DataFrame containing the medcat output to analyse, Use the ann_df.
    :param concepts_freq: Only concepts mentioned at least this many times are plotted (default 10).
    :return: None. The plot is saved as an HTML file rather than returned.
    """
    # Shared font settings for the two axis titles
    axis_title_font = dict(
        family='Courier New, monospace',
        size=22,
        color='#000000'
    )

    # Count the annotations of each CUI, most frequent first
    counts = df.groupby('cui').count()
    counts = counts.sort_values(by='acc', ascending=False).reset_index()

    # Filter df by top concept frequency
    counts = counts[counts['acc'] >= concepts_freq].copy()

    # Convert cui to pretty name. A CUI that is missing from the CDB keeps its raw
    # identifier as label instead of aborting the whole plot with a KeyError.
    cui_to_name = cdb.cui2pretty_name
    counts["Concept_name"] = [cui_to_name.get(cui, cui) for cui in counts["cui"]]

    # Plot Layout
    layout = go.Layout(
        title=go.layout.Title(
            text="Count of SNOMED concepts >= {}".format(concepts_freq),
            xref='paper',
            font=dict(
                family='Courier New, monospace',
                size=25,
                color='#000000'
            )
        ),
        xaxis=go.layout.XAxis(
            title=go.layout.xaxis.Title(
                text='Concept',
                font=axis_title_font
            )
        ),
        yaxis=go.layout.YAxis(
            title=go.layout.yaxis.Title(
                text='Concept Frequency',
                font=axis_title_font
            )
        )
    )

    # Plot bar plot of snomed concept frequency
    fig = go.Figure(data=[go.Bar(x=counts["Concept_name"], y=counts["acc"])], layout=layout)

    # Filename with datetime, so repeated runs do not overwrite earlier figures
    filename = datetime.now().strftime("%Y%m%d%H%M%S") + "_Concept_count.html"

    # Save figure
    py.offline.plot(fig, filename=filename, auto_open=True)  # Saves figure to current_time + "_Concept_count.html"
    return

In [36]:
def medcat_lr(df, top_freq_concepts=None):
    """
    Display accuracy tables and plot the per-document accuracy ("learning rate") of MedCAT.

    The annotations of every document are flattened into one table holding the document
    id plus each annotation's 'correct' and 'value' entries. From that table the function
    displays (1) the accuracy per annotated value, (2) the values whose accuracy is 0 and
    (3) the accuracy per document. The per-document accuracy is then drawn as a scatter
    plot with an OLS trendline and written to "<YYYYmmddHHMMSS>_MedCAT_LR.html".

    :param df: Use the doc_df. Each row is read through its 'id' and 'annotations' entries;
        the annotations are read through their 'correct' and 'value' entries.
    :param top_freq_concepts: Filter by the top frequency concepts. Not implemented yet:
        the argument is currently ignored.
    :return: None. The tables are shown with display() and the plot is saved as an HTML file.
    """
    # TODO create a top_freq_concepts option
    doc_id = []
    no_correct = []
    value = []
    for _, doc in df.iterrows():
        annotations = pd.DataFrame(list(doc['annotations']))
        if annotations.empty:
            # A document without annotations contributes no rows
            continue
        # Take whole columns at once instead of walking the annotations row by row
        doc_id.extend([doc['id']] * len(annotations))
        no_correct.extend(annotations["correct"].tolist())
        value.extend(annotations["value"].tolist())
    summary_df = pd.DataFrame({"doc_id": doc_id, "correct": no_correct, "value": value})

    # Calculate accuracy value of each grouped synonym
    by_name = summary_df.groupby(['value']) \
        .agg({'doc_id': 'count', 'correct': 'sum'}) \
        .rename(columns={'doc_id': 'Value count', 'correct': 'Correct sum'})
    by_name['Percent Acc'] = by_name['Correct sum'] / by_name['Value count'] * 100
    display(by_name)
    # TODO test if working
    display(by_name[by_name['Percent Acc'] == 0].sort_values(by=['Value count'], ascending=False))

    # Calculate accuracy per doc
    accuracy_by_doc = summary_df.groupby(["doc_id"]).agg({'correct': 'sum', 'value': 'count'}) \
        .reset_index() \
        .rename(columns={'correct': 'Correct sum', 'value': 'Value count'})
    accuracy_by_doc.index = accuracy_by_doc.index + 1  # shift index +1

    accuracy_by_doc['Percent Acc'] = accuracy_by_doc['Correct sum'] / accuracy_by_doc['Value count'] * 100
    # Keep only documents with at least 10 annotations ...
    enough_annotations = accuracy_by_doc['Value count'] >= 10
    # ... and drop erroneous documents whose accuracy is below 1%
    plausible_accuracy = accuracy_by_doc['Percent Acc'] >= 1
    # .copy() gives an independent frame, so adding a column further down does not
    # write into a slice of the unfiltered table (SettingWithCopyWarning)
    accuracy_by_doc = accuracy_by_doc[enough_annotations & plausible_accuracy].copy()
    display(accuracy_by_doc)

    # Plot plotly
    accuracy_by_doc['Document Number'] = accuracy_by_doc.index
    y = accuracy_by_doc['Percent Acc']

    fig = px.scatter(accuracy_by_doc, x='Document Number', y=y,
                     size=accuracy_by_doc['Value count'], size_max=10,
                     title="MedCAT Learning Rate",
                     trendline="ols"
                     )

    # Update layout (this also sets the title text that ends up on the figure)
    fig.update(layout=dict(
        title="MedCAT Learning rate",
        yaxis=dict(
            title='% Confirmed Accurate'),
        xaxis=dict(
            title='Document Number')
        )
    )
    fig.update_yaxes(range=[0, 110])

    # Filename with datetime, so repeated runs do not overwrite earlier figures
    filename = datetime.now().strftime("%Y%m%d%H%M%S") + "_MedCAT_LR.html"

    # Save figure
    py.offline.plot(fig, filename=filename, auto_open=True)  # Saves figure to current_time + "_MedCAT_LR.html"
    return


## Produce Figures
-------------------


### Concept Count plot

In [41]:
# Bar chart of concept frequencies over all annotations, using the default threshold (concepts_freq=10)
concept_count(ann_df)  # produce a plot of the number of SNOMED concepts within the documents

### MedCAT learning rate plot

In [22]:
# Display the accuracy tables and plot the per-document accuracy for all loaded documents
medcat_lr(doc_df)

Unnamed: 0_level_0,Value count,Correct sum,Percent Acc
value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amlodipine,1,0.0,0.000000
Penicillin,1,0.0,0.000000
Phenytoin,1,0.0,0.000000
fibromyalgia,1,0.0,0.000000
hippocampal temporal sclerosis,1,0.0,0.000000
...,...,...,...
word finding difficulties,2,2.0,100.000000
word-finding difficulties,1,1.0,100.000000
worried,3,2.0,66.666667
worry,1,0.0,0.000000


Unnamed: 0_level_0,Value count,Correct sum,Percent Acc
value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Topiramate,24,0.0,0.0
EEG,18,0.0,0.0
Keppra,11,0.0,0.0
follow up,10,0.0,0.0
Follow-up,8,0.0,0.0
...,...,...,...
burning smells,1,0.0,0.0
burns on the face,1,0.0,0.0
calceos,1,0.0,0.0
cannabis,1,0.0,0.0


Unnamed: 0,doc_id,Correct sum,Value count,Percent Acc
1,34042,16.0,27,59.259259
2,34043,34.0,106,32.075472
3,34044,14.0,42,33.333333
4,34045,19.0,36,52.777778
5,34046,10.0,15,66.666667
...,...,...,...,...
109,34151,32.0,41,78.048780
110,34152,16.0,22,72.727273
111,34153,31.0,34,91.176471
112,34154,47.0,56,83.928571
