This notebook compares how well the regex method and the Detectron method identify which patents contain drawings, measured against a ground-truth list.

In [1]:
import os
import pandas as pd

In [2]:
# Text files read line by line in the next cell, one list entry per line.
# Patents named in the ground-truth file are labelled 'yes' in the 'Truth' column.
ground_truth_path = 'lists_of_patents_with_figures/ground_truth/new_test.txt'
# Entries of the regex file are reduced to their last '/'-separated component,
# cut at the first '.', before being compared with patent numbers.
regex_method_path = 'lists_of_patents_with_figures/regex_method/new_test.txt'
# Entries of the Detectron file are used as-is.
detectron_method_path = 'lists_of_patents_with_figures/detectron_method/new_test.txt'

In [3]:
# Each list holds one entry per line of its file, with the newline removed.
with open(ground_truth_path, 'r') as f:
    ground_truth = [line.replace('\n', '') for line in f]

# Regex entries keep only the last '/'-separated component, cut at its first '.'.
with open(regex_method_path, 'r') as f:
    regex_method = [
        line.replace('\n', '').split('/')[-1].split('.')[0]
        for line in f
    ]

with open(detectron_method_path, 'r') as f:
    detectron_method = [line.replace('\n', '') for line in f]

# Every entry of the test directory is treated as a patent number.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
all_patents = os.listdir('/Volumes/Non-Backup_Files/GB-patents/seeded_data_for_model/new_test')

In [4]:
def get_label(row, list_, label):
    """
    tag a dataframe row with a 'yes'/'no' membership flag
    Args:
    row - the row to tag; it must contain a 'Patent Number' entry
    list_ - collection of patent numbers; membership gives 'yes', absence gives 'no'
    label - name of the column under which the flag is stored
    Returns:
    row - the same row object, with the flag stored under label
    """
    is_listed = row['Patent Number'] in list_
    row[label] = 'yes' if is_listed else 'no'
    return row

def get_real_label(row):
    """
    flag a row by whether its patent number is in the ground_truth list
    Args:
    row - dataframe row containing a 'Patent Number' entry
    Returns:
    the row with a 'Truth' column holding 'yes' or 'no'
    """
    truth_column = 'Truth'
    return get_label(row, ground_truth, truth_column)

def get_detectron_label(row):
    """
    flag a row by whether its patent number is in the detectron_method list
    Args:
    row - dataframe row containing a 'Patent Number' entry
    Returns:
    the row with an 'According to Detectron' column holding 'yes' or 'no'
    """
    detectron_column = 'According to Detectron'
    return get_label(row, detectron_method, detectron_column)

def get_regex_label(row):
    """
    flag a row by whether its patent number is in the regex_method list
    Args:
    row - dataframe row containing a 'Patent Number' entry
    Returns:
    the row with an 'According to regex method' column holding 'yes' or 'no'
    """
    regex_column = 'According to regex method'
    return get_label(row, regex_method, regex_column)

In [5]:
# One row per entry of the test directory listing.
patents = pd.DataFrame({'Patent Number': all_patents})
patents

Unnamed: 0,Patent Number
0,GB1000941A
1,GB1002311A
2,GB1002479A
3,GB1004509A
4,GB100619A
...,...
551,GB189920732A
552,GB189921838A
553,GB189925099A
554,GB189925140A


In [6]:
# Same frame as the previous cell, shown again before the label columns are added.
patents

Unnamed: 0,Patent Number
0,GB1000941A
1,GB1002311A
2,GB1002479A
3,GB1004509A
4,GB100619A
...,...
551,GB189920732A
552,GB189921838A
553,GB189925099A
554,GB189925140A


In [7]:
# Label every patent with each source's verdict in one vectorised pass:
# 'yes' when the patent number appears in that source's list, otherwise 'no'.
# Columns are added in the same order as the row-wise get_*_label helpers
# would add them: Truth, regex method, Detectron.
patent_numbers = patents['Patent Number']
yes_no = {True: 'yes', False: 'no'}
patents = patents.assign(**{
    'Truth': patent_numbers.isin(ground_truth).map(yes_no),
    'According to regex method': patent_numbers.isin(regex_method).map(yes_no),
    'According to Detectron': patent_numbers.isin(detectron_method).map(yes_no),
})

In [8]:
# Inspect the frame now that the Truth / regex / Detectron label columns are present.
patents

Unnamed: 0,Patent Number,Truth,According to regex method,According to Detectron
0,GB1000941A,yes,yes,yes
1,GB1002311A,yes,yes,yes
2,GB1002479A,yes,yes,yes
3,GB1004509A,yes,yes,yes
4,GB100619A,yes,yes,yes
...,...,...,...,...
551,GB189920732A,yes,yes,yes
552,GB189921838A,no,no,no
553,GB189925099A,no,no,no
554,GB189925140A,yes,yes,yes


In [9]:
# Rows split by what the regex method predicted.
yes_regex = patents.loc[patents['According to regex method'].eq('yes')]
no_regex = patents.loc[patents['According to regex method'].eq('no')]
# Rows split by what Detectron predicted.
yes_detectron = patents.loc[patents['According to Detectron'].eq('yes')]
no_detectron = patents.loc[patents['According to Detectron'].eq('no')]
# Rows split by the ground truth label.
yes_truth = patents.loc[patents['Truth'].eq('yes')]
no_truth = patents.loc[patents['Truth'].eq('no')]

In [95]:
# Detectron precision for the 'yes' class: of the patents Detectron labelled 'yes',
# the fraction whose Truth label is also 'yes'.
len(yes_detectron[yes_detectron['Truth'] == 'yes'])/len(yes_detectron)

0.9885844748858448

In [96]:
# Detectron negative predictive value: of the patents Detectron labelled 'no',
# the fraction whose Truth label is also 'no'.
len(no_detectron[no_detectron['Truth'] == 'no'])/len(no_detectron)

0.9915254237288136

In [97]:
# Regex precision for the 'yes' class: of the patents the regex method labelled 'yes',
# the fraction whose Truth label is also 'yes'.
len(yes_regex[yes_regex['Truth'] == 'yes'])/len(yes_regex)

0.980722891566265

In [98]:
# Regex negative predictive value: of the patents the regex method labelled 'no',
# the fraction whose Truth label is also 'no'.
len(no_regex[no_regex['Truth'] == 'no'])/len(no_regex)

0.8085106382978723

In [102]:
# Detectron recall (sensitivity): of the patents whose Truth label is 'yes',
# the fraction Detectron also labelled 'yes'.
len(yes_truth[yes_truth['According to Detectron'] == 'yes'])/len(yes_truth)

0.9976958525345622

In [101]:
# Regex recall (sensitivity): of the patents whose Truth label is 'yes',
# the fraction the regex method also labelled 'yes'.
len(yes_truth[yes_truth['According to regex method'] == 'yes'])/len(yes_truth)

0.9377880184331797

In [100]:
# Detectron specificity: of the patents whose Truth label is 'no',
# the fraction Detectron also labelled 'no'.
len(no_truth[no_truth['According to Detectron'] == 'no'])/len(no_truth)

0.9590163934426229

In [99]:
# Regex specificity: of the patents whose Truth label is 'no',
# the fraction the regex method also labelled 'no'.
len(no_truth[no_truth['According to regex method'] == 'no'])/len(no_truth)

0.9344262295081968

In [16]:
# Rows on which each method's label agrees with the ground truth, re-indexed from 0.
correct_class_detectron = (
    patents
    .loc[patents['According to Detectron'].eq(patents['Truth'])]
    .reset_index(drop=True)
)
correct_class_regex = (
    patents
    .loc[patents['According to regex method'].eq(patents['Truth'])]
    .reset_index(drop=True)
)

In [15]:
# Accuracy (not precision): share of all patents whose Detectron label
# equals the Truth label.
len(correct_class_detectron)/len(patents)

0.9892086330935251

In [17]:
# Accuracy (not precision): share of all patents whose regex-method label
# equals the Truth label.
len(correct_class_regex)/len(patents)

0.9370503597122302