This notebook computes precision, recall, and F1 scores for title/prose (header) detection.

In [None]:
import os, fnmatch
import webbrowser

In [None]:
def find_files(pattern, path):
    """
    Search the directory tree rooted at ``path`` and collect every file
    whose bare filename matches the shell-style wildcard ``pattern``.

    Input: pattern (fnmatch-style pattern), path (directory to walk)
    Output: list of matching file paths, each one the filename joined
            onto the directory it was found in
    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(path)
        for filename in filenames
        if fnmatch.fnmatch(filename, pattern)
    ]

In [None]:
# Every 'dom_ind.headers' file found anywhere under the data directory.
semantic_header_files = find_files(pattern='dom_ind.headers', path=r'C:/UsablePrivacyPolicy/Data/Misc')

In [None]:
# Every 'gold.headers' file found anywhere under the data directory.
master_header_files = find_files(pattern='gold.headers', path=r'C:/UsablePrivacyPolicy/Data/Misc')

In [None]:
def extract_final_folder(absolute_path):
    """
    Return the name of the folder that directly contains the file, i.e.
    the text between the last two folder separators of ``absolute_path``.

    Both ``\\`` and ``/`` are accepted as separators, so paths that mix
    them (for example a forward-slash root joined with backslash
    sub-folders) and POSIX-style paths are handled as well.

    Ex - Input = C:\\UsablePrivacyPolicy\\Data\\OtherSites\\activision.com\\activision.txt
    Returns activision.com

    Input: absolute_path - path of a file
    Output: name of the containing folder, or '' when the path holds no
            folder separator at all
    """
    # Fold both separator styles into one so a single split rule applies.
    normalized = absolute_path.replace('/', '\\')

    parent, separator, _filename = normalized.rpartition('\\')
    if not separator:
        # A bare filename has no containing folder to report.
        return ''

    # The last component of the parent directory is the folder we want.
    return parent.rpartition('\\')[2]
    

In [None]:
def extract_headers(absolute_path):
    """
    Read the file at ``absolute_path`` and return its lines as a set of
    normalised headers.

    Each line is stripped, lower-cased and reduced to its alphanumeric
    characters. Lines that normalise to the empty string (blank or
    punctuation-only lines) are dropped: the empty string is a substring
    of every string, so keeping it would make it match any header in a
    substring comparison.

    Input: absolute_path - path of the text file to read
    Output: set of normalised, non-empty header strings
    """
    normalized_headers = set()
    with open(absolute_path) as f:
        for each_line in f:
            lowered = each_line.strip().lower()
            normalized = ''.join(ch for ch in lowered if ch.isalnum())
            if normalized:
                normalized_headers.add(normalized)

    return normalized_headers
     

In [None]:
def create_header_dict(header_file_list):
    """
    Build a lookup from containing-folder name to the headers read from
    each file in ``header_file_list``:
    {final_folder_name: headers returned by extract_headers}

    If two files share the same final folder name, the one appearing
    later in the list replaces the earlier entry.
    """
    return {
        extract_final_folder(header_file): extract_headers(header_file)
        for header_file in header_file_list
    }

In [None]:
# Headers read from the 'dom_ind.headers' files, keyed by containing folder.
dict_semantic_headers = create_header_dict(header_file_list=semantic_header_files)

In [None]:
# Headers read from the 'gold.headers' files, keyed by containing folder.
dict_master_headers = create_header_dict(header_file_list=master_header_files)

In [None]:
# Sanity check: both dictionaries should have the same number of entries.
assert len(dict_semantic_headers) == len(dict_master_headers), "Lengths don't match"

Next, compute precision and recall.

In [None]:
# The report file is opened here and stays open until the last cell
# writes the finished document to it and closes it.
results_file = open('results_headers_markers.html', mode='w')

# Opening markup: the document head plus the header row of the results
# table. Table rows are appended to this string by the cells below.
file_contents = (
    "<html>\n"
    "<head><title>Header Evaluation Results</title></head>\n"
    "<body><table style=\"width:40%\">\n"
    "  <tr>\n"
    "    <th>Website</th>\n"
    "    <th>Precision</th> \n"
    "    <th>Recall</th>\n"
    "    <th>F1</th>\n"
    "  </tr>"
)

# Closing markup, appended after the table has been closed.
close_html = "</body></html>"

In [None]:
# Overall scores; they keep these defaults until the totals are computed
# after the loop.
total_precision = 0
total_recall = 0

# Counts pooled over every evaluated website, used for the overall scores.
global_correct_headers = 0
global_predicted_headers = 0
global_correct_predicted_headers = 0

for website, master_header_list in dict_master_headers.items():

    # Websites without predicted headers are skipped. An explicit
    # membership test is used instead of a bare ``except:`` so that
    # unrelated errors (and Ctrl-C) are no longer silently swallowed.
    if website not in dict_semantic_headers:
        continue
    semantic_header_list = dict_semantic_headers[website]

    total_correct_headers = len(master_header_list)
    total_predicted_headers = len(semantic_header_list)

    # A gold header counts as found when it contains, or is contained in,
    # at least one predicted header. Each gold header counts at most once.
    total_correct_predicted_headers = sum(
        1
        for each_master_header in master_header_list
        if any(
            each_master_header in each_semantic_header
            or each_semantic_header in each_master_header
            for each_semantic_header in semantic_header_list
        )
    )

    # Several gold headers may match the same predicted header; cap the
    # count so precision can never exceed 1.
    total_correct_predicted_headers = min(
        total_correct_predicted_headers, total_predicted_headers
    )

    # Each score falls back to 0 when its denominator is 0.
    try:
        recall = total_correct_predicted_headers / total_correct_headers
    except ZeroDivisionError:
        recall = 0

    try:
        precision = total_correct_predicted_headers / total_predicted_headers
    except ZeroDivisionError:
        precision = 0

    try:
        f1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        f1 = 0

    global_correct_headers += total_correct_headers
    global_predicted_headers += total_predicted_headers
    global_correct_predicted_headers += total_correct_predicted_headers

    # One table row per website: name, precision, recall, F1.
    file_contents = file_contents + (
        "<tr><td>{0}</td><td>{1:.2f}</td><td>{2:.2f}</td><td>{3:.2f}</td></tr>"
        .format(website, precision, recall, f1)
    )

In [None]:
# Overall scores are computed from the counts pooled over all websites.
# Each falls back to 0 when its denominator is 0. The fallbacks assign
# the total_* variables themselves (they previously assigned an
# unrelated ``precision`` variable).
try:
    total_precision = global_correct_predicted_headers / global_predicted_headers
except ZeroDivisionError:
    total_precision = 0

try:
    total_recall = global_correct_predicted_headers / global_correct_headers
except ZeroDivisionError:
    total_recall = 0

# Guarded like the per-website F1: precision + recall is 0 when nothing
# matched (or nothing was evaluated).
try:
    total_f1 = 2 * total_precision * total_recall / (total_precision + total_recall)
except ZeroDivisionError:
    total_f1 = 0

print(global_correct_headers)
print(global_correct_predicted_headers)
print(global_predicted_headers)

# Final "Total" row, printed with four decimals.
file_contents = file_contents + (
    "<tr><td>Total</td><td>{0:.4f}</td><td>{1:.4f}</td><td>{2:.4f}</td></tr>"
    .format(total_precision, total_recall, total_f1)
)

file_contents = file_contents + "</table>" + close_html

# Close the report file even if the write fails.
try:
    results_file.write(file_contents)
finally:
    results_file.close()

webbrowser.open_new_tab('results_headers_markers.html')