In [2]:
import pandas as pd
from pprint import pprint as pprint

def read_prefixes(filename):
    """ 
    :param filename: path to a CSV file with columns "ns", "uri"
    :return: 
    """
    return pd.read_csv(filename, header=0, names=["ns", "uri"])

def read_fp_distinct(filename):
    """ 
    :param filename: path to a CSV file with columns "s", "p", "o"
    :return: 
    """
    return pd.read_csv(filename, header=0, names=["s", "p", "o"])

def read_fp_count(filename):
    """ 
    :param filename: path to a CSV file with columns "s", "p", "o", "c"
    :return: 
    """
    return pd.read_csv(filename, header=0, names=["s", "p", "o", "c"])

def replace_ns(triples, ns_dataframe):
    """ 
    :param triples: a dataframe with columns (s,p,o)
    :param ns_dataframe: a dataframe with columns (ns,uri)
    :return: triples replaced with ns
    """
    d = dict(zip(ns_dataframe.uri, ns_dataframe.ns))
    return triples.replace(d, regex=True)

def df_to_set_of_tuples(df, list_of_desired_columns=["s", "p", "o"]):
    """ 
    :param df: the fingerprint dataframe 
    :param list_of_desired_columns: in case it is a fingerprint with counts then this is an option 
            to remove the count column, or select any combination of columns
    :return: set of tuples
    """
    return set([tuple(line) for line in df[list_of_desired_columns].values.tolist()])

def df_diff(alpha, beta):
    """ 
    provides the set difference between two dataframes
    :param alpha: first dataframe
    :param beta: second dataframe
    :return: (a^b, a - b, b - a)
    """
    a = df_to_set_of_tuples(alpha)
    b = df_to_set_of_tuples(beta)
    return a.intersection(b), a.difference(b), b.difference(a)


fp1 = read_fp_count("test/dataset_figerprint_for_count.rq_eurovoc44.log")
fp2 = read_fp_count("test/dataset_figerprint_for_count.rq_EV45OLD.log")
ns = read_prefixes("test/prefix.csv")

fp1 = replace_ns(fp1,ns)
fp2 = replace_ns(fp2,ns)

print "Done."

Done.


The next section defines functions generating parts of latex documents

In [8]:
from pylatex import Package
from pylatex import Section, Subsection, Document
from pylatex.base_classes import Environment
from pylatex.utils import NoEscape


class LandscapeEnvironment(Environment):
    _latex_name = 'landscape'
    packages = [Package('pdflscape')]

def diff_to_latex_section(tex_doc, alpha, alpha_description, beta, beta_description,
                          cols=["Subject", "Predicate", "Object"]):
    """ 
    :param tex_doc: the pylatex document
    :param alpha: the first fingerprint
    :param alpha_description: the first fingerprint description
    :param beta: the second fingerprint
    :param beta_description: the second fingerprint description
    :return: returns a section of latex document with deltas
    """
    cmn_s, adb_s, bda_s = df_diff(alpha, beta)

    cmn_df = pd.DataFrame(list(cmn_s), columns=cols, )
    cmn_df.sort_values(by=cols, inplace=True)

    adb_df = pd.DataFrame(list(adb_s), columns=cols, )
    adb_df.sort_values(by=cols, inplace=True)

    bda_df = pd.DataFrame(list(bda_s), columns=cols, )
    bda_df.sort_values(by=cols, inplace=True)

    ref_alpha = alpha_description["title"]
    ref_beta = beta_description["title"]

    section_title = 'Difference between ' + ref_alpha + ' and ' + ref_beta
    with tex_doc.create(LandscapeEnvironment()):
        with tex_doc.create(Section(section_title)):
            # tex_doc.append(alpha_description)
            # tex_doc.append(beta_description)
            with tex_doc.create(Subsection("Common parts")) as subsec:
                subsec.append("The table below represents the elements common to both datasets.")
                # with tex_doc.create(Table(position='H')) as tbl:
                tex_doc.append(NoEscape(cmn_df.to_latex(longtable=True, index=False)))

            with tex_doc.create(Subsection("Unique to " + ref_alpha)) as subsec:
                subsec.append(
                    "The table below represents the elements present in " + ref_alpha + " but missing in " + ref_beta + ".")
                # with tex_doc.create(Table(position='H')) as tbl:
                tex_doc.append(NoEscape(adb_df.to_latex(longtable=True, index=False)))

            with tex_doc.create(Subsection("Unique to " + ref_beta)) as subsec:
                subsec.append(
                    "The table below represents the elements present in " + ref_beta + " but missing in " + ref_alpha + ".")
                # with tex_doc.create(Table(position='H')) as tbl:
                tex_doc.append(NoEscape(bda_df.to_latex(longtable=True, index=False)))

                # diff_to_latex_table(fp1, "Bla Bla", fp2, "Blu blu")


configuration_dict = {
    "type": "difference between two dataset fingerprints",
    "alpha": {"title": "EuroVoc 4.4",
              "filename": "test/dataset_figerprint_for_count.rq_eurovoc44.log",
              "desc": "EuroVoc 4.4 was released a long time ago using EuroVoc Ontology"},
    "beta": {"title": "EuroVoc 4.5",
             "filename": "test/dataset_figerprint_for_count.rq_EV45OLD.log",
             "desc": "EuroVoc 4.5 was released in July 2016 with SKOS-AP-EU and then converted to fit also the old EuroVoc Ontology."},
}

def generate_document(filename, config=configuration_dict):
    """ 
    :param filename: filename for the tex document
    :param config: 
    :return: None
    """
    geometry_options = {
        "head": "40pt",
        "margin": "0.5in",
        "bottom": "0.6in",
        "includeheadfoot": True
    }
    doc = Document('basic', geometry_options=geometry_options)
    
    doc.packages.append(Package('longtable'))
    doc.packages.append(Package('booktabs'))
    doc.packages.append(Package('float'))
    doc.packages.append(Package('ltablex'))
    doc.packages.append(Package('pdflscape'))
    
    first_page = PageStyle("firstpage")
    
    
    # Add document title
    with first_page.create(Head("R")) as right_header:
        with right_header.create(MiniPage(width=NoEscape(r"0.49\textwidth"),
                                 pos='c', align='r')) as title_wrapper:
            title_wrapper.append(LargeText(bold("Bank Account Statement")))
            title_wrapper.append(LineBreak())
            title_wrapper.append(MediumText(bold("Date")))

    # Add footer
    with first_page.create(Foot("C")) as footer:
        message = "Important message please read"
        with footer.create(Tabularx(
                "X X X X",
                width_argument=NoEscape(r"\textwidth"))) as footer_table:

            footer_table.add_row(
                [MultiColumn(4, align='l', data=TextColor("blue", message))])
            footer_table.add_hline(color="blue")
            footer_table.add_empty_row()

            branch_address = MiniPage(
                width=NoEscape(r"0.25\textwidth"),
                pos='t')
            branch_address.append("960 - 22nd street east")
            branch_address.append("\n")
            branch_address.append("Saskatoon, SK")

            document_details = MiniPage(width=NoEscape(r"0.25\textwidth"),
                                        pos='t', align='r')
            document_details.append("1000")
            document_details.append(LineBreak())
            document_details.append(simple_page_number())

            footer_table.add_row([branch_address, branch_address,
                                  branch_address, document_details])

    doc.preamble.append(first_page)
    # End first page style
    


    diff_to_latex_section(doc, fp1, config["alpha"], fp2, config["beta"])
    doc.generate_tex(filepath=filename)
    doc.generate_pdf(clean_tex=False, filepath=filename)

generate_document("temp/diff_report", configuration_dict)

the next section generates the LaTex documents and compiles them to PDF

In [3]:
from df_io import read_prefixes, read_fp_spo_count, replace_ns, compile_tex_file_multipass
ns = read

read_fp_spo_count("test/test_fingerprint_spo.csv").replace_ns()

Unnamed: 0,stype,p,ootype,propType,scnt,ocnt,cnt,min_sp,max_sp,avg_sp
0,http://publications.europa.eu/ontology/authori...,http://publications.europa.eu/ontology/authori...,http://www.w3.org/2001/XMLSchema#string,data,1053,7,1053,1,1,1.000000
1,http://publications.europa.eu/ontology/authori...,http://publications.europa.eu/ontology/authori...,http://www.w3.org/2001/XMLSchema#string,data,1340,1116,1340,1,1,1.000000
2,http://publications.europa.eu/ontology/authori...,http://purl.org/dc/elements/1.1/source,http://www.w3.org/2001/XMLSchema#string,data,1340,7,1340,1,1,1.000000
3,http://publications.europa.eu/ontology/authori...,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://publications.europa.eu/ontology/authori...,object,1340,1,1340,1,1,1.000000
4,http://publications.europa.eu/ontology/euvoc#C...,http://publications.europa.eu/ontology/authori...,http://www.w3.org/2001/XMLSchema#string,data,9,9,9,1,1,1.000000
5,http://publications.europa.eu/ontology/euvoc#C...,http://publications.europa.eu/ontology/authori...,http://www.w3.org/2001/XMLSchema#string,data,9,9,9,1,1,1.000000
6,http://publications.europa.eu/ontology/euvoc#C...,http://publications.europa.eu/ontology/euvoc#s...,http://www.w3.org/2001/XMLSchema#date,data,9,3,9,1,1,1.000000
7,http://publications.europa.eu/ontology/euvoc#C...,http://publications.europa.eu/ontology/euvoc#s...,http://publications.europa.eu/ontology/euvoc#C...,object,9,1,9,1,1,1.000000
8,http://publications.europa.eu/ontology/euvoc#C...,http://publications.europa.eu/ontology/euvoc#s...,http://www.w3.org/2004/02/skos/core#Concept,object,9,1,9,1,1,1.000000
9,http://publications.europa.eu/ontology/euvoc#C...,http://publications.europa.eu/ontology/euvoc#x...,http://publications.europa.eu/ontology/euvoc#X...,object,9,18,18,2,2,2.000000
