# Entity highlighting

Given a CSV file of the form

```
origin,text,entity_ranges
0,"an example of highlighted text","[[3, 10], [14, 25]]"
```
or a JSON file of the form
```
[
  {
    "origin": 0,
    "text": "an example of highlighted text",
    "entity_ranges": [
      [3, 10],
      [14, 25]
    ]
  }
]
```

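you can transform the `text` of each row into highlighted HTML such as the following (the notebook additionally wraps it in a minimal `<html><body>` document):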

"an <mark><u><b>example</b></u></mark> of <mark><u><b>highlighted</b></u></mark> text"

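## NOTE: Ensure that the entity ranges do not overlap each other!

Each entity range is a `[start, end]` pair of character offsets into `text`; `start` is inclusive and `end` is exclusive.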

In [None]:
import ast
import pandas as pd

In [None]:
def apply_highlight(text, ranges):
    """Wrap the given character ranges of ``text`` in highlight tags.

    Args:
        text: The plain text to highlight.
        ranges: Iterable of ``[start, end]`` character offsets into ``text``.
            ``start`` is inclusive and ``end`` is exclusive. The ranges may be
            given in any order and may touch, but must not overlap.

    Returns:
        A small HTML document (as a string) whose body is ``text`` with every
        range wrapped in ``<mark><u><b>...</b></u></mark>``. The characters
        ``&``, ``<`` and ``>`` of ``text`` are HTML-escaped so that the data
        cannot break or inject markup.

    Raises:
        ValueError: If a range has a negative start, ends before it starts,
            or overlaps another range.
    """
    # Local import so the function does not depend on another notebook cell.
    from html import escape

    mark_start = "<mark><u><b>"
    mark_end = "</b></u></mark>"

    html_template = "<html>\n <body>\n  {}\n </body>\n</html>\n"

    # Walk the ranges left to right and collect the output pieces. Every
    # offset is applied to the untouched input string, so no index ever has
    # to be corrected for previously inserted tags.
    pieces = []
    cursor = 0  # first character of `text` that has not been emitted yet
    for start, end in sorted(ranges):
        if start < 0 or end < start:
            raise ValueError(f"invalid entity range: [{start}, {end}]")
        if start < cursor:
            raise ValueError(
                f"entity range [{start}, {end}] overlaps a previous range"
            )
        # quote=False: quotes are harmless in element content, leave them as is.
        pieces.append(escape(text[cursor:start], quote=False))
        pieces.append(mark_start)
        pieces.append(escape(text[start:end], quote=False))
        pieces.append(mark_end)
        cursor = end
    pieces.append(escape(text[cursor:], quote=False))

    return html_template.format("".join(pieces))

In [None]:
# CSV workflow: read the rows, render each one as highlighted HTML, and
# write the "origin" and "html" columns back out as CSV.
csv_input_path = "./example/input/example.csv"
csv_output_path = "./example/output/example.csv"

# The entity_ranges column holds text such as "[[3, 10], [14, 25]]";
# ast.literal_eval turns it back into nested lists.
data = pd.read_csv(csv_input_path, converters={"entity_ranges": ast.literal_eval})

# Build the column with a comprehension over the two input columns instead
# of a row-wise DataFrame.apply: on a file with a header but no rows, apply
# hands back a DataFrame rather than a Series, which cannot be assigned to a
# single column. The comprehension simply yields an empty column there.
data["html"] = [
    apply_highlight(text, ranges)
    for text, ranges in zip(data["text"], data["entity_ranges"])
]
data[["origin", "html"]].to_csv(csv_output_path, index=False)

In [None]:
# JSON workflow: load the records, render each one as highlighted HTML, and
# save the "origin" and "html" fields as a list of records.
json_input_path = "./example/input/example.json"
json_output_path = "./example/output/example.json"

data = pd.read_json(json_input_path)


def _render_row(row):
    """Return the highlighted HTML document for one record of the table."""
    return apply_highlight(row["text"], row["entity_ranges"])


# axis="columns" passes one row at a time to the function.
data["html"] = data.apply(_render_row, axis="columns")

highlighted = data[["origin", "html"]]
highlighted.to_json(json_output_path, orient="records", indent=2)

In [None]:
# Visualization: show the raw markup of the first document, then render it.
import IPython.display

first_document = data["html"][0]
print(first_document)
# Keep this expression last so the notebook displays the rendered HTML.
IPython.display.HTML(first_document)