In [None]:
import json
def process_jsonl_sentences(jsonl_file_path):
    """Print the 'text' field of every record in a JSON Lines file.

    Args:
        jsonl_file_path: Path to a file containing one JSON object per line.

    Blank (whitespace-only) lines are skipped, so a trailing newline or an
    empty separator line does not abort the run. A record without a 'text'
    key prints an empty line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which would garble non-ASCII text on some systems.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # nothing to parse on an empty line
            json_obj = json.loads(line)
            text_to_infer = json_obj.get('text', '')
            print(text_to_infer)
# Print the 'text' field of each record in output.jsonl (path is relative to the working directory).
process_jsonl_sentences('output.jsonl')

In [None]:
import re
from chained_classification.run_pipeline import run_pipeline as NERandREL
from class_recognition.class_recognition_pipeline import PipelineRunner as DOLCEAligner

def infer_years_for_dates(text):
    """Append an inferred year to every day-month date in ``text`` that lacks one.

    A date is a 1-2 digit day (optionally followed by st/nd/rd/th), a space and
    an English month name, optionally followed by a space and a four-digit
    year. Matching is case-insensitive.

    For a date without a year, the year of the last preceding dated mention is
    used; if there is none, the year of the first following dated mention is
    used. If no date in the text carries a year, the text is returned unchanged.

    Args:
        text: The input string.

    Returns:
        The text with " <year>" inserted directly after each year-less date
        for which a year could be inferred.
    """
    # Pattern to match dates, capturing the date part and the optional year
    date_pattern = re.compile(r'(\d{1,2}(?:st|nd|rd|th)?\s(?:January|February|March|April|May|June|July|August|September|October|November|December))(?:\s(\d{4}))?', re.I)

    # (end offset, explicit year or None) for every date, in order of appearance
    dates = [(match.end(), match.group(2)) for match in date_pattern.finditer(text)]

    # If no dates found, return the original text
    if not dates:
        return text

    # First decide what to insert, using offsets into the ORIGINAL text only.
    insertions = []
    for i, (end, year) in enumerate(dates):
        if year is not None:
            continue
        prev_years = [known for _, known in dates[:i] if known is not None]
        next_years = [known for _, known in dates[i + 1:] if known is not None]

        # Prefer the nearest preceding year, fall back to the nearest following one
        closest_year = prev_years[-1] if prev_years else (next_years[0] if next_years else None)
        if closest_year:
            insertions.append((end, closest_year))

    # Apply right-to-left: an insertion only shifts characters after it, so
    # the still-pending (earlier) offsets remain valid.
    for end, year in reversed(insertions):
        text = text[:end] + f" {year}" + text[end:]

    return text

def prepare_text_for_DOLCE_aligner(e, text):
    """Build the prompt string pairing an entity mention with its sentence.

    Args:
        e: Object exposing the mention string as ``e.text``.
        text: The sentence (or trimmed context) the mention appears in.

    Returns:
        "<mention> in the context of this sentence '<text>'".
    """
    mention = e.text
    return f"{mention} in the context of this sentence '{text}'"

def serialize_doc_with_relations(doc):
    """Serialize ``doc`` to a dict and attach its relations.

    The result is ``doc.to_json()`` extended with two keys:

    * ``'relations'`` - one dict per item of ``doc._.rel`` (empty list when
      ``doc._`` has no ``rel`` attribute), holding the entity indices
      (``dep``/``dest``), the entity texts looked up in ``doc.ents``
      (``dep_text``/``dest_text``) and the relation type (``rel``).
    * ``'classes'`` - always an empty dict here; callers fill it in.

    Args:
        doc: Document object providing ``to_json()``, ``ents`` and ``_``.

    Returns:
        The extended dict.
    """

    def _relation_record(link):
        # Resolve both endpoints first, then describe the link.
        dependent, destination = doc.ents[link.dep], doc.ents[link.dest]
        return {
            "dep_text": dependent.text,
            "dep": link.dep,
            "rel": link.relation,
            "dest_text": destination.text,
            "dest": link.dest,
        }

    payload = doc.to_json()
    payload['relations'] = []
    payload['classes'] = {}

    # Every relation is serialized; nothing is filtered by entity or class.
    if hasattr(doc._, 'rel'):
        payload['relations'] = [_relation_record(link) for link in doc._.rel]

    return payload

def trim_context(entity, context, percentage):
    """Return a window of words from ``context`` centred on ``entity``.

    Whitespace in both strings is collapsed to single spaces first. The
    entity is located with a case-insensitive literal search (first
    occurrence; no word-boundary check, so it may match inside a longer
    word). The window aims to keep ``percentage`` percent of the context's
    words, split evenly before and after the entity, and is extended on the
    opposite side when the entity sits at the very start or end.

    Args:
        entity: The mention to centre on.
        context: The sentence to trim.
        percentage: Share of the context's words to keep, in percent.

    Returns:
        The trimmed context, or the whitespace-normalized context unchanged
        when the entity is not found.
    """
    # Collapse runs of whitespace so character and word positions line up
    entity = " ".join(entity.split())
    context = " ".join(context.split())

    hit = re.search(re.escape(entity), context, re.IGNORECASE)
    if hit is None:
        return context

    tokens = context.split()
    n_tokens = len(tokens)
    entity_len = len(entity.split())

    # Position of the entity expressed in words rather than characters
    lead = len(re.findall(r'\S+', context[:hit.start()]))

    # Word budget for the window, and the share placed on each side
    budget = round(n_tokens * (percentage / 100))
    margin = budget // 2

    lo = max(0, lead - margin)
    hi = min(n_tokens, lead + entity_len + margin)

    # Window got clipped at a sentence edge: grow it on the other side
    if hi - lo < budget:
        if lo == 0:
            hi = min(lo + budget, n_tokens)
        elif hi == n_tokens:
            lo = max(0, hi - budget)

    return ' '.join(tokens[lo:hi])


# Manual smoke check of trim_context on a sample sentence.
entity2 = "lead vocalist"
context2 = "Anita Auglend is the lead vocalist of the gothic-doom metal band."
percentage2 = 60  # Share of the sentence's words to keep, in percent; adjust as needed
trimmed_context2 = trim_context(entity2, context2, percentage2)
# Print the trimmed window so it can be inspected by eye.
print(trimmed_context2)

In [None]:
import json

def process_text_to_json_v2(text_to_infer):
    """Run the extraction pipeline on one text and return a JSON-ready dict.

    Steps:
      1. Fill in missing years on dates (``infer_years_for_dates``).
      2. Run NER + relation extraction (``NERandREL``).
      3. Serialize the document with its relations
         (``serialize_doc_with_relations``).
      4. Append one record per entity to ``output_json["ents"]``; entities
         labelled ``CLASS`` are additionally classified with ``DOLCEAligner``
         and recorded under ``output_json["classes"]``.

    Args:
        text_to_infer: The raw input text.

    Returns:
        The dict produced by ``serialize_doc_with_relations`` with ``text``,
        ``ents`` and ``classes`` updated as described above.
    """
    text_to_infer = infer_years_for_dates(text_to_infer)
    doc = NERandREL(text=text_to_infer, config_path="./chained_classification/fewshot.cfg", examples_path="./chained_classification/examples.jsonl")

    # Use serialize_doc_with_relations to get initial structure including relations
    output_json = serialize_doc_with_relations(doc)

    # Update the text in the output_json
    output_json["text"] = text_to_infer

    # The serialized doc is not guaranteed to carry an "ents" list; create it
    # on demand instead of failing with KeyError on the first append.
    # NOTE(review): if the serializer already populated "ents", the records
    # below are appended after the existing ones - confirm that is intended.
    ents_out = output_json.setdefault("ents", [])

    # Built lazily on the first CLASS entity and then reused: its constructor
    # arguments are constant, so rebuilding it per entity is wasted work.
    # NOTE(review): assumes a runner can serve several run() calls - confirm.
    runner = None

    # Process entities and match the new format
    for index, entity in enumerate(doc.ents):
        ent_dict = {
            "start_char": entity.start_char,
            "end_char": entity.end_char,
            "label": entity.label_,
            "text": entity.text
        }

        # For CLASS entities, use DOLCEAligner to determine subclassOf information
        if entity.label_ == "CLASS":
            print(entity.text)
            trimmed_context = trim_context(entity.text, text_to_infer, 50)
            print(trimmed_context)
            text = prepare_text_for_DOLCE_aligner(entity, trimmed_context)
            if runner is None:
                runner = DOLCEAligner(config_path="./class_recognition/fewshot.cfg", examples_path="./class_recognition/examples.jsonl")
            dolce_doc = runner.run(text)

            # Keep only the categories with a strictly positive score
            labels_with_positive_scores = [label for label, score in dolce_doc.cats.items() if score > 0.0]

            # Add the subclassOf information to the ent_dict
            ent_dict["subClassOf"] = labels_with_positive_scores if labels_with_positive_scores else ["Unknown"]

            # Store class information separately; the entity index keeps keys
            # unique when the same class text occurs more than once.
            formatted_text = entity.text.replace(" ", "_")
            key = f"{formatted_text}_{index}"
            output_json['classes'][key] = {"labels": labels_with_positive_scores, "class": entity.text}

        # Append entity information to the ents list in output_json
        ents_out.append(ent_dict)

    # Note: Relations are already included in output_json from the serialize_doc_with_relations call

    return output_json



# text_to_infer = "Fut\u016bh al-Buld\u0101n is an Arabic book by Persian historian Ahmad Ibn Yahya al-Baladhuri. The work by which he is best known is the Kitab Futuh al-Buldan (\"Book of the Conquests of the Lands\"), edited by M. J. de Goeje as Liber expugnationis regionum (Leiden, 1870; Cairo, 1901)."
# processed_json = process_text_to_json_v2(text_to_infer)
# with open("results/doc_data_1.json", "w", encoding="utf-8") as f:
#     json.dump(processed_json, f, ensure_ascii=False, indent=2)

In [None]:
import json

def find_substring_position(main_string, substring):
    """Locate the first occurrence of ``substring`` in ``main_string``.

    Args:
        main_string: The string to search in.
        substring: The string to search for.

    Returns:
        ``(start, end)`` character offsets of the first occurrence, with
        ``end`` exclusive, or ``(None, None)`` when it does not occur.
    """
    begin = main_string.find(substring)
    if begin == -1:
        # Substring not found
        return None, None
    return begin, begin + len(substring)

def read_and_check_json_file(file_path):
    """Print recomputed vs. stored character offsets for each entity in a file.

    The file is parsed as a single JSON document with a ``'text'`` string and
    an ``'ents'`` list whose items carry ``'text'``, ``'start_char'`` and
    ``'end_char'``. For every entity whose text occurs in the document text,
    two lines are printed: ``<found start> <stored start>`` and
    ``<found end> <stored end>``. Entities whose text does not occur are
    skipped without output.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        record = json.load(handle)

    for ent in record['ents']:
        if ent['text'] not in record['text']:
            continue
        # Offsets of the first occurrence of the mention in the text
        found_start, found_end = find_substring_position(record['text'], ent['text'])
        stored_start = ent['start_char']
        stored_end = ent['end_char']
        print(found_start, stored_start)
        print(found_end, stored_end)


# Specify the path to your JSON file
# NOTE(review): the path has a .jsonl extension, but read_and_check_json_file parses the
# whole file with json.load, so this only works if the file holds a single JSON document - confirm.
json_file_path = 'examples_jsonl/adalgis.jsonl'

# Run the offset check on the specified file (prints found vs. stored entity offsets)
read_and_check_json_file(json_file_path)

In [None]:
# Sample sentence in which "American" occurs twice; look up the longer mention "American music".
text = "Aaron Copland was an American composer, composition teacher, writer, and later in his career a conductor of his own and other American music."
# Bare last expression of the cell: the notebook displays the returned (start, end) tuple.
find_substring_position(text, "American music")

In [None]:
import json

def parse_json_and_return_data(input_file, output_dir="results-GPT-4"):
    """Run the pipeline on every sentence of a JSON dataset and save the results.

    The input file must contain a JSON list of objects, each with a
    ``"sentence"`` key. Each sentence is passed to
    ``process_text_to_json_v2`` and the result is written to
    ``<output_dir>/doc_data_<index>.json``, where ``index`` is the item's
    position in the list.

    Args:
        input_file: Path to the UTF-8 encoded JSON dataset.
        output_dir: Directory for the result files. It is created (including
            parents) when missing. Defaults to ``"results-GPT-4"``.

    Returns:
        None. Despite the function's name, results are only written to disk.
    """
    import os  # local import: keeps this cell runnable on its own

    # Load everything up front so the input handle is closed before the
    # (potentially long) per-sentence processing starts.
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)  # Load the entire JSON file

    # Without this, the first write fails when the directory does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Assume data is a list of objects, each with a "sentence" key
    for index, item in enumerate(data):
        # Extract the sentence text
        sentence_text = item["sentence"]

        # Process the text
        processed_json = process_text_to_json_v2(sentence_text)

        # Construct the output filename
        output_filename = f"{output_dir}/doc_data_{index}.json"

        # Save the processed JSON to a file
        with open(output_filename, "w", encoding='utf-8') as out_file:
            json.dump(processed_json, out_file, ensure_ascii=False, indent=2)

        print(f"Processed and saved: {output_filename}")

# Example usage
# Adjust the input file path to your actual JSON file path
# Runs the pipeline on every sentence in the dataset and writes one result file per sentence.
input_file_path = 'oke2_eval_dataset.json'
parse_json_and_return_data(input_file_path)
