# Identifying and Merging Relevant Fields
In our specific use case, the AgendaItem is the main index for our Elasticsearch data. Our goal is to merge data from the Meetings, People, Files, and Organizations JSON objects into the AgendaItem index. Below is sample code for the pre-processing steps that are useful for this process: \
Note: this pre-processing code is for reference only; adapt it to your own use case.

In [None]:
# We start with the meetings data (this cell works on the copy meetings_data_copy.json).
# Remove the top-level fields we do not want ('type', 'agendaItem', 'verbatimProtocol', 'invitation', 'created', 'modified').
import json

# Load the JSON data from the file
file_path = '/Users/ameerkhan/Downloads/Oparl_Files/meetings_data_copy.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Function to remove specified fields from each meeting
def remove_fields(json_data, fields_to_remove):
    """Strip unwanted top-level keys from every meeting, in place.

    Args:
        json_data: Iterable of meeting dicts; each dict is mutated.
        fields_to_remove: Key names to drop. Keys that a meeting does not
            have are skipped.

    Returns:
        None; the input is modified in place.
    """
    for record in json_data:
        for key in fields_to_remove:
            # pop() with a default is a no-op when the key is absent.
            record.pop(key, None)

# Specify the fields to be removed from the JSON data
fields_to_remove = ['type', 'agendaItem', 'verbatimProtocol', 'invitation','created', 'modified']

# Call the function to remove the specified fields
remove_fields(data, fields_to_remove)

# Save the modified JSON data back to the file
with open(file_path, 'w') as file:
    json.dump(data, file, indent=2)

print("Specified fields removed successfully.")

In [None]:
# Remove fields from a nested object. For example, remove 'type', 'created', and 'modified' from the 'location' field of each meeting.

import json

# Load the JSON data from the file
file_path = '/Users/ameerkhan/Downloads/Oparl_Files/meetings_data_copy.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Function to remove specified fields from the "location" field inside each meeting
def remove_fields_from_location(json_data, fields_to_remove):
    """Drop the given keys from each meeting's nested "location" dict, in place.

    Meetings that have no "location" key, or whose "location" is not a dict,
    are left untouched.

    Args:
        json_data: Iterable of meeting dicts.
        fields_to_remove: Key names to delete from the nested dict.

    Returns:
        None; the input is modified in place.
    """
    for record in json_data:
        if 'location' not in record:
            continue
        nested = record['location']
        if not isinstance(nested, dict):
            continue
        for key in fields_to_remove:
            nested.pop(key, None)

# Specify the fields to be removed from the "location" field.
# 'type' was previously misspelled as 'tyep'; because absent keys are skipped
# silently, the 'type' field was never actually removed.
fields_to_remove = ['type', 'created', 'modified']

# Call the function to remove the specified fields from the "location" field
remove_fields_from_location(data, fields_to_remove)

# Save the modified JSON data back to the file
with open(file_path, 'w') as file:
    json.dump(data, file, indent=2)

print("Specified fields removed successfully from the 'location' field.")


In [None]:
# Flatten the nested 'location' field of each meeting into top-level fields.

import json

# Load the JSON data from the file
file_path = '/Users/ameerkhan/Downloads/Oparl_Files/meetings_data_copy.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Function to modify the "location" field inside each meeting
def modify_location(json_data):
    """Flatten each meeting's nested "location" dict into top-level keys.

    For every meeting whose "location" is a dict, selected location values
    are copied onto the meeting (None when the location lacks the key) and
    the "location" key is then deleted. Other meetings are left untouched.

    Args:
        json_data: Iterable of meeting dicts; each dict is mutated.

    Returns:
        None; the input is modified in place.
    """
    # (flattened key on the meeting, source key inside "location")
    key_pairs = (
        ('meeting_location_id', 'id'),
        ('meeting_streetAddress', 'streetAddress'),
        ('meeting_room', 'room'),
        # NOTE: this key is spelled 'meting_' (sic). reorder_fields later in
        # this notebook reads exactly this spelling, so it is kept as-is.
        ('meting_postalCode', 'postalCode'),
        ('meeting_subLocality', 'subLocality'),
        ('meeting_locality', 'locality'),
    )
    for record in json_data:
        if 'location' not in record or not isinstance(record['location'], dict):
            continue
        nested = record['location']
        for flat_key, source_key in key_pairs:
            record[flat_key] = nested.get(source_key)
        del record['location']

# Call the function to modify the "location" field
modify_location(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/updated_meetings.json'

# Save the modified JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(data, new_file, indent=2)

print("Modified JSON data saved to the new file.")


In [None]:
# Work with the organizations data: keep only 'id' and 'name', rename them to 'organisation_id' and 'organisation_name', and store them in a new JSON file so that they can be merged into the meetings data later.

import json

# Load the JSON data from the "organizations_data.json" file
file_path = '/Users/ameerkhan/Downloads/Oparl_Files/organizations_data.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Create a new list to store the modified data
modified_data = []
for item in data:
    # Extract "id" and "name" fields and rename them
    organisation_id = item['id']
    organisation_name = item['name']
    
    # Create a new dictionary with the modified fields
    modified_item = {
        "organisation_id": organisation_id,
        "organisation_name": organisation_name
    }
    
    # Append the modified data to the new list
    modified_data.append(modified_item)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/new_organisation_data.json'

# Save the modified data to the new JSON file
with open(new_file_path, 'w') as new_file:
    json.dump(modified_data, new_file, indent=2)

print("Modified JSON data saved to the new file.")


In [None]:
# function to replace the "organization" field in meetings.json file.

import json

# Load the modified JSON data from the "new_json_file"
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/updated_meetings.json'
with open(new_file_path, 'r') as file:
    new_data = json.load(file)

# Load the "new_organisation_data.json" file with the required fields (id and name)
new_organisation_data_path = '/Users/ameerkhan/Downloads/Oparl_Files/new_organisation_data.json'
with open(new_organisation_data_path, 'r') as file:
    organisation_data = json.load(file)

# Create a dictionary mapping organisation IDs to organisation names
organisation_map = {item['organisation_id']: item['organisation_name'] for item in organisation_data}

# Function to replace the "organization" field with "organisation_id" and "organisation_name"
def replace_organization(json_data, organisation_lookup=None):
    """Replace each item's "organization" list with id and name lists, in place.

    The values found under "organization" are stored unchanged under
    "organisation_id"; "organisation_name" holds the looked-up name for each
    of them (None where the lookup has no entry). Items without an
    "organization" key, or with a null one, get two empty lists instead of
    raising KeyError/TypeError.

    Args:
        json_data: Iterable of dicts; each dict is mutated.
        organisation_lookup: Optional mapping from organisation id to name.
            When omitted, the module-level ``organisation_map`` is used.

    Returns:
        None; the input is modified in place.
    """
    if organisation_lookup is None:
        organisation_lookup = organisation_map
    for item in json_data:
        # pop() both reads and removes the key, and tolerates its absence;
        # `or []` additionally covers an explicit null value.
        org_urls = item.pop('organization', None) or []
        item['organisation_id'] = org_urls
        item['organisation_name'] = [organisation_lookup.get(org_url) for org_url in org_urls]

# Call the function to replace the "organization" field
replace_organization(new_data)

# Specify the path for the new JSON file
new_file_path_with_organisation = '/Users/ameerkhan/Downloads/Oparl_Files/meeting_location_organisation.json'

# Save the modified JSON data with the replaced "organization" field to the new file
with open(new_file_path_with_organisation, 'w') as new_file:
    json.dump(new_data, new_file, indent=2)

print("Modified JSON data with replaced organisation field saved to the new file.")


In [None]:
# it came as a list, changing them into single values

import json

# Load the modified JSON data from the input file
input_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/meeting_location_organisation.json'
with open(input_file_path, 'r') as file:
    data = json.load(file)

# Function to convert the list of organisation IDs and organisation names to single values
def convert_to_single_value(json_data):
    """Collapse the organisation id/name lists to their first element, in place.

    For each item, "organisation_id" and "organisation_name" are replaced by
    the first entry of their current value, or None when the value is empty
    or the key is missing. Any further entries are discarded.

    Args:
        json_data: Iterable of dicts; each dict is mutated.

    Returns:
        None; the input is modified in place.
    """
    for record in json_data:
        for key in ('organisation_id', 'organisation_name'):
            values = record.get(key, [])
            record[key] = values[0] if values else None

# Call the function to convert to single values
convert_to_single_value(data)

# Specify the path for the new JSON file
output_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/finetuned_meeting_location_organisation.json'

# Save the modified JSON data to the new file
with open(output_file_path, 'w') as output_file:
    json.dump(data, output_file, indent=2)

print("Modified JSON data with single values for organisation_id and organisation_name saved to the new file.")

In [None]:
# Working with the people data
import json

# Load the people_data.json file
people_data_path = '/Users/ameerkhan/Downloads/Oparl_Files/people_data.json'
with open(people_data_path, 'r') as file:
    data = json.load(file)

# Function to extract the desired fields and change their names
def extract_and_rename_fields(json_data):
    """Build a reduced copy of each person with ``person_``-prefixed keys.

    Only the listed source keys are carried over; a key that a person does
    not have is emitted as an empty string. The input is not modified.

    Args:
        json_data: Iterable of person dicts.

    Returns:
        A new list with one dict per input person.
    """
    # (output key, source key) in the order they appear in the output.
    key_pairs = (
        ('person_id', 'id'),
        ('person_name', 'name'),
        ('person_formOfAddress', 'formOfAddress'),
        ('person_gender', 'gender'),
        ('person_web', 'web'),
        ('person_email', 'email'),
        ('person_phone', 'phone'),
    )
    return [
        {target: person.get(source, '') for target, source in key_pairs}
        for person in json_data
    ]

# Call the function to extract and rename the fields
new_data = extract_and_rename_fields(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/updated_people_data.json'

# Save the new JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(new_data, new_file, indent=2)

print("New JSON file with the desired fields and renamed names created.")


In [None]:
import json

# Load the JSON file containing the data with participant IDs
data_path = '/Users/ameerkhan/Downloads/Oparl_Files/finetuned_meeting_location_organisation.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Load the "updated_people_data.json" file containing the people data
people_path = '/Users/ameerkhan/Downloads/Oparl_Files/updated_people_data.json'
with open(people_path, 'r') as file:
    people_data = json.load(file)

# Create a dictionary to map participant IDs to their content
people_dict = {person['person_id']: person for person in people_data}

# Function to replace participant IDs with their content from people_data
def replace_participant_with_content(json_data):
    """Expand each item's participant ids into the matching person records.

    Every id under "participant" is looked up in the module-level
    ``people_dict``; ids without an entry become an empty dict. Items with
    no "participant" key end up with an empty list.

    Args:
        json_data: Iterable of dicts; each dict is mutated.

    Returns:
        The same ``json_data`` object that was passed in.
    """
    for record in json_data:
        resolved = []
        for participant_id in record.get('participant', []):
            resolved.append(people_dict.get(participant_id, {}))
        record['participant'] = resolved
    return json_data

# Call the function to replace the participant IDs with their content
new_data = replace_participant_with_content(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/finetuned_meeting_location_organisation_participant.json'

# Save the new JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(new_data, new_file, indent=2)

print("New JSON file with replaced participant content created.")


# The code below is only for my own reference; it performs various pre-processing operations to bring the JSON files into the required shape.

In [None]:
import json

# Load the JSON file containing the data
data_path = '/Users/ameerkhan/Downloads/Oparl_Files/finetuned_meeting_location_organisation_participant.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Function to modify the "auxiliaryFile" and "resultsProtocol" fields
def modify_files(json_data):
    """Reduce "auxiliaryFile" and "resultsProtocol" to id, name and download URL.

    Each entry of "auxiliaryFile" is replaced by a dict with the keys
    ``auxiliaryfile_id``, ``auxiliaryfile_name`` and
    ``auxiliaryfile_downloadurl``. A non-empty "resultsProtocol" is replaced
    by the equivalent ``results_protocol_*`` dict. A value that is missing
    from a file entry is stored as None instead of raising KeyError, and a
    missing or null "auxiliaryFile" becomes an empty list.

    Assumes file entries are dicts (embedded objects) — TODO confirm that the
    data never carries plain URL strings here.

    Args:
        json_data: Iterable of dicts; each dict is mutated.

    Returns:
        The same ``json_data`` object that was passed in.
    """
    for item in json_data:
        # `or []` covers both a missing key and an explicit null.
        auxiliary_files = item.get('auxiliaryFile') or []
        item['auxiliaryFile'] = [
            {
                'auxiliaryfile_id': aux.get('id'),
                'auxiliaryfile_name': aux.get('name'),
                'auxiliaryfile_downloadurl': aux.get('downloadUrl'),
            }
            for aux in auxiliary_files
        ]

        # An absent or empty protocol is left exactly as it was.
        results_protocol = item.get('resultsProtocol') or {}
        if results_protocol:
            item['resultsProtocol'] = {
                'results_protocol_id': results_protocol.get('id'),
                'results_protocol_name': results_protocol.get('name'),
                'results_protocol_downloadurl': results_protocol.get('downloadUrl'),
            }
    return json_data

# Call the function to modify the "auxiliaryFile" and "resultsProtocol" fields
modified_data = modify_files(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/finetuned_meeting_location_organisation_participant_files.json'

# Save the modified JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(modified_data, new_file, indent=2)

print("New JSON file with modified 'auxiliaryFile' and 'resultsProtocol' fields created.")


In [None]:
import json
import requests

# Load the JSON file containing the data
data_path = '/Users/ameerkhan/Downloads/Oparl_Files/finetuned_meeting_location_organisation_participant_files.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Function to get content from the extraction web service
def get_content_from_url(url, timeout=30):
    """Fetch the extracted text for ``url`` from the extraction web service.

    The URL is percent-encoded and passed as the ``uri`` query parameter.
    On any failure (empty URL, connection error or timeout, non-200 status,
    body that is not valid JSON) a message is printed where applicable and
    an empty string is returned, so one bad file does not abort the run.

    Args:
        url: Download URL of the document to extract.
        timeout: Seconds to wait for the service before giving up. Without
            a timeout a stalled connection would block indefinitely.

    Returns:
        The value of ``m_resourceContent`` in the JSON response, or ``''``.
    """
    if not url:
        # Nothing to fetch; quote() would raise on None.
        return ''
    encoded_url = requests.utils.quote(url, safe='')
    api_url = f'http://pc-4301.kl.dfki.de:8080/ExtractionWebService/aloeExtractionHandler/getMetadataFromURIAndExtractContent?uri={encoded_url}'
    try:
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to retrieve content for URL: {url} ({error})")
        return ''
    if response.status_code != 200:
        print(f"Failed to retrieve content for URL: {url}")
        return ''
    try:
        payload = response.json()
    except ValueError:
        # The service answered 200 but the body was not JSON.
        print(f"Failed to retrieve content for URL: {url}")
        return ''
    return payload.get('m_resourceContent', '')

# Function to modify the "auxiliaryFile" and "resultsProtocol" fields
def modify_files(json_data, fetch_content=None):
    """Replace file download URLs with the fetched file content.

    Each "auxiliaryFile" entry becomes a dict with ``auxiliaryfile_id``,
    ``auxiliaryfile_name`` and ``auxiliaryfile_content``; a non-empty
    "resultsProtocol" becomes the equivalent ``results_protocol_*`` dict.
    Entries without a download URL get an empty string as content instead
    of raising, and no fetch is attempted for them.

    Args:
        json_data: Iterable of dicts; each dict is mutated.
        fetch_content: Optional callable taking a URL and returning its
            content. When omitted, the module-level ``get_content_from_url``
            is used.

    Returns:
        The same ``json_data`` object that was passed in.
    """
    if fetch_content is None:
        fetch_content = get_content_from_url

    for item in json_data:
        enriched = []
        for aux in item.get('auxiliaryFile') or []:
            download_url = aux.get('auxiliaryfile_downloadurl')
            enriched.append({
                'auxiliaryfile_id': aux.get('auxiliaryfile_id'),
                'auxiliaryfile_name': aux.get('auxiliaryfile_name'),
                'auxiliaryfile_content': fetch_content(download_url) if download_url else '',
            })
        item['auxiliaryFile'] = enriched

        # An absent or empty protocol is left exactly as it was.
        results_protocol = item.get('resultsProtocol') or {}
        if results_protocol:
            download_url = results_protocol.get('results_protocol_downloadurl')
            item['resultsProtocol'] = {
                'results_protocol_id': results_protocol.get('results_protocol_id'),
                'results_protocol_name': results_protocol.get('results_protocol_name'),
                'results_protocol_content': fetch_content(download_url) if download_url else '',
            }
    return json_data

# Call the function to modify the "auxiliaryFile" and "resultsProtocol" fields
modified_data = modify_files(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/finetuned_meeting_location_organisation_participant_files_downloaded.json'

# Save the modified JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(modified_data, new_file, indent=2)

print("New JSON file with modified 'auxiliaryFile' and 'resultsProtocol' fields created.")


In [None]:
import json

# Load the JSON file containing the data
data_path = '/Users/ameerkhan/Downloads/Oparl_Files/agendaitems_data.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Function to remove specified fields from each item in the data
def remove_fields(json_data):
    """Drop the agenda-item keys that are not needed downstream, in place.

    Keys that an item does not have are ignored.

    Args:
        json_data: Iterable of agenda-item dicts; each dict is mutated.

    Returns:
        The same ``json_data`` object that was passed in.
    """
    unwanted = ('type', 'number', 'order', 'public', 'consultation', 'auxiliaryFile')
    for record in json_data:
        for key in unwanted:
            record.pop(key, None)
    return json_data

# Call the function to remove specified fields
modified_data = remove_fields(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/final_editing_agendaitems_data.json'

# Save the modified JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(modified_data, new_file, indent=2)

print("New JSON file with specified fields removed created.")


In [None]:
import json

# Load the JSON file containing the data
data_path = '/Users/ameerkhan/Downloads/Oparl_Files/final_editing_agendaitems_data.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Function to rename keys in the data
def rename_keys(json_data):
    """Rename "id"/"name" to "agendaitem_id"/"agendaitem_name", in place.

    A source key that is missing yields None under the new key.

    Args:
        json_data: Iterable of agenda-item dicts; each dict is mutated.

    Returns:
        The same ``json_data`` object that was passed in.
    """
    renames = (('id', 'agendaitem_id'), ('name', 'agendaitem_name'))
    for record in json_data:
        for old_key, new_key in renames:
            record[new_key] = record.pop(old_key, None)
    return json_data

# Call the function to rename keys
modified_data = rename_keys(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/final_agendaitems.json'

# Save the modified JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(modified_data, new_file, indent=2)

print("New JSON file with renamed keys created.")


In [None]:
import json

# Load the JSON file containing the data
data_path = '/Users/ameerkhan/Downloads/Oparl_Files/finetuned_meeting_location_organisation_participant_files_downloaded.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Function to rename fields in the data
def rename_fields(json_data):
    """Prefix the meeting's "id", "name", "start" and "end" keys with "meeting_".

    The rename happens in place; a source key that is missing yields None
    under the new key.

    Args:
        json_data: Iterable of meeting dicts; each dict is mutated.

    Returns:
        The same ``json_data`` object that was passed in.
    """
    renames = (
        ('id', 'meeting_id'),
        ('name', 'meeting_name'),
        ('start', 'meeting_start'),
        ('end', 'meeting_end'),
    )
    for record in json_data:
        for old_key, new_key in renames:
            record[new_key] = record.pop(old_key, None)
    return json_data

# Call the function to rename fields
modified_data = rename_fields(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/renamed_finetuned_meeting_location_organisation_participant_files_downloaded.json'

# Save the modified JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(modified_data, new_file, indent=2)

print("New JSON file with renamed fields created.")


In [None]:
import json

# Load the JSON file containing the data
data_path = '/Users/ameerkhan/Downloads/Oparl_Files/renamed_finetuned_meeting_location_organisation_participant_files_downloaded.json'
with open(data_path, 'r') as file:
    data = json.load(file)

# Function to reorder fields in the data
def reorder_fields(json_data):
    """Return new meeting dicts containing a fixed set of keys in a fixed order.

    Keys not listed here are dropped; listed keys that a meeting lacks get an
    empty default ("" for scalars, [] or {} for the container fields). The
    input dicts are not modified.

    The postal code is emitted as "meeting_postalCode", which is the key the
    agenda-item merge step in this notebook reads. The misspelled
    "meting_postalCode" written by the location-flattening step is still
    accepted as input so the value is not lost.

    Args:
        json_data: Iterable of meeting dicts.

    Returns:
        A new list with one reordered dict per input meeting.
    """
    modified_data = []
    for item in json_data:
        # Prefer the correctly spelled key, fall back to the legacy spelling.
        postal_code = item.get("meeting_postalCode", item.get("meting_postalCode", ""))
        modified_item = {
            "meeting_id": item.get("meeting_id", ""),
            "meeting_name": item.get("meeting_name", ""),
            "meeting_start": item.get("meeting_start", ""),
            "meeting_end": item.get("meeting_end", ""),
            "meeting_location_id": item.get("meeting_location_id", ""),
            "meeting_streetAddress": item.get("meeting_streetAddress", ""),
            "meeting_room": item.get("meeting_room", ""),
            "meeting_postalCode": postal_code,
            "meeting_subLocality": item.get("meeting_subLocality", ""),
            "meeting_locality": item.get("meeting_locality", ""),
            "participant": item.get("participant", []),
            "auxiliaryFile": item.get("auxiliaryFile", []),
            "resultsProtocol": item.get("resultsProtocol", {}),
            "organisation_id": item.get("organisation_id", ""),
            "organisation_name": item.get("organisation_name", ""),
        }
        modified_data.append(modified_item)
    return modified_data

# Call the function to reorder fields
modified_data = reorder_fields(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/new.json'

# Save the modified JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(modified_data, new_file, indent=2)

print("New JSON file with reordered fields created.")


In [None]:
#results protocol is added

import json

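# Load the agenda items (subset) JSON file
# NOTE: no cell shown in this notebook writes 'reordered_final_agendaitems.json'; the agenda-item cells write 'final_agendaitems.json'. Confirm where this file comes from.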
data_path = '/Users/ameerkhan/Downloads/Oparl_Files/reordered_final_agendaitems.json'
with open(data_path, 'r') as file:
    data = json.load(file)

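# Load the meetings data JSON file
# NOTE: no cell shown in this notebook writes this 'reordered_renamed_...' file; the reorder cell writes 'new.json'. Confirm where this file comes from.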
meetings_data_path = '/Users/ameerkhan/Downloads/Oparl_Files/reordered_renamed_finetuned_meeting_location_organisation_participant_files_downloaded.json'
with open(meetings_data_path, 'r') as file:
    meetings_data = json.load(file)

# Create a dictionary to map meeting_id to meeting data
meeting_dict = {meeting["meeting_id"]: meeting for meeting in meetings_data}

# Function to replace the meeting field with meeting data
def replace_meeting_field(json_data, meeting_lookup=None):
    """Merge each agenda item with the data of the meeting it refers to.

    The item's "meeting" value is used as the lookup key. Items whose meeting
    is not in the lookup (or that have no "meeting" value) are omitted from
    the result. Fields that are missing get an empty default.

    The postal code is read from "meeting_postalCode" and, failing that, from
    the misspelled "meting_postalCode" that the location-flattening step in
    this notebook writes; previously only the first key was read, so the
    postal code always came out empty.

    Args:
        json_data: Iterable of agenda-item dicts (not modified).
        meeting_lookup: Optional mapping from meeting id to meeting dict.
            When omitted, the module-level ``meeting_dict`` is used.

    Returns:
        A new list of merged dicts, one per agenda item with a known meeting.
    """
    if meeting_lookup is None:
        meeting_lookup = meeting_dict

    modified_data = []
    for item in json_data:
        meeting_id = item.get("meeting")
        if meeting_id not in meeting_lookup:
            continue
        meeting_data = meeting_lookup[meeting_id]
        # Prefer the correctly spelled key, fall back to the legacy spelling.
        postal_code = meeting_data.get(
            "meeting_postalCode", meeting_data.get("meting_postalCode", "")
        )
        modified_data.append({
            "agendaitem_id": item.get("agendaitem_id", ""),
            "agendaitem_name": item.get("agendaitem_name", ""),
            "meeting_id": meeting_data.get("meeting_id", ""),
            "meeting_name": meeting_data.get("meeting_name", ""),
            "meeting_start": meeting_data.get("meeting_start", ""),
            "meeting_end": meeting_data.get("meeting_end", ""),
            "meeting_location_id": meeting_data.get("meeting_location_id", ""),
            "meeting_streetAddress": meeting_data.get("meeting_streetAddress", ""),
            "meeting_room": meeting_data.get("meeting_room", ""),
            "meeting_postalCode": postal_code,
            "meeting_subLocality": meeting_data.get("meeting_subLocality", ""),
            "meeting_locality": meeting_data.get("meeting_locality", ""),
            "participant": meeting_data.get("participant", []),
            "auxiliaryFile": meeting_data.get("auxiliaryFile", []),
            "resultsProtocol": meeting_data.get("resultsProtocol", {}),
            "organisation_id": meeting_data.get("organisation_id", ""),
            "organisation_name": meeting_data.get("organisation_name", ""),
        })
    return modified_data

# Call the function to replace the meeting field
modified_data = replace_meeting_field(data)

# Specify the path for the new JSON file
new_file_path = '/Users/ameerkhan/Downloads/Oparl_Files/file_with_resultprotocol_elasticsearch.json'

# Save the modified JSON data to the new file
with open(new_file_path, 'w') as new_file:
    json.dump(modified_data, new_file, indent=2)

print("New JSON file with changes created.")


In [None]:
import json

# Replace 'input_file.json' with the name of your original JSON file
input_file = '/Users/ameerkhan/Downloads/Oparl_Files/file_with_resultprotocol_elasticsearch.json'
bulk_output_file = '/Users/ameerkhan/Downloads/Oparl_Files/file_in_elasticsearch_format.json'

# Read the merged agenda-item documents.
with open(input_file, 'r') as reader:
    data = json.load(reader)

# Write the documents as action/source line pairs: one metadata line naming
# the target index and document id, followed by the document itself.
with open(bulk_output_file, 'w') as writer:
    for document in data:
        action = {"index": {"_index": "agendaitem_content", "_id": document["agendaitem_id"]}}
        writer.write(json.dumps(action) + '\n')
        # The id already travels in the metadata line, so drop it from the body.
        document.pop("agendaitem_id")
        writer.write(json.dumps(document) + '\n')
    # Finish the file with one additional newline after the last document.
    writer.write('\n')