# In [8]:
import os
import json
import csv

def extract_context_sentences(utterances, target_index, context_size=5):
    """Return the utterances within ``context_size`` positions of the target.

    The window runs from ``target_index - context_size`` up to and including
    ``target_index + context_size``. The start is clamped to 0 and the end to
    ``len(utterances)``, so the window is shorter near either end.
    """
    window_start = target_index - context_size
    if window_start < 0:
        window_start = 0

    window_stop = target_index + context_size + 1
    if window_stop > len(utterances):
        window_stop = len(utterances)

    return utterances[window_start:window_stop]

def extract_metadata_titles(folder_path):
    """Find every utterance whose form contains '^^' in a folder of JSON files.

    Only files whose name ends with '.json' are read, and only the first
    entry of each file's 'document' list is inspected.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A tuple of two parallel lists with one entry per matching utterance:
        ``(filename, title, utterance_index, utterances)`` tuples, and the
        parsed JSON object of the file each match came from.
    """
    metadata_titles_with_caret = []
    metadata_titles_data = []

    # os.listdir() returns names in arbitrary order; sorting keeps the output
    # order identical from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # A missing, null or empty 'document' list is treated as a document
        # without utterances instead of raising on the [0] lookup.
        documents = data.get('document') or [{}]
        utterances = documents[0].get('utterance') or []

        for i, utterance in enumerate(utterances):
            if '^^' in (utterance.get('form') or ''):
                metadata_title = data.get('metadata', {}).get('title', '')
                metadata_titles_with_caret.append((filename, metadata_title, i, utterances))
                metadata_titles_data.append(data)

    return metadata_titles_with_caret, metadata_titles_data

def create_csv_with_context_sentences(folder_path, metadata_titles_with_caret, metadata_titles_data, csv_file):
    """Write the context window of every matched utterance to a CSV file.

    For each matched utterance, the surrounding utterances returned by
    ``extract_context_sentences`` are written one per row, together with the
    document's setting/topic metadata and the age, occupation and sex of the
    speaker of that row.

    Args:
        folder_path: Unused; kept so existing callers keep working.
        metadata_titles_with_caret: ``(filename, title, utterance_index,
            utterances)`` tuples, one per matched utterance.
        metadata_titles_data: Parsed JSON objects, parallel to the list above.
        csv_file: Path of the CSV file to create (overwritten if it exists).
    """
    header = [
        'setting_relation', 'setting_intimacy', 'setting_contact_frequency',
        'document_topic', 'sentence_utterance_id', 'speaker_id',
        'utterance_form', 'sentence_time', 'speaker_age',
        'speaker_occupation', 'speaker_sex',
    ]

    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for (_filename, _title, target_index, utterances), data in zip(metadata_titles_with_caret, metadata_titles_data):
            # Only the first entry of 'document' carries the metadata used here.
            document = (data.get('document') or [{}])[0]
            metadata = document.get('metadata', {})
            setting = metadata.get('setting', {})
            speakers = metadata.get('speaker', [])

            document_columns = [
                setting.get('relation', ''),
                setting.get('intimacy', ''),
                setting.get('contact_frequency', ''),
                metadata.get('topic', ''),
            ]

            for sentence in extract_context_sentences(utterances, target_index):
                # Speaker details are looked up per row because the speaker
                # can change from one context sentence to the next.
                speaker_id = sentence.get('speaker_id', '')
                speaker_info = next((speaker for speaker in speakers if speaker.get('id') == speaker_id), {})
                writer.writerow(document_columns + [
                    sentence.get('id', ''),  # id of this context sentence, not of the target
                    speaker_id,
                    sentence.get('form', ''),
                    sentence.get('time', ''),
                    speaker_info.get('age', ''),
                    speaker_info.get('occupation', ''),
                    speaker_info.get('sex', ''),
                ])
                
                
# Folder that contains the JSON files to scan.
folder_path = "C:\\Users\\ArumPark\\Desktop\\Projekt_Hamburg\\NIKL_MESSENGER_v2.0\\국립국어원 메신저 말뭉치(버전 2.0)"

# Collect the metadata_title entries for every utterance_form that contains '^^'.
metadata_titles_with_caret, metadata_titles_data = extract_metadata_titles(folder_path)  # modified part

# Path of the CSV file to write.
csv_file_path = './output2.csv'

# Build the CSV only from the JSON files that have an utterance_form containing '^^'.
create_csv_with_context_sentences(folder_path, metadata_titles_with_caret, metadata_titles_data, csv_file_path)  # modified part


# In [4]:
import os
import json
import csv
import pandas as pd

def extract_context_sentences(utterances, target_index, context_size=5):
    """Slice out the target utterance together with its neighbours.

    Up to ``context_size`` utterances before and after ``target_index`` are
    included; the slice start is clamped to 0 and the end to
    ``len(utterances)``.
    """
    lower = max(target_index - context_size, 0)
    upper = min(target_index + context_size + 1, len(utterances))
    return utterances[lower:upper]

def extract_metadata_titles(folder_path, target_utterance_ids):
    """Find the utterances with one of the given ids in a folder of JSON files.

    Only files whose name ends with '.json' are read, and only the first
    entry of each file's 'document' list is inspected.

    Args:
        folder_path: Directory to scan (not recursive).
        target_utterance_ids: Collection of utterance ids to look for.

    Returns:
        A tuple of two parallel lists with one entry per matching utterance:
        ``(filename, title, utterance_index, utterances)`` tuples, and the
        parsed JSON object of the file each match came from.
    """
    # Membership is tested once per utterance, so turn a list/tuple of ids
    # into a set up front. Other container types are used as given.
    wanted_ids = target_utterance_ids
    if isinstance(target_utterance_ids, (list, tuple)):
        try:
            wanted_ids = set(target_utterance_ids)
        except TypeError:
            # Unhashable entries cannot go into a set; keep the sequence.
            wanted_ids = target_utterance_ids

    metadata_titles_with_caret = []
    metadata_titles_data = []

    # os.listdir() returns names in arbitrary order; sorting keeps the output
    # order identical from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # A missing, null or empty 'document' list is treated as a document
        # without utterances instead of raising on the [0] lookup.
        documents = data.get('document') or [{}]
        utterances = documents[0].get('utterance') or []

        for i, utterance in enumerate(utterances):
            if utterance.get('id', '') in wanted_ids:
                metadata_title = data.get('metadata', {}).get('title', '')
                metadata_titles_with_caret.append((filename, metadata_title, i, utterances))
                metadata_titles_data.append(data)

    return metadata_titles_with_caret, metadata_titles_data

def create_csv_with_context_sentences(folder_path, metadata_titles_with_caret, metadata_titles_data, csv_file):
    """Write the context window of every matched utterance to a CSV file.

    For each matched utterance, the surrounding utterances returned by
    ``extract_context_sentences`` are written one per row, together with the
    document's setting/topic metadata and the details of that row's speaker.
    The file is written as UTF-8 with a byte-order mark ('utf-8-sig').

    Args:
        folder_path: Unused; kept so existing callers keep working.
        metadata_titles_with_caret: ``(filename, title, utterance_index,
            utterances)`` tuples, one per matched utterance.
        metadata_titles_data: Parsed JSON objects, parallel to the list above.
        csv_file: Path of the CSV file to create (overwritten if it exists).
    """
    header = [
        'setting_relation', 'setting_intimacy', 'setting_contact_frequency',
        'document_topic', 'sentence_utterance_id', 'speaker_id',
        'utterance_form', 'sentence_time', 'speaker_age',
        'speaker_occupation', 'speaker_sex', 'speaker_birthplace',
        'speaker_principal_residence', 'speaker_current_residence',
        'speaker_device', 'speaker_keyboard',
    ]
    # Speaker fields in output-column order.
    # NOTE(review): 'pricipal_residence' is kept exactly as the original code
    # spells it; presumably it mirrors the field name in the data - confirm
    # before changing it.
    speaker_keys = (
        'age', 'occupation', 'sex', 'birthplace', 'pricipal_residence',
        'current_residence', 'device', 'keyboard',
    )

    with open(csv_file, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for (_filename, _title, target_index, utterances), data in zip(metadata_titles_with_caret, metadata_titles_data):
            # Only the first entry of 'document' carries the metadata used here.
            document = (data.get('document') or [{}])[0]
            metadata = document.get('metadata', {})
            setting = metadata.get('setting', {})
            speakers = metadata.get('speaker', [])

            document_columns = [
                setting.get('relation', ''),
                setting.get('intimacy', ''),
                setting.get('contact_frequency', ''),
                metadata.get('topic', ''),
            ]

            for sentence in extract_context_sentences(utterances, target_index):
                # Speaker details are looked up per row because the speaker
                # can change from one context sentence to the next.
                speaker_id = sentence.get('speaker_id', '')
                speaker_info = next((speaker for speaker in speakers if speaker.get('id') == speaker_id), {})
                sentence_columns = [
                    sentence.get('id', ''),
                    speaker_id,
                    sentence.get('form', ''),
                    sentence.get('time', ''),
                ]
                speaker_columns = [speaker_info.get(key, '') for key in speaker_keys]
                writer.writerow(document_columns + sentence_columns + speaker_columns)

# Folder that contains the JSON files to scan.
folder_path = "C:\\Users\\ArumPark\\Desktop\\Projekt_Hamburg\\NIKL_MESSENGER_v2.0\\국립국어원 메신저 말뭉치(버전 2.0)"

def get_utterance_ids(file_path, sheet_name, column_name):
    """Read one column of an Excel sheet and return its values as a list.

    Any exception raised while reading the file or selecting the column is
    printed as ``Error: <exception>`` and an empty list is returned instead.
    """
    try:
        sheet = pd.read_excel(file_path, sheet_name=sheet_name)
        ids = sheet[column_name].tolist()
    except Exception as err:
        print("Error:", err)
        ids = []
    return ids

# Adjust the file path and column name as needed.
file_path = "smiliey_new_annotation_file_20chats_each_110624.xlsx"
sheet_name = "annotation"
column_name = "utterance_id"

utterance_ids = get_utterance_ids(file_path, sheet_name, column_name)

# List of utterance_ids to extract.
target_utterance_ids = utterance_ids  # add the utterance_ids you need here

# Collect the metadata_title entries for the documents that contain the given utterance_ids.
metadata_titles_with_caret, metadata_titles_data = extract_metadata_titles(folder_path, target_utterance_ids)

# Path of the CSV file to write.
csv_file_path = 'C:/Users/ArumPark/Desktop/Disputation/emoticon/data/new_annotation_context_test3.csv'

# Build the CSV only from the JSON files that contain one of the given utterance_ids.
create_csv_with_context_sentences(folder_path, metadata_titles_with_caret, metadata_titles_data, csv_file_path)


# In [6]:
# Do not limit context_size

import os
import json
import pandas as pd

def extract_metadata_titles(folder_path, target_utterance_ids):
    """Locate utterances by id across the JSON files of a folder.

    Only files whose name ends with '.json' are read, and only the first
    entry of each file's 'document' list is inspected.

    Args:
        folder_path: Directory to scan (not recursive).
        target_utterance_ids: Collection of utterance ids to look for.

    Returns:
        A tuple of two parallel lists with one entry per matching utterance:
        ``(filename, title, utterance_index, utterances)`` tuples, and the
        parsed JSON object of the file each match came from.
    """
    # One membership test runs per utterance, so a list/tuple of ids is
    # converted to a set once. Other container types are used as given.
    id_lookup = target_utterance_ids
    if isinstance(target_utterance_ids, (list, tuple)):
        try:
            id_lookup = set(target_utterance_ids)
        except TypeError:
            # Unhashable entries cannot go into a set; keep the sequence.
            id_lookup = target_utterance_ids

    metadata_titles_with_caret = []
    metadata_titles_data = []

    # Sorted so the result order does not depend on the arbitrary order in
    # which os.listdir() returns names.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Treat a missing, null or empty 'document' list as "no utterances"
        # rather than failing on the [0] lookup.
        documents = data.get('document') or [{}]
        utterances = documents[0].get('utterance') or []

        for i, utterance in enumerate(utterances):
            if utterance.get('id', '') in id_lookup:
                metadata_title = data.get('metadata', {}).get('title', '')
                metadata_titles_with_caret.append((filename, metadata_title, i, utterances))
                metadata_titles_data.append(data)

    return metadata_titles_with_caret, metadata_titles_data

def create_excel_with_all_sentences(folder_path, metadata_titles_with_caret, metadata_titles_data, excel_file):
    """Write every utterance of each matched document to an Excel file.

    Each row holds one utterance together with the document's setting/topic
    metadata and the details of the utterance's speaker.

    Args:
        folder_path: Unused; kept so existing callers keep working.
        metadata_titles_with_caret: ``(filename, title, utterance_index,
            utterances)`` tuples, one per matched utterance.
        metadata_titles_data: Parsed JSON objects, parallel to the list above.
        excel_file: Destination passed to ``DataFrame.to_excel``.
    """
    columns = [
        'setting_relation', 'setting_intimacy', 'setting_contact_frequency',
        'document_topic', 'sentence_utterance_id', 'speaker_id',
        'utterance_form', 'sentence_time', 'speaker_age',
        'speaker_occupation', 'speaker_sex', 'speaker_birthplace',
        'speaker_principal_residence', 'speaker_current_residence',
        'speaker_device', 'speaker_keyboard',
    ]
    # Speaker fields in output-column order.
    # NOTE(review): 'pricipal_residence' is kept exactly as the original code
    # spells it; presumably it mirrors the field name in the data - confirm
    # before changing it.
    speaker_keys = (
        'age', 'occupation', 'sex', 'birthplace', 'pricipal_residence',
        'current_residence', 'device', 'keyboard',
    )

    rows = []
    seen_files = set()
    for (filename, _title, _index, utterances), data in zip(metadata_titles_with_caret, metadata_titles_data):
        # The input has one entry per matched utterance, so a file with
        # several matches appears several times. The whole document is
        # exported regardless of which utterance matched, so write each file
        # once instead of repeating identical rows.
        if filename in seen_files:
            continue
        seen_files.add(filename)

        # Only the first entry of 'document' carries the metadata used here.
        document = (data.get('document') or [{}])[0]
        metadata = document.get('metadata', {})
        setting = metadata.get('setting', {})
        speakers = metadata.get('speaker', [])

        document_columns = [
            setting.get('relation', ''),
            setting.get('intimacy', ''),
            setting.get('contact_frequency', ''),
            metadata.get('topic', ''),
        ]

        for sentence in utterances:
            speaker_id = sentence.get('speaker_id', '')
            speaker_info = next((speaker for speaker in speakers if speaker.get('id') == speaker_id), {})
            sentence_columns = [
                sentence.get('id', ''),
                speaker_id,
                sentence.get('form', ''),
                sentence.get('time', ''),
            ]
            speaker_columns = [speaker_info.get(key, '') for key in speaker_keys]
            rows.append(document_columns + sentence_columns + speaker_columns)

    df = pd.DataFrame(rows, columns=columns)

    # to_excel() no longer accepts 'encoding' (removed in pandas 2.0, where
    # passing it raises TypeError), so the keyword is not passed.
    df.to_excel(excel_file, index=False)

# Folder that contains the JSON files to scan.
folder_path = "C:\\Users\\ArumPark\\Desktop\\Projekt_Hamburg\\NIKL_MESSENGER_v2.0\\국립국어원 메신저 말뭉치(버전 2.0)"

def get_utterance_ids(file_path, sheet_name, column_name):
    """Return the values of ``column_name`` from one sheet of an Excel file.

    If reading the file or selecting the column raises any exception, the
    exception is printed as ``Error: <exception>`` and ``[]`` is returned.
    """
    try:
        return pd.read_excel(file_path, sheet_name=sheet_name)[column_name].tolist()
    except Exception as exc:
        print("Error:", exc)
    return []

# Adjust the file path and column name as needed.
file_path = "smiliey_new_annotation_file_20chats_each_110624.xlsx"
sheet_name = "annotation"
column_name = "utterance_id"

utterance_ids = get_utterance_ids(file_path, sheet_name, column_name)

# List of utterance_ids to extract.
target_utterance_ids = utterance_ids  # add the utterance_ids you need here

# Collect the metadata_title entries for the documents that contain the given utterance_ids.
metadata_titles_with_caret, metadata_titles_data = extract_metadata_titles(folder_path, target_utterance_ids)

# Path of the Excel file to write.
excel_file_path = 'C:/Users/ArumPark/Desktop/Disputation/emoticon/data/new_annotation_context_all_sentences.xlsx'

# Build the Excel file only from the JSON files that contain one of the given utterance_ids.
create_excel_with_all_sentences(folder_path, metadata_titles_with_caret, metadata_titles_data, excel_file_path)

# In [5]:
# Convert the output to an Excel file.

# Path of the CSV file to read.
csv_file_path = "C:/Users/ArumPark/Desktop/Disputation/emoticon/data/new_annotation_context_test2.csv"

# Path of the Excel file to write.
excel_file_path = "C:/Users/ArumPark/Desktop/Disputation/emoticon/data/new_annotation_context_excel_test.xlsx"

# Load the CSV file.
df = pd.read_csv(csv_file_path)

# Save it as an Excel file.
df.to_excel(excel_file_path, index=False)
