# txt파일 데이터프레임으로 변환

In [None]:
from datetime import datetime

import pandas as pd
import numpy as np
import csv
import re
import random
import json

In [None]:
# Presumably the trailing text of a date-separator line in the chat export
# ("...일 ---------------").
# NOTE(review): not referenced by any code visible in this file — confirm
# whether it is still needed.
date_sep = '일 ---------------'

def remove_special(string):
    """Return *string* with everything except Hangul syllables, ASCII letters
    and digits removed.

    Whitespace, punctuation, underscores and standalone Hangul jamo
    (e.g. 'ㅋ') fall outside the allowed character class and are dropped.
    """
    disallowed = re.compile(r"[^가-힣a-zA-Z0-9]")
    return disallowed.sub("", string)

def check_am_pm(string):
    """Convert a Korean 12-hour clock string to 24-hour 'HH:MM:00'.

    Args:
        string: Text of the form '<오전|오후> H:MM', e.g. '오후 3:07'.

    Returns:
        The time as a zero-padded 'HH:MM:00' string (seconds are always '00').

    Raises:
        IndexError: If the space-separated or colon-separated parts are missing.
        ValueError: If the hour or minute is not an integer.
    """
    tokens = string.split(' ')
    meridiem = tokens[0]
    clock_fields = tokens[1].split(':')
    hour = int(clock_fields[0])
    minute = int(clock_fields[1])

    if meridiem == '오전':
        # 12 AM is midnight.
        if hour == 12:
            hour = 0
    elif meridiem == '오후':
        # 12 PM stays 12; every other PM hour shifts by twelve.
        if hour != 12:
            hour += 12

    return f'{hour:02}:{minute:02}:00'

def make_df(file_path):
    """Parse an exported chat text file into a DataFrame, one row per message.

    Two kinds of lines are interpreted; every other line is ignored:

    * Lines starting with '---------------' are date separators. The
      space-separated fields 1-3 (year, month, day) each have their final
      character stripped and are combined into 'YYYY-MM-DD'.
    * Lines starting with '[' are split on '] ' into sender, time and
      message text ("[sender] [time] message").

    Messages are dropped when the sender is '방장봇', when the text is the
    deleted-message notice, or when the text contains one of the excluded
    keywords. Lines that start with '[' but do not have the
    "[sender] [time] message" shape (for example a continuation line of a
    multi-line message) are skipped instead of raising.

    Args:
        file_path: Path to the UTF-8 encoded chat export.

    Returns:
        A DataFrame with columns 'person', 'date', 'time' and 'context'.
    """
    # Messages containing any of these substrings are unrelated to the
    # conversation and are dropped. Add further keywords here.
    excluded_keywords = (
        '하트인증', '/닉네임', '/SCD란', '친목다과회', '토론방', '디스코드', '신문고',
    )
    deleted_notice = '삭제된 메시지입니다.'

    person = []
    date = []
    time = []
    context = []
    current_date = ''

    with open(file_path, 'r', encoding='UTF-8') as input_file:
        for line in input_file:
            if line.startswith('---------------'):
                fields = line.split(' ')
                try:
                    current_date = (
                        fields[1][:-1]
                        + '-' + format(int(fields[2][:-1]), '02')
                        + '-' + format(int(fields[3][:-1]), '02')
                    )
                except (IndexError, ValueError):
                    # A dash line that is not a real date separator:
                    # keep the previously parsed date.
                    pass
                continue

            if not line.startswith('['):
                continue

            sp = line.split('] ')
            if len(sp) < 3:
                # Not a "[sender] [time] message" line.
                continue

            sender = sp[0][1:]
            if sender == '방장봇':  # skip messages posted by the room bot
                continue

            # Re-join so that a '] ' inside the message text is preserved.
            message = '] '.join(sp[2:]).strip()
            if message == deleted_notice or any(
                keyword in message for keyword in excluded_keywords
            ):
                continue

            try:
                clock = check_am_pm(sp[1][1:])
            except (IndexError, ValueError):
                # Second bracket does not hold a parsable time.
                continue

            # Append to all four lists together so they stay aligned.
            person.append(sender)
            date.append(current_date)
            time.append(clock)
            context.append(message)

    return pd.DataFrame(
        {'person': person, 'date': date, 'time': time, 'context': context}
    )
    
def context_punc(c):
    """Replace media placeholder messages with their annotation tokens.

    Args:
        c: A message text. Non-string values (e.g. NaN from a reloaded CSV)
            are returned unchanged.

    Returns:
        '#@이모티콘#' for '이모티콘'; '#@시스템#사진#' for '사진' or a
        two-word text that starts with '사진' and ends with '장';
        '#@시스템#동영상#' for '동영상'; otherwise *c* itself.
    """
    # Explicit guard instead of a bare ``except``: only strings can match.
    if not isinstance(c, str):
        return c

    if c == '이모티콘':
        return '#@이모티콘#'
    if c == '사진':
        return '#@시스템#사진#'
    # Multi-photo placeholder such as '사진 3장'.
    if c.startswith('사진') and c.endswith('장') and len(c.split(' ')) == 2:
        return '#@시스템#사진#'
    if c == '동영상':
        return '#@시스템#동영상#'
    return c
    
def get_attribute(df):
    """Add utterance, turn and participant ID columns to *df*.

    Rows are read in their current order from the 'person' column:

    * 'utteranceID' is 'U<n>', counting every row from 1.
    * 'turnID' is 'T<n>'; the counter advances whenever the speaker differs
      from the previous row's speaker.
    * 'participantID' is 'P<nnn>' (zero-padded to three digits), assigned in
      order of first appearance, e.g. the first speaker is 'P001'.

    The columns are added to *df* in place and the same object is returned.
    Values are read positionally, so duplicated index labels are fine.

    Args:
        df: DataFrame with a 'person' column.

    Returns:
        A tuple ``(df, utterance_count, turn_count, participant_count)``.
    """
    utterance_ids = []
    turn_ids = []
    participant_ids = []
    participant_codes = {}  # speaker name -> 'P<nnn>'

    turn_count = 0
    previous_speaker = ""
    for utterance_number, speaker in enumerate(df['person'], start=1):
        if speaker != previous_speaker:
            turn_count += 1
            previous_speaker = speaker
        if speaker not in participant_codes:
            participant_codes[speaker] = "P" + format(len(participant_codes) + 1, '03')
        participant_ids.append(participant_codes[speaker])
        utterance_ids.append("U" + str(utterance_number))
        turn_ids.append("T" + str(turn_count))

    df['utteranceID'] = utterance_ids
    df['turnID'] = turn_ids
    df['participantID'] = participant_ids

    return df, len(utterance_ids), turn_count, len(participant_codes)

def _parse_timestamp(date_str, time_str):
    """Return the datetime for 'YYYY-MM-DD' + 'HH:MM:SS', or None if unparsable."""
    try:
        return datetime.strptime(f'{date_str} {time_str}', '%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError):
        return None


def get_last_dialog(df, max_gap_hours=2):
    """Return only the most recent conversation session from *df*.

    Consecutive rows are compared by their combined 'date' and 'time'
    values. Whenever the later row follows the earlier one by more than
    *max_gap_hours*, a new session is considered to start at the later row.
    The rows from the last such start to the end are returned. The full
    elapsed time is used, so gaps longer than a day are detected.

    Pairs in which either timestamp cannot be parsed are skipped.

    Args:
        df: DataFrame with 'date' ('YYYY-MM-DD') and 'time' ('HH:MM:SS')
            string columns, in chronological row order.
        max_gap_hours: Silence, in hours, that separates two sessions.

    Returns:
        The rows of the last session with a fresh 0-based index; the old
        index is kept as an 'index' column (``reset_index()`` default).
    """
    stamps = [
        _parse_timestamp(date_str, time_str)
        for date_str, time_str in zip(df['date'], df['time'])
    ]

    start = 0
    for position, (earlier, later) in enumerate(zip(stamps, stamps[1:]), start=1):
        if earlier is None or later is None:
            continue
        # total_seconds() includes whole days; .seconds alone would not.
        if (later - earlier).total_seconds() / 3600 > max_gap_hours:
            # The session begins with the first message after the gap.
            start = position

    return df.iloc[start:].reset_index()

In [None]:
# Parse the exported chat log into a DataFrame.
# Usage: talk_df = make_df('<path to exported chat .txt>')
talk_df = make_df('KakaoTalk_20220524_0259_07_155_mbti단톡방_group.txt')
talk_df

In [None]:
# Replace emoticon / photo / video placeholder messages with '#@...#' tokens.
talk_df['context'] = talk_df['context'].transform(context_punc)
talk_df

In [None]:
# Keep only the most recent conversation session.
talk_df = get_last_dialog(talk_df)
talk_df

In [None]:
# The string below is a cell note in Korean; it says roughly: "uncomment the
# lines of the following cell and run each step separately".
"""
아래셀은 주석 풀며 각각 처리
"""

# Step 1: export to CSV, then scrub personal information locally.
# talk_df.to_csv('talk.csv', encoding='UTF-8-sig')

# Step 2: after scrubbing, reload the CSV and continue from here.
# talk_df = pd.read_csv('talk.csv')
# talk_df

In [None]:
# Add utteranceID / turnID / participantID columns and collect the totals
# that are later written into the dialogue header.
talk_df, utterance, turn, participant = get_attribute(talk_df)
print(f'utterance: {utterance}, turn:{turn}, participant:{participant}')
talk_df

In [None]:
# Keep only these six columns, in this order; every other column
# (e.g. the raw 'person' name) is dropped here.
talk_df = talk_df[['utteranceID', 'turnID', 'participantID', 'date', 'time', 'context']]  # reorder columns
talk_df

In [None]:
# Serialize the rows as a JSON array of per-row objects; force_ascii=False
# keeps non-ASCII text unescaped.
body = talk_df.to_json(orient = 'records', force_ascii=False)
body

In [None]:
# Parse the JSON string back into plain Python objects (a list of dicts) so
# it can be nested inside the output document.
body = json.loads(body)
body

In [None]:
# Container for the whole dialogue record; filled in by a later cell.
total = {}

# Header metadata describing this dialogue, using the counts computed by
# get_attribute().
dialogueInfo = {
    "dialogueID": 'mbti',
    "numberOfParticipants": participant,
    "numberOfUtterances": utterance,
    "numberOfTurns": turn,
    "type": "일상 대화",
    "topic": "개인 및 관계",
}
dialogueInfo

In [None]:
# Attach the header metadata and the utterance list to the dialogue record.
total.update({'header': dialogueInfo, 'body': body})
total

In [None]:
# Wrap the single dialogue record in a top-level envelope.
final_json = {
    "numberOfItems": 1,
    "data": total,
}
final_json

In [None]:
# Write the dialogue record to disk. ensure_ascii=False emits raw non-ASCII
# characters, so the file encoding is pinned to UTF-8 instead of relying on
# the platform default (which can fail or produce non-UTF-8 JSON).
# NOTE(review): `final_json` is built above but `total` is what gets written
# here — confirm which of the two is the intended output.
with open('./output_json.json', 'w', encoding='utf-8') as f:
    json.dump(total, f, ensure_ascii=False, indent=4)