In [1]:
import pandas as pd
import json
from tqdm import tqdm
import time

In [2]:
file = "/content/drive/MyDrive/과제/콜센터_분석/data/asg_test_ds(120)_processed.csv"
df = pd.read_csv(file)

### Prompt

In [6]:
prompt_1 = """
"""

In [13]:
api_key=""
api_gw_key=""

In [14]:
def call_clova_api(api_key, api_gw_key, messages):
    url = 'https://clovastudio.stream.ntruss.com/testapp/v1/chat-completions/HCX-003'

    headers = {
        'X-NCP-CLOVASTUDIO-API-KEY': api_key,
        'X-NCP-APIGW-API-KEY': api_gw_key,
        'Content-Type': 'application/json',
    }

    data = {
        "topK": 0,
        "includeAiFilters": True,
        "maxTokens": 1000,
        "temperature": 0.25,
        "messages": messages,
        "repeatPenalty": 4,
        "topP": 0.8
    }

    try:
        response = requests.post(url, headers=headers, data=json.dumps(data))

        if response.status_code != 200:
            return None, f"API 호출 실패: {response.status_code}, {response.text}"

        if not response.text:
            return None, "Empty response received"

        try:
            response_json = response.json()

        except json.JSONDecodeError as e:
            return None, f"JSON decoding error: {e} - Response text: {response.text[:100]}"

        if isinstance(response_json, dict):
            return response_json, None
        else:
            return None, "Unexpected response format"

    except requests.exceptions.RequestException as e:
        return None, f"Request failed: {e}"

In [15]:
def process_response_content(response_content):
    try:
        result = json.loads(response_content)
        return result
    except json.JSONDecodeError:
        return {"error": "Invalid JSON structure", "raw_response": response_content}

In [16]:
def process_single_row(row, prompt, api_key, api_gw_key):
    try:
        context = row['formatted_text']
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": context},
        ]

        response_json, error = call_clova_api(api_key, api_gw_key, messages)

        if error:
            return {"error": error}

        if 'result' not in response_json or 'message' not in response_json['result']:
            return {"error": "Unexpected response format", "raw_response": response_json}

        result_content = response_json['result']['message'].get('content', '')
        return process_response_content(result_content)

    except Exception as e:
        return {"error": str(e)}

In [18]:
# Single row test
single_row = df[df['idx'] == 10].iloc[0]
print(single_row)

process_single_row(single_row, prompt_1, api_key, api_gw_key)

idx                                                                  10
text                  [{'speaker': 'A', 'text': '여보세요'}, {'speaker':...
formatted_text        A: 여보세요\nB: 네 안녕하세요. 현대커머셜 고은영입니다. 실례하지만 김현희 고...
len_text                                                           2046
len_formatted_text                                                 1369
Name: 9, dtype: object


{'result7': '고객이 입원 중이라 전화를 받지 못해 결제가 지연되었고, 상담원은 오늘 중으로 처리를 요청했으나 고객은 금요일까지 처리를 약속함.',
 'result8': 60,
 'result9': '상담원이 고객의 상황을 이해하고 금요일까지 처리하도록 배려하면서도 오늘 중으로 처리가 가능하면 전용 계좌로 입금하도록 안내함.',
 'result10': '상담원이 "오늘이라도 되시면 신한은행 전용 계좌로 입금을 해주시구요"라고 안내함',
 'result11': 70,
 'result12': '고객이 결제 지연에 대해 사과하고 금요일까지 처리할 것을 약속함. 또한 자동 이체 외 다른 계좌 정보를 요청함',
 'result13': '고객이 "자동 이체 말고 가산 계좌는 금액을 좀 문자로 남겨놔 주시면 그 전에라도 정리를 해드릴게요."라고 말함',
 'result14': 50}

### Batch

In [None]:
def add_clova_results_to_dataframe(df, prompt, api_key, api_gw_key, max_retries=3):
    results = []
    errors = []
    start_time = time.time()

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows", unit="row"):
        try:
            result = process_single_row(row, prompt, api_key, api_gw_key)
            if "error" in result:
                raise Exception(result["error"])
            results.append(result)
        except Exception as e:
            raw_response = result.get("raw_response", None) if 'result' in locals() else None
            errors.append({
                "idx": idx,
                "row": row.to_dict(),
                "error": str(e),
                "raw": raw_response
            })
            results.append(None)

        elapsed_time = time.time() - start_time
        avg_time_per_row = elapsed_time / (idx + 1)
        remaining_time = avg_time_per_row * (len(df) - idx - 1)

        tqdm.write(f"Row {idx + 1}/{len(df)} processed. Elapsed: {elapsed_time:.2f}s, "
                   f"Estimated remaining: {remaining_time:.2f}s")

    error_df = pd.DataFrame(errors)
    df['result'] = results

    return df, error_df

In [None]:
def merge_retry_results(result_df, error_df):
    """
    Merge retry results into the main result DataFrame based on idx.

    Args:
        result_df (pd.DataFrame): The main DataFrame containing all results.
        error_df (pd.DataFrame): The DataFrame with retried results.

    Returns:
        pd.DataFrame: Updated result DataFrame with retried results merged.
    """
    error_results = error_df[['idx', 'result']].dropna()
    result_df = result_df.set_index('idx')
    error_results = error_results.set_index('idx')

    result_df.update(error_results)

    return result_df.reset_index()

In [None]:
#bug fix
def retry_failed_rows(result_df, error_df, prompt, api_key, api_gw_key, max_retries=3):
    """
    Retry processing for failed rows and merge results into the main DataFrame.

    Args:
        result_df (pd.DataFrame): The main DataFrame containing all results.
        error_df (pd.DataFrame): The DataFrame with failed rows and error details.
        prompt (str): The LLM prompt to use.
        api_key (str): API key for Clova API.
        api_gw_key (str): API gateway key for Clova API.
        max_retries (int): Maximum number of retries.

    Returns:
        pd.DataFrame: Updated result DataFrame with retried results merged.
        pd.DataFrame: Updated error DataFrame with any remaining errors.
    """
    retries = 0
    while retries < max_retries and not error_df.empty:
        new_errors = []
        retried_results = []
        start_time = time.time()

        for idx, row_data in tqdm(
            error_df.iterrows(),
            total=len(error_df),
            desc=f"Retrying (Attempt {retries + 1})",
            unit="row"
        ):
            row = pd.Series(row_data["row"])
            try:
                result = process_single_row(row, prompt, api_key, api_gw_key)
                if "error" in result:
                    raise Exception(result["error"])
                retried_results.append({"idx": idx, "result": result})  # Store successful results
            except Exception as e:
                raw_response = result.get("raw_response", None) if 'result' in locals() else None
                new_errors.append({
                    "idx": idx,
                    "row": row.to_dict(),
                    "error": str(e),
                    "raw": raw_response
                })

            elapsed_time = time.time() - start_time
            avg_time_per_row = elapsed_time / (len(error_df) if len(error_df) > 0 else 1)
            remaining_time = avg_time_per_row * (len(error_df) - len(retried_results) - len(new_errors))
            tqdm.write(f"Row {idx + 1}/{len(error_df)} retried. Elapsed: {elapsed_time:.2f}s, "
                       f"Estimated remaining: {remaining_time:.2f}s")

        # Update error_df with new errors and merge retried results into result_df
        error_df = pd.DataFrame(new_errors)
        if retried_results:
            retried_results_df = pd.DataFrame(retried_results)
            result_df = merge_retry_results(result_df, retried_results_df)

        retries += 1

    return result_df, error_df

In [None]:
df, error_df = add_clova_results_to_dataframe(df, prompt_1, api_key, api_gw_key, max_retries=1)
result_df = df.copy()

Processing rows:  10%|█         | 1/10 [00:08<01:18,  8.69s/row]

Row 1/10 processed. Elapsed: 8.70s, Estimated remaining: 78.26s


Processing rows:  20%|██        | 2/10 [00:18<01:13,  9.23s/row]

Row 2/10 processed. Elapsed: 18.31s, Estimated remaining: 73.22s


Processing rows:  30%|███       | 3/10 [00:31<01:16, 10.93s/row]

Row 3/10 processed. Elapsed: 31.26s, Estimated remaining: 72.93s


Processing rows:  40%|████      | 4/10 [00:40<01:00, 10.15s/row]

Row 4/10 processed. Elapsed: 40.21s, Estimated remaining: 60.32s


Processing rows:  50%|█████     | 5/10 [00:50<00:50, 10.14s/row]

Row 5/10 processed. Elapsed: 50.35s, Estimated remaining: 50.35s


Processing rows:  60%|██████    | 6/10 [01:02<00:43, 10.82s/row]

Row 6/10 processed. Elapsed: 62.47s, Estimated remaining: 41.64s


Processing rows:  70%|███████   | 7/10 [01:10<00:29,  9.98s/row]

Row 7/10 processed. Elapsed: 70.73s, Estimated remaining: 30.31s


Processing rows:  80%|████████  | 8/10 [01:28<00:24, 12.47s/row]

Row 8/10 processed. Elapsed: 88.54s, Estimated remaining: 22.13s


Processing rows:  90%|█████████ | 9/10 [01:36<00:11, 11.12s/row]

Row 9/10 processed. Elapsed: 96.67s, Estimated remaining: 10.74s


Processing rows: 100%|██████████| 10/10 [01:46<00:00, 10.70s/row]

Row 10/10 processed. Elapsed: 107.01s, Estimated remaining: 0.00s



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['result'] = results


In [None]:
if not error_df.empty:
    result_df, error_df = retry_failed_rows(result_df, error_df, prompt_1, api_key, api_gw_key, max_retries=2)

Retrying (Attempt 1):  10%|█         | 1/10 [00:07<01:09,  7.71s/row]

Row 1/10 retried. Elapsed: 7.71s, Estimated remaining: 6.94s


Retrying (Attempt 1):  20%|██        | 2/10 [00:18<01:16,  9.53s/row]

Row 2/10 retried. Elapsed: 18.51s, Estimated remaining: 14.81s


Retrying (Attempt 1):  30%|███       | 3/10 [00:28<01:08,  9.84s/row]

Row 3/10 retried. Elapsed: 28.72s, Estimated remaining: 20.10s


Retrying (Attempt 1):  40%|████      | 4/10 [00:36<00:54,  9.06s/row]

Row 4/10 retried. Elapsed: 36.58s, Estimated remaining: 21.95s


Retrying (Attempt 1):  50%|█████     | 5/10 [00:46<00:47,  9.53s/row]

Row 5/10 retried. Elapsed: 46.96s, Estimated remaining: 23.48s


Retrying (Attempt 1):  60%|██████    | 6/10 [00:57<00:39,  9.83s/row]

Row 6/10 retried. Elapsed: 57.37s, Estimated remaining: 22.95s


Retrying (Attempt 1):  70%|███████   | 7/10 [01:04<00:26,  8.86s/row]

Row 7/10 retried. Elapsed: 64.22s, Estimated remaining: 19.27s


Retrying (Attempt 1):  80%|████████  | 8/10 [01:17<00:20, 10.39s/row]

Row 8/10 retried. Elapsed: 77.90s, Estimated remaining: 15.58s


Retrying (Attempt 1):  90%|█████████ | 9/10 [01:26<00:09,  9.88s/row]

Row 9/10 retried. Elapsed: 86.66s, Estimated remaining: 8.67s


Retrying (Attempt 1): 100%|██████████| 10/10 [01:36<00:00,  9.70s/row]

Row 10/10 retried. Elapsed: 96.96s, Estimated remaining: 0.00s





In [None]:
def prettify_result_column(df, result_column='result', pretty_column='pretty_result'):
    """
    Format JSON data in the specified column into a readable string with indentation.

    Args:
        df (pd.DataFrame): DataFrame containing the JSON data.
        result_column (str): Column name containing the JSON data.
        pretty_column (str): Name for the new column with pretty-printed JSON.

    Returns:
        pd.DataFrame: Updated DataFrame with the pretty-printed JSON column added.
    """
    df = df.copy()
    df[pretty_column] = df[result_column].apply(
        lambda x: json.dumps(x, indent=2, ensure_ascii=False) if isinstance(x, dict) else x
    )

    return df

In [None]:
result_df_pretty = prettify_result_column(result_df, result_column='result')
result_df_pretty.head(2)

Unnamed: 0,idx,text,formatted_text,len_text,len_formatted_text,result,pretty_result
0,1,"[{'speaker': 'A', 'text': '여보세요 네 안녕하십니까 현대커머셜...",A: 여보세요 네 안녕하십니까 현대커머셜 고은영입니다. 실례하지만 윤현중 고객님 본...,1646,1125,{'result7': '상담원이 고객의 민원 건 처리 상황을 확인하고 결제 관련 정...,"{\n ""result7"": ""상담원이 고객의 민원 건 처리 상황을 확인하고 결제 ..."
1,2,"[{'speaker': 'A', 'text': '네 안녕하세요. 현대커머셜 구은영입...",A: 네 안녕하세요. 현대커머셜 구은영입니다. 시라지반 조완모 고객님 본인 맞으세요...,1483,1118,"{'result7': '상담원이 고객의 오일자 결제건에 대해 처리 일정을 확인하고,...","{\n ""result7"": ""상담원이 고객의 오일자 결제건에 대해 처리 일정을 확..."


In [None]:
file = "/content/drive/MyDrive/과제/콜센터_분석/data/result_pretty.csv"
result_df_pretty.to_csv(file, index=False)

### 패턴

1. 콤마 뒤 값 분리 실패    
새로운 값으로 간주해야 할 텍스트가 같은 필드에 포함됨.

2. 인용문 처리 문제  
인용문 내부에 따옴표가 제대로 이스케이프되지 않아 JSON 구조를 깨트림.

3. 문장 경계 인식 실패    
한 문장에서 끊어야 할 부분을 제대로 구분하지 못하고 이어붙임.

4. 필드 혼합  
다른 필드로 나눠야 할 값을 같은 필드에 포함함.