In [1]:
import re
from pathlib import Path

In [2]:
def extract(infile, outfile):
    with open(infile, encoding='utf-8') as f, open(outfile, 'w', encoding='utf-8') as fout:
        lines = f.readlines()
        concepts = {}

        for line in lines:
            try:
                line = eval(line)
            except:
                print(f"Skipping line due to eval error: {line.strip()}")
                continue

            for res in line:
                res = res.split('\n')
                for term in res:
                    term = term.strip().lower()
                    if not term:
                        continue

                    terms = term.split(':')
                    if len(terms) != 2:
                        print(f"Parsing error: term {term}")
                        continue

                    concept, epa = terms[0].strip(), terms[1].strip()

                    concept = concept.rstrip(']').strip()

                    # Replace underscores with spaces to unify the concept name
                    concept = concept.replace('_', ' ')

                    # Extract numeric identifier and concept
                    match = re.search(r'(\d+#.*)', concept)
                    if not match:
                        print(f"Parsing error: concept {concept}")
                        continue

                    num_concept = match.group(1)
                    epa = epa.strip('[]')

                    try:
                        epa = float(epa)
                    except:
                        print(f"EPA '{epa}' cannot be parsed for term '{term}'")
                        continue

                    if num_concept not in concepts:
                        concepts[num_concept] = []

                    concepts[num_concept].append(epa)

        for num_concept in sorted(concepts.keys(), key=lambda x: int(x.split('#')[0])):
            if len(concepts[num_concept]) != 5:
                print(f"Concept {num_concept} does not have 5 outputs!")
            else:
                num, concept = num_concept.split('#', 1)
                epa_values = concepts[num_concept]
                fout.write(f"{num}, {concept}, {epa_values}\n")

In [26]:
infile = 'chatgpt_f_A_1_600_714.txt'
outfile = 'extracted_f/chatgpt_f_A_1_600_714.txt'
Path('extracted').mkdir(parents=True, exist_ok=True)
extract(infile, outfile)

Parsing error: term ```plaintext
Parsing error: term ```


! for individual dataset

In [9]:
import pandas as pd

In [37]:
df = pd.read_csv('extracted/chatgpt_s1_A_1_0_41.txt', header=None)

# 解析方括号中的内容并转换为列表
df[2] = df[2].str.strip('[]').str.split(', ')

# 添加列名
df.columns = ['number', 'concept', 'values']

# 将列表中的内容拆分为多列
df[['E1', 'E2', 'E3', 'E4', 'E5']] = pd.DataFrame(df['values'].tolist())

# 删除原始的values列
df.drop(columns=['values'], inplace=True)

ValueError: Length mismatch: Expected axis has 7 elements, new values have 3 elements

In [28]:
def data_change_form(filename):
    data = []
    with open(filename, encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            parts = line.split(",")
            num_concept = parts[0].strip()
            concept = parts[1].strip()
            epa_values = list(map(float, re.findall(r"-?\d+\.\d+", parts[2])))
            data.append([num_concept, concept] + epa_values)
    
    # 提取数字和概念
    df = pd.DataFrame(data, columns=["number", "concept"] + ["E1", "E2", "E3", "E4", "E5"])

    # 按照数字#概念将数字和概念拆分为两列
    df[['number', 'concept']] = df['number'].str.split('#', expand=True)

    return df

In [30]:
df_e = data_change_form('extracted/chatgpt_s1_E_1_0_41.txt')

ValueError: 7 columns passed, passed data had 3 columns

In [25]:
# Extract data from files
df_e = data_change_form('extracted/chatgpt_s1_E_1_0_41.txt', include_metadata=True)
df_p = data_change_form('extracted/chatgpt_s1_P_1_0_41.txt', include_metadata=False)
df_a = data_change_form('extracted/chatgpt_s1_A_1_0_41.txt', include_metadata=False)

# Rename columns for df2 and df3
df_p.columns = [f"P{i+1}" for i in range(df_p.shape[1])]
df_a.columns = [f"A{i+1}" for i in range(df_a.shape[1])]

# Merge dataframes on the index
combined_df = pd.concat([df_e, df_p, df_a], axis=1)

In [26]:
print(df_e)

   number                                concept   E1
0       0                      un bateau de fret  0.5
1       1                           une épicerie  2.0
2       2           supplier qqn. de faire qqch. -1.0
3       3                         une transgenre  1.5
4       4                          un kidnappeur -4.0
5       5                          déranger qqn. -2.5
6       6  augmenter la distance avec qqn./qqch. -2.0
7       7                 une conductrice de bus  2.0
8       8                 un magasin de lingerie  2.5
9       9                            une fumeuse -1.0
10     10                   un palais de justice  1.5
11     11                           un casse cou  0.5
12     12                             un aveugle  0.0
13     13                  un assistant dentaire  2.5
14     14                         un homme queer  2.0
15     15                           un casse cou  0.5
16     16                         un homme queer  2.0
17     17                   

In [27]:
print(combined_df)

   number                                concept   E1   P1   A1
0       0                      un bateau de fret  0.5  0.0 -3.0
1       1                           une épicerie  2.0  0.0  0.0
2       2           supplier qqn. de faire qqch. -1.0 -1.0  1.0
3       3                         une transgenre  1.5  0.0  0.0
4       4                          un kidnappeur -4.0  4.0  3.0
5       5                          déranger qqn. -2.5 -1.0  2.0
6       6  augmenter la distance avec qqn./qqch. -2.0 -1.0 -2.0
7       7                 une conductrice de bus  2.0  0.0 -1.0
8       8                 un magasin de lingerie  2.5  0.0  1.0
9       9                            une fumeuse -1.0 -2.0  1.0
10     10                   un palais de justice  1.5  2.0 -2.0
11     11                           un casse cou  0.5  2.0  3.0
12     12                             un aveugle  0.0 -2.0 -1.0
13     13                  un assistant dentaire  2.5 -1.0  0.0
14     14                         un hom