# PDTB 2.0

In [None]:
import numpy as np
import pandas as pd

# Columns of the PDTB 2.0 CSV that are actually loaded.
usecols = [
    'Relation', 'Section', 
    'Arg1_RawText', 'Arg2_RawText', 
    'Conn1', 'Conn2',
    'ConnHeadSemClass1', 'ConnHeadSemClass2',
    'Conn2SemClass1', 'Conn2SemClass2'
]
df = pd.read_csv(
    r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB2\pdtb2.csv',
    low_memory=False,
    usecols=usecols,
)

# Shorter column names used by every later cell:
# ConnHeadSemClass{1,2} -> Conn1Sem{1,2}, Conn2SemClass{1,2} -> Conn2Sem{1,2}.
rename_map = {
    'Arg1_RawText': 'Arg1',
    'Arg2_RawText': 'Arg2',
    'ConnHeadSemClass1': 'Conn1Sem1',
    'ConnHeadSemClass2': 'Conn1Sem2',
    'Conn2SemClass1': 'Conn2Sem1',
    'Conn2SemClass2': 'Conn2Sem2',
}
df.rename(columns=rename_map, inplace=True)
# Keep an unfiltered copy (all relation types) before narrowing `df`.
init_df = df.copy()

# From here on `df` holds implicit relations only.
df = df[df['Relation']=='Implicit']
# print(df.shape)

# Split by section: train = every section except 0, 1 and 21-24,
# dev = sections 0-1, test = sections 21-22 (23-24 are left unused).
train_df = df[~df['Section'].isin([0, 1, 21, 22, 23, 24])]
dev_df = df[df['Section'].isin([0, 1])]
test_df = df[df['Section'].isin([21, 22])]

# Frequency of each primary sense of the first connective, shown sorted by
# sense string (the last expression is what the notebook cell displays).
from collections import Counter
cnt = Counter(df['Conn1Sem1'])
sorted(cnt.items())

In [None]:
# Distinct primary senses of the first connective, displayed in sorted order.
labels = df['Conn1Sem1'].unique().tolist()
sorted(labels)
# labels

## Level 1 & 2 statistics

In [None]:
# GOLF
# Level-2 senses that are counted; every other sense is ignored.
selected_second_senses = [
    'Expansion.Conjunction',
    'Expansion.Restatement',
    'Expansion.Instantiation',
    'Expansion.List',
    'Expansion.Alternative',
    'Contingency.Cause',
    'Contingency.Pragmatic cause',
    'Comparison.Contrast',
    'Comparison.Concession',
    'Temporal.Asynchronous',
    'Temporal.Synchrony',
]
cnt_df = train_df
# cnt_df = dev_df
cnt = 0   # total number of counted labels
rec = {}  # level-2 sense -> number of occurrences
# Count the first sense of connective 1 and, when present, its second sense.
# Both columns go through the same loop so missing values are skipped
# uniformly instead of failing on `.split`.
for column in ('Conn1Sem1', 'Conn1Sem2'):
    for sense in cnt_df[column]:
        if pd.isna(sense):
            continue
        # Truncate e.g. 'Contingency.Cause.Reason' to 'Contingency.Cause'.
        sense_l2 = '.'.join(sense.split('.')[:2])
        if sense_l2 in selected_second_senses:
            cnt += 1
            rec[sense_l2] = rec.get(sense_l2, 0) + 1
for k in selected_second_senses:
    # A selected sense may not occur in the chosen split; report 0 for it
    # rather than raising KeyError.
    print(k, rec.get(k, 0))
sum(rec.values())

In [None]:
# PCP
train_df.shape
dev_df.shape
test_df.shape
# Level-2 senses that are counted, kept as a set for membership tests.
selected_second_senses = {
    'Temporal.Asynchronous',
    'Temporal.Synchrony',
    'Contingency.Cause',
    'Contingency.Pragmatic cause',
    'Comparison.Contrast',
    'Comparison.Concession',
    'Expansion.Conjunction',
    'Expansion.Instantiation',
    'Expansion.Restatement',
    'Expansion.Alternative',
    'Expansion.List',
}
# Split whose primary senses are counted; switch by swapping the assignment.
cnt_df = train_df
# cnt_df = dev_df
# cnt_df = test_df
cnt = 0
for sense in cnt_df['Conn1Sem1']:
    # Keep only the first two levels of the dotted sense string.
    sense_l2 = '.'.join(sense.split('.')[:2])
    if sense_l2 not in selected_second_senses:
        continue
    cnt += 1
cnt

### Level-1 multi-label statistics

In [None]:
def sort_dict(dct):
    """Return a new dict holding the items of ``dct`` in descending value order.

    Values are negated to build the sort key, so they must support unary
    minus. Items with equal values keep their original relative order.
    """
    def negated_value(pair):
        return -pair[1]

    ordered_items = sorted(dct.items(), key=negated_value)
    return dict(ordered_items)
    
class analyse_level1_label:
    """Print level-1 sense co-occurrence statistics for a relation table.

    The class is used like a function: ``analyse_level1_label(df)`` does all
    of its work inside ``__new__``, prints two JSON reports and evaluates to
    ``None`` (no instance is ever created).
    """

    # The four level-1 classes that are counted.
    labels = 'Temporal Comparison Contingency Expansion'.split()

    @classmethod
    def str_to_label(cls, string):
        """Return the part of a dotted sense string before the first dot."""
        return string.split('.')[0]

    @classmethod
    def get_item_labels(cls, item):
        """Return ``(primary_label, secondary_labels)`` for one row.

        ``item`` must expose ``Conn1Sem1``, ``Conn1Sem2``, ``Conn2Sem1`` and
        ``Conn2Sem2`` as attributes. The primary label comes from
        ``Conn1Sem1``; each of the other three fields adds one secondary
        label unless ``pd.isna`` reports the field as missing.
        """
        primary_label = cls.str_to_label(item.Conn1Sem1)
        extra_senses = (item.Conn1Sem2, item.Conn2Sem1, item.Conn2Sem2)
        secondary_labels = [
            cls.str_to_label(sense)
            for sense in extra_senses
            if not pd.isna(sense)
        ]
        return primary_label, secondary_labels

    def __new__(cls, df, sort_res=True):
        """Count and print label co-occurrences over all rows of ``df``.

        For every primary label the report counts how often each level-1
        label appears as a secondary label; rows with no secondary label are
        counted under the empty-string key. With ``sort_res`` each inner
        dict is reordered through ``sort_dict`` before printing. A second
        report lists, per primary label, the other labels in ascending order
        of their count. Nothing is returned.
        """
        import json

        count_keys = [''] + cls.labels
        rec = {primary: dict.fromkeys(count_keys, 0) for primary in cls.labels}
        for position in range(df.shape[0]):
            primary_label, secondary_labels = cls.get_item_labels(df.iloc[position])
            counts = rec[primary_label]
            if secondary_labels:
                for label in secondary_labels:
                    counts[label] += 1
            else:
                counts[''] += 1

        if sort_res:
            rec = {primary: sort_dict(counts) for primary, counts in rec.items()}

        rec_string = json.dumps(rec, ensure_ascii=False, indent=2)
        print('num of level1 labels')
        print(rec_string)

        rank = {}
        for primary, counts in rec.items():
            # Ascending by count; the primary label itself is left out.
            ascending = sorted(cls.labels, key=counts.__getitem__)
            rank[primary] = [label for label in ascending if label != primary]
        rank_string = json.dumps(rank, ensure_ascii=False, indent=2)
        print('\nrank of level1 labels')
        print(rank_string)
    

# Print the level-1 label statistics for `train_df`.
analyse_level1_label(train_df)    
    

# PDTB 3.0

## Preprocess: merge the section files into one CSV

In [None]:
import pandas as pd
import os

# The `sections` folder must first be produced by `preprocess_pdtb3.py`.
fold_path = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB3\data\sections'

# Read 00.tsv .. 24.tsv as raw text; the header of the last file read is kept.
columns = []
data_list = []
for p in range(25):
    section_path = os.path.join(fold_path, f'{p:02d}.tsv')
    with open(section_path, 'r', encoding='utf8') as file:
        content = file.readlines()
    columns = content[0]
    data_list += content[1:]
# Split header and rows on tabs by hand; whitespace-only lines are dropped.
columns = columns.strip().split('\t')
data_list = [
    line.strip('\n').split('\t')
    for line in data_list
    if line.strip()
]
# print(set(map(len, data_list)))
df = pd.DataFrame(data_list, columns=columns)
columns
# df.shape
# df.to_csv(r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB3\pdtb3_implicit.csv', sep='\t', index=False)

In [None]:
import re
# Print every `arg2` text that contains the substring 'wj_'.
# NOTE(review): presumably a check for stray markers left in the text — confirm.
for line in df['arg2']:
    # if len(re.findall(r'\d', line)) > 3:
    #     print(line)
    if 'wj_' in line:
        print(line)

In [None]:
# Distinct values of the `conn1_sense1` column, displayed in sorted order.
sorted(set(df['conn1_sense1']))

## Others' preprocessing (strange)

In [None]:
import pandas as pd

# Line numbers under inspection:
# train.csv 9308
# sections\12.tsv 387

# Two copies of the file; the second assignment is the one that takes effect.
train_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB3\processed\train.tsv'
train_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\tmp\ttmp\train.tsv'
# df
with open(train_file, 'r', encoding='utf8')as f:
    content = f.readlines()
# To look at the suspicious lines by hand:
#     for p in range(9308, 9310): print(repr(content[p]))

# Small excerpt (header, first row, lines 9307-9309) for writing a
# reproduction file.
fake_content = [content[0],content[1],content[9307],content[9308],content[9309]]
# with open(r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\tmp\fake.csv', 'w', encoding='utf8')as f:
#     f.writelines(fake_content)
# with open(r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\tmp\fake.tsv', 'w', encoding='utf8')as f:
#     f.writelines(fake_content)

# Tab-split view of every line, used to look for rows with the wrong width.
content = [line.strip().split('\t')for line in content]
# for p, line in enumerate(content):
#     if len(line) != 10:
#         print(p)
# print(content[9308:9310])
# File line indices that pandas skips while parsing.
skiprows = [
    9308,
]
# Parse with the same encoding the file was read with above. It decoded as
# UTF-8 without error, so 'latin1' here would only garble non-ASCII text.
df = pd.read_csv(train_file, sep='\t', skiprows=skiprows, encoding='utf8')
# df.columns
df.iloc[0]
# sorted(set(df['full_sense']))

In [None]:
import pandas as pd

# Section file under inspection; file line 386 is skipped when pandas
# parses it.
strange_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB3\data\sections\12.tsv'
cur_pd12 = pd.read_csv(strange_file, sep='\t', skiprows=[386])

with open(strange_file, 'r', encoding='utf8') as file:
    content = file.readlines()
# Raw text of lines 385-387 next to the parsed rows at positions 385-387.
for line_no in (385, 386, 387):
    print(content[line_no])
print(cur_pd12.iloc[385:388])
print(len(content), cur_pd12.shape)
# print(cur_pd12.iloc[386])

## data

In [None]:
import pandas as pd

# Load the merged, tab-separated PDTB 3.0 table.
df = pd.read_csv(
    r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB3\pdtb3_implicit.csv',
    sep='\t',
)

# Split by section: train = 2-20, dev = 0-1, test = 21-22.
section_ids = df['section']
train_df = df[section_ids.isin(list(range(2, 21)))]
dev_df = df[section_ids.isin([0, 1])]
test_df = df[section_ids.isin([21, 22])]

df.columns
df.shape
# Distinct values of `conn1_sense1`, displayed in sorted order.
from collections import Counter
cnt = Counter(df['conn1_sense1'])
sorted(cnt)

## Level 1 & 2 statistics

In [None]:
# (rows, columns) of the train / dev / test splits, displayed together.
train_df.shape, dev_df.shape, test_df.shape
# dev_df.shape
# test_df.shape

In [None]:
from collections import defaultdict
import json
# Level-2 senses that are counted, written as whitespace-separated names and
# split into a list.
selected_second_senses = '''
    Expansion.Conjunction Expansion.Level-of-detail Expansion.Instantiation
    Expansion.Manner Expansion.Substitution Expansion.Equivalence
    Contingency.Cause Contingency.Purpose Contingency.Cause+Belief
    Contingency.Condition
    Comparison.Concession
    Comparison.Contrast
    Temporal.Asynchronous
    Temporal.Synchronous
'''.split()
# Only the last assignment takes effect (currently test_df); comment out or
# reorder these lines to analyse another split.
cur_df = train_df
cur_df = dev_df
cur_df = test_df
rec = defaultdict(int)  # level-2 sense -> number of occurrences
cnt = 0                 # total number of counted rows
for sense in cur_df['conn1_sense1']:
    # Keep only the first two levels of the dotted sense string.
    sense_l2 = '.'.join(sense.split('.')[:2])
    # sense_l2 = sense
    if sense_l2 in selected_second_senses:
        rec[sense_l2] += 1
        cnt += 1
# cnt
# rec
# The cell displays the list of selected senses; `cnt` and `rec` are computed
# but only shown when the lines above/below are uncommented.
selected_second_senses
# for sense in cur_df['conn2_sense1']:
#     if pd.isna(sense):
#         continue
#     sense_l2 = '.'.join(sense.split('.')[:2])
#     # sense_l2 = sense
#     if sense_l2 in selected_second_senses:
#         rec[sense_l2] += 1
# cnt
# for k in selected_second_senses:
#     print(k, rec[k])
# sum(rec.values())

# CoNLL

In [None]:
import json

# `relations.json` of the train / dev / test / blind-test folders of the
# conll16st-en-03-29-16 data.
train_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\2015-2016_conll_shared_task\data\conll16st-en-03-29-16-train\relations.json'
dev_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\2015-2016_conll_shared_task\data\conll16st-en-03-29-16-dev\relations.json'
test_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\2015-2016_conll_shared_task\data\conll16st-en-03-29-16-test\relations.json'
blind_test_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\2015-2016_conll_shared_task\data\conll16st-en-03-29-16-blind-test\relations.json'

def get_dicts(file):
    """Load a JSON-lines file into a list of decoded objects.

    Args:
        file: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf8') as f:
        # Skip whitespace-only lines (e.g. an extra empty line at the end of
        # the file) instead of letting json.loads fail on them.
        return [json.loads(line) for line in f if line.strip()]

# Load the training relations and keep those whose 'Type' is 'Implicit'.
train_dicts = get_dicts(train_file)
train_dicts = [p for p in train_dicts if p['Type'] == 'Implicit']
# Display the first implicit relation to show the record layout.
sample = train_dicts[0]
sample

In [None]:
from collections import Counter

# Every sense string of every relation in the training file (all relation
# types; the 'Sense' field of a relation is iterated, so it may contribute
# several entries).
total_senses = [q for p in get_dicts(train_file) for q in p['Sense']]
sorted(set(total_senses))

# Truncate each sense to its first two levels. The slice has to be `[:2]`:
# with `[:]` the strings stay unchanged and this list merely repeats
# `total_senses`.
total_senses_l2 = ['.'.join(p.split('.')[:2]) for p in total_senses]
sorted(set(total_senses_l2))
# Counter(total_senses_l2)

# Label Map

In [None]:
import pandas as pd

# Locations of the three corpora.
data_path_pdtb2 = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB2\pdtb2.csv'
data_path_pdtb3 = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB3\pdtb3_implicit.csv'
data_path_conll = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\CoNLL16'

# Hard-coded sense inventories, one sorted list of dotted sense strings per
# corpus. The position of a sense in its list is used as its id by the
# level-2 label-map cell.
# NOTE(review): presumably pasted from the `sorted(...)` outputs of the
# cells above — confirm before regenerating.
sense_pdtb2 = ['Comparison',
 'Comparison.Concession',
 'Comparison.Concession.Contra-expectation',
 'Comparison.Concession.Expectation',
 'Comparison.Contrast',
 'Comparison.Contrast.Juxtaposition',
 'Comparison.Contrast.Opposition',
 'Comparison.Pragmatic concession',
 'Comparison.Pragmatic contrast',
 'Contingency',
 'Contingency.Cause',
 'Contingency.Cause.Reason',
 'Contingency.Cause.Result',
 'Contingency.Condition.Hypothetical',
 'Contingency.Pragmatic cause.Justification',
 'Contingency.Pragmatic condition.Relevance',
 'Expansion',
 'Expansion.Alternative',
 'Expansion.Alternative.Chosen alternative',
 'Expansion.Alternative.Conjunctive',
 'Expansion.Conjunction',
 'Expansion.Exception',
 'Expansion.Instantiation',
 'Expansion.List',
 'Expansion.Restatement',
 'Expansion.Restatement.Equivalence',
 'Expansion.Restatement.Generalization',
 'Expansion.Restatement.Specification',
 'Temporal',
 'Temporal.Asynchronous.Precedence',
 'Temporal.Asynchronous.Succession',
 'Temporal.Synchrony']
sense_pdtb3 = ['Comparison.Concession+SpeechAct.Arg2-as-denier+SpeechAct',
 'Comparison.Concession.Arg1-as-denier',
 'Comparison.Concession.Arg2-as-denier',
 'Comparison.Contrast',
 'Comparison.Similarity',
 'Contingency.Cause+Belief.Reason+Belief',
 'Contingency.Cause+Belief.Result+Belief',
 'Contingency.Cause+SpeechAct.Reason+SpeechAct',
 'Contingency.Cause+SpeechAct.Result+SpeechAct',
 'Contingency.Cause.Reason',
 'Contingency.Cause.Result',
 'Contingency.Condition+SpeechAct',
 'Contingency.Condition.Arg1-as-cond',
 'Contingency.Condition.Arg2-as-cond',
 'Contingency.Purpose.Arg1-as-goal',
 'Contingency.Purpose.Arg2-as-goal',
 'Expansion.Conjunction',
 'Expansion.Disjunction',
 'Expansion.Equivalence',
 'Expansion.Exception.Arg1-as-excpt',
 'Expansion.Exception.Arg2-as-excpt',
 'Expansion.Instantiation.Arg1-as-instance',
 'Expansion.Instantiation.Arg2-as-instance',
 'Expansion.Level-of-detail.Arg1-as-detail',
 'Expansion.Level-of-detail.Arg2-as-detail',
 'Expansion.Manner.Arg1-as-manner',
 'Expansion.Manner.Arg2-as-manner',
 'Expansion.Substitution.Arg2-as-subst',
 'Temporal.Asynchronous.Precedence',
 'Temporal.Asynchronous.Succession',
 'Temporal.Synchronous']
sense_conll = ['Comparison',
 'Comparison.Concession',
 'Comparison.Contrast',
 'Contingency',
 'Contingency.Cause',
 'Contingency.Cause.Reason',
 'Contingency.Cause.Result',
 'Contingency.Condition',
 'Expansion',
 'Expansion.Alternative',
 'Expansion.Alternative.Chosen alternative',
 'Expansion.Conjunction',
 'Expansion.Exception',
 'Expansion.Instantiation',
 'Expansion.Restatement',
 'Temporal',
 'Temporal.Asynchronous.Precedence',
 'Temporal.Asynchronous.Succession',
 'Temporal.Synchrony']

# Corpus name -> sense inventory; iterated by the label-map cells below.
sense_dic = {
    'pdtb2': sense_pdtb2,
    'pdtb3': sense_pdtb3,
    'conll': sense_conll,
}

In [None]:
# Bare expressions: only the last one (sense_conll) is displayed by the cell.
sense_pdtb2
sense_pdtb3
sense_conll

## level 1

In [None]:
import json

# Level-1 classes in alphabetical order; a class id is its position here.
order = sorted('Temporal Comparison Contingency Expansion'.split())


def sense_to_id(sense):
    """Return the level-1 class id of a dotted sense string.

    The id is the index in ``order`` of the text before the first dot.
    ``ValueError`` is raised when that text is not one of the four classes.
    """
    top_level, _, _ = sense.partition('.')
    return order.index(top_level)

# For every corpus, map each sense string to the id of its level-1 class.
label_map = {
    data_name: {sense: sense_to_id(sense) for sense in sense_list}
    for data_name, sense_list in sense_dic.items()
}
# Write the mapping as indented JSON.
json_path = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\label_map\level1.json'
with open(json_path, 'w', encoding='utf8') as f:
    json.dump(label_map, f, indent=2)

## level 2

In [None]:
import json


# For every corpus, map each sense string to its position in that corpus'
# sense list.
# NOTE(review): ids are assigned per full sense string, so senses sharing
# the same first two levels get different ids — confirm this is what the
# "level2" map is meant to contain.
label_map = {
    data_name: {sense: p for p, sense in enumerate(sense_list)}
    for data_name, sense_list in sense_dic.items()
}
# Write the mapping as indented JSON.
json_path = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\label_map\level2.json'
with open(json_path, 'w', encoding='utf8') as f:
    json.dump(label_map, f, indent=2)

In [None]:
# Maps a level-2 sense name to its level-1 class; 'None' maps to itself.
sd = {
    # Comparison
    'Concession': 'Comparison',
    'Contrast': 'Comparison',
    # Contingency
    'Cause': 'Contingency',
    'Cause+Belief': 'Contingency',
    'Condition': 'Contingency',
    'Purpose': 'Contingency',
    # Expansion
    'Conjunction': 'Expansion',
    'Equivalence': 'Expansion',
    'Instantiation': 'Expansion',
    'Level-of-detail': 'Expansion',
    'Manner': 'Expansion',
    'Substitution': 'Expansion',
    # Temporal
    'Asynchronous': 'Temporal',
    'Synchronous': 'Temporal',
    # Placeholder entry
    'None': 'None',
}
len(sd)