# PDTB 2.0

In [2]:
import numpy as np
import pandas as pd

usecols = [
    'Relation', 'Section', 
    'Arg1_RawText', 'Arg2_RawText', 
    'Conn1', 'Conn2',
    'ConnHeadSemClass1', 'ConnHeadSemClass2',
    'Conn2SemClass1', 'Conn2SemClass2'
]
df = pd.read_csv(
    r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB2\pdtb2.csv',
    low_memory=False,
    usecols=usecols,
)
# print(data.columns)

# for k, v in data.iloc[0].to_dict().items():
#     print(k, v)

rename_map = {
    'Arg1_RawText': 'Arg1',
    'Arg2_RawText': 'Arg2',
    'ConnHeadSemClass1': 'Conn1Sem1',
    'ConnHeadSemClass2': 'Conn1Sem2',
    'Conn2SemClass1': 'Conn2Sem1',
    'Conn2SemClass2': 'Conn2Sem2',
}
df.rename(columns=rename_map, inplace=True)
init_df = df.copy()
# # print(df.columns)

df = df[df['Relation']=='Implicit']

train_df = df[~df['Section'].isin([0, 1, 21, 22, 23, 24])]
dev_df = df[df['Section'].isin([0, 1])]
test_df = df[df['Section'].isin([21, 22])]



In [4]:
labels = df['Conn1Sem1'].unique().tolist()
sorted(labels)
# labels

['Comparison',
 'Comparison.Concession',
 'Comparison.Concession.Contra-expectation',
 'Comparison.Concession.Expectation',
 'Comparison.Contrast',
 'Comparison.Contrast.Juxtaposition',
 'Comparison.Contrast.Opposition',
 'Comparison.Pragmatic concession',
 'Comparison.Pragmatic contrast',
 'Contingency',
 'Contingency.Cause',
 'Contingency.Cause.Reason',
 'Contingency.Cause.Result',
 'Contingency.Condition.Hypothetical',
 'Contingency.Pragmatic cause.Justification',
 'Contingency.Pragmatic condition.Relevance',
 'Expansion',
 'Expansion.Alternative',
 'Expansion.Alternative.Chosen alternative',
 'Expansion.Alternative.Conjunctive',
 'Expansion.Conjunction',
 'Expansion.Exception',
 'Expansion.Instantiation',
 'Expansion.List',
 'Expansion.Restatement',
 'Expansion.Restatement.Equivalence',
 'Expansion.Restatement.Generalization',
 'Expansion.Restatement.Specification',
 'Temporal',
 'Temporal.Asynchronous.Precedence',
 'Temporal.Asynchronous.Succession',
 'Temporal.Synchrony']

### 一级多标签统计

In [45]:
def sort_dict(dct):
    return dict(sorted(dct.items(), key=lambda x:-x[1]))
    
class analyse_level1_label:
    labels = 'Temporal Comparison Contingency Expansion'.split()
    
    @classmethod
    def str_to_label(cls, string):
        return string.split('.')[0]
    
    @classmethod
    def get_item_labels(cls, item):
        primary_label = cls.str_to_label(item.Conn1Sem1)
        secondary_labels = []
        for s in [item.Conn1Sem2, item.Conn2Sem1, item.Conn2Sem2]:
            if not pd.isna(s):
                secondary_labels.append(cls.str_to_label(s))
        return primary_label, secondary_labels
        
    def __new__(cls, df, sort_res=True):
        rec = {l:{ll:0 for ll in ['']+cls.labels}for l in cls.labels}
        for p in range(df.shape[0]):
            item = df.iloc[p]
            primary_label, secondary_labels = cls.get_item_labels(item)
            if not secondary_labels:
                rec[primary_label][''] += 1
            else:
                for l in secondary_labels:
                    rec[primary_label][l] += 1

        if sort_res:
            for k in rec:
                rec[k] = sort_dict(rec[k])
                
        import json
        rec_string = json.dumps(rec, ensure_ascii=False, indent=2)
        print('num of level1 labels')
        print(rec_string)
        
        rank = {}
        for k in rec:
            vals = sorted(cls.labels, key=lambda x:rec[k][x])
            rank[k] = [v for v in vals if k != v]
        rank_string = json.dumps(rank, ensure_ascii=False, indent=2)
        print('\nrank of level1 labels')
        print(rank_string)
        pass
    

analyse_level1_label(train_df)    
    

num of level1 labels
{
  "Temporal": {
    "": 579,
    "Expansion": 54,
    "Contingency": 19,
    "Comparison": 13,
    "Temporal": 0
  },
  "Comparison": {
    "": 1840,
    "Expansion": 31,
    "Temporal": 20,
    "Comparison": 2,
    "Contingency": 1
  },
  "Contingency": {
    "": 3133,
    "Expansion": 128,
    "Temporal": 12,
    "Contingency": 6,
    "Comparison": 3
  },
  "Expansion": {
    "": 6652,
    "Temporal": 63,
    "Contingency": 39,
    "Comparison": 32,
    "Expansion": 6
  }
}

rank of level1 labels
{
  "Temporal": [
    "Comparison",
    "Contingency",
    "Expansion"
  ],
  "Comparison": [
    "Contingency",
    "Temporal",
    "Expansion"
  ],
  "Contingency": [
    "Comparison",
    "Temporal",
    "Expansion"
  ],
  "Expansion": [
    "Comparison",
    "Contingency",
    "Temporal"
  ]
}


# PDTB 3.0

In [2]:
import pandas as pd

df = pd.read_csv(r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB3\processed\train.tsv', sep='\t')
df

ParserError: Error tokenizing data. C error: Expected 10 fields in line 9309, saw 11


# CoNLL

In [13]:
import json

train_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\2015-2016_conll_shared_task\data\conll16st-en-03-29-16-train\relations.json'
dev_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\2015-2016_conll_shared_task\data\conll16st-en-03-29-16-dev\relations.json'
test_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\2015-2016_conll_shared_task\data\conll16st-en-03-29-16-test\relations.json'
blind_test_file = r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\2015-2016_conll_shared_task\data\conll16st-en-03-29-16-blind-test\relations.json'

def get_dicts(file):
    # dicts = []
    with open(file, 'r', encoding='utf8')as f:
        dicts = [json.loads(line)for line in f.readlines()]
    return dicts

train_dicts = get_dicts(train_file)
train_dicts = [p for p in train_dicts if p['Type'] == 'Implicit']
sample = train_dicts[0]
sample

{'Arg1': {'CharacterSpanList': [[9, 240]],
  'RawText': 'In an Oct. 19 review of "The Misanthrope" at Chicago\'s Goodman Theatre ("Revitalized Classics Take the Stage in Windy City," Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag',
  'TokenList': [[9, 11, 0, 0, 0],
   [12, 14, 1, 0, 1],
   [15, 19, 2, 0, 2],
   [20, 22, 3, 0, 3],
   [23, 29, 4, 0, 4],
   [30, 32, 5, 0, 5],
   [33, 34, 6, 0, 6],
   [34, 37, 7, 0, 7],
   [38, 49, 8, 0, 8],
   [49, 50, 9, 0, 9],
   [51, 53, 10, 0, 10],
   [54, 61, 11, 0, 11],
   [61, 63, 12, 0, 12],
   [64, 71, 13, 0, 13],
   [72, 79, 14, 0, 14],
   [80, 81, 15, 0, 15],
   [81, 82, 16, 0, 16],
   [82, 93, 17, 0, 17],
   [94, 102, 18, 0, 18],
   [103, 107, 19, 0, 19],
   [108, 111, 20, 0, 20],
   [112, 117, 21, 0, 21],
   [118, 120, 22, 0, 22],
   [121, 126, 23, 0, 23],
   [127, 131, 24, 0, 24],
   [131, 132, 25, 0, 25],
   [132, 133, 26, 0, 26],
   [134, 141, 27, 0, 27],
   [142, 143, 28, 0, 28],

In [17]:
from collections import Counter

total_senses = [q for p in train_dicts for q in p['Sense']]
sorted(set(total_senses))

# senses_cnt = Counter('&'.join(p['Sense'])for p in train_dicts)
# senses_cnt = dict(sorted(senses_cnt.items()))
# senses_cnt

# Counter(len(p['Sense'])for p in train_dicts)

# Counter(p['Type']for p in train_dicts)

['Comparison',
 'Comparison.Concession',
 'Comparison.Contrast',
 'Contingency',
 'Contingency.Cause',
 'Contingency.Cause.Reason',
 'Contingency.Cause.Result',
 'Contingency.Condition',
 'Expansion',
 'Expansion.Alternative',
 'Expansion.Alternative.Chosen alternative',
 'Expansion.Conjunction',
 'Expansion.Exception',
 'Expansion.Instantiation',
 'Expansion.Restatement',
 'Temporal',
 'Temporal.Asynchronous.Precedence',
 'Temporal.Asynchronous.Succession',
 'Temporal.Synchrony']