# PDTB 2

In [38]:
import numpy as np
import pandas as pd

# Columns read from the PDTB 2 CSV export; everything else is skipped.
usecols = [
    'Relation',
    'Section',
    'Arg1_RawText',
    'Arg2_RawText',
    'Conn1',
    'Conn2',
    'ConnHeadSemClass1',
    'ConnHeadSemClass2',
    'Conn2SemClass1',
    'Conn2SemClass2',
]
df = pd.read_csv(
    r'D:\0--data\projects\04.01-IDRR数据\IDRR-base\CorpusData\PDTB2\pdtb2.csv',
    usecols=usecols,
    low_memory=False,
)

# Shorter column names used by the rest of the notebook.
rename_map = {
    'Arg1_RawText': 'Arg1',
    'Arg2_RawText': 'Arg2',
    'ConnHeadSemClass1': 'Conn1Sem1',
    'ConnHeadSemClass2': 'Conn1Sem2',
    'Conn2SemClass1': 'Conn2Sem1',
    'Conn2SemClass2': 'Conn2Sem2',
}
df = df.rename(columns=rename_map)

# Snapshot of the renamed frame taken before any row filtering.
init_df = df.copy()

# From here on only implicit relations are kept.
df = df.loc[df['Relation'].eq('Implicit')]

# Section-based split: 0-1 -> dev, 21-22 -> test,
# everything except sections 0, 1 and 21-24 -> train.
train_df = df.loc[~df['Section'].isin([0, 1, 21, 22, 23, 24])]
dev_df = df.loc[df['Section'].isin([0, 1])]
test_df = df.loc[df['Section'].isin([21, 22])]



### 一级多标签统计

In [45]:
def sort_dict(dct):
    """Return a new dict with the items of ``dct`` ordered by value, largest first.

    Items whose values compare equal keep their original relative order
    (the sort is stable, also with ``reverse=True``).

    The ordering is obtained with ``reverse=True`` instead of negating the
    values, so it works for any mutually comparable values -- not only for
    ones that support unary minus -- and cannot be thrown off by types whose
    negation wraps around (e.g. unsigned integers).

    Args:
        dct: mapping whose values can be compared with each other.

    Returns:
        dict: a new dictionary; ``dct`` itself is not modified.
    """
    return dict(sorted(dct.items(), key=lambda kv: kv[1], reverse=True))
    
class analyse_level1_label:
    """Count and print how level-1 sense labels co-occur on the same relation.

    The class is used like a function: ``analyse_level1_label(df)`` runs the
    whole analysis inside ``__new__``, prints two JSON reports and evaluates
    to ``None`` -- no instance is ever created.
    """

    # The level-1 labels that are counted; also the row/column order of the
    # reports when no sorting is applied.
    labels = 'Temporal Comparison Contingency Expansion'.split()

    @classmethod
    def str_to_label(cls, string):
        """Return the part of ``string`` before its first ``'.'``.

        A string without a dot is returned unchanged.
        """
        return string.split('.')[0]

    @classmethod
    def get_item_labels(cls, item):
        """Extract the level-1 labels of one row.

        Args:
            item: any object exposing the attributes ``Conn1Sem1``,
                ``Conn1Sem2``, ``Conn2Sem1`` and ``Conn2Sem2`` (a DataFrame
                row as a Series or as an ``itertuples`` named tuple).

        Returns:
            tuple: ``(primary_label, secondary_labels)`` where the primary
            label comes from ``Conn1Sem1`` and the list holds the labels of
            the other three fields that are not missing, in that order.

        Note:
            ``Conn1Sem1`` is converted unconditionally, so it must be a
            string; a missing value there fails on ``.split``.
        """
        primary_label = cls.str_to_label(item.Conn1Sem1)
        secondary_labels = [
            cls.str_to_label(sense)
            for sense in (item.Conn1Sem2, item.Conn2Sem1, item.Conn2Sem2)
            if not pd.isna(sense)
        ]
        return primary_label, secondary_labels

    def __new__(cls, df, sort_res=True):
        """Run the analysis on ``df`` and print the two reports.

        For every row the primary label selects a bucket. A row without any
        secondary label increments that bucket's ``''`` counter; otherwise
        every secondary label increments its own counter in the bucket.

        Args:
            df: DataFrame with the columns ``Conn1Sem1``, ``Conn1Sem2``,
                ``Conn2Sem1`` and ``Conn2Sem2``.
            sort_res: when true, each bucket's counters are reordered with
                the module-level ``sort_dict`` helper before printing.

        Returns:
            None. The results are only printed.

        Raises:
            KeyError: if a row yields a label that is not in ``labels``.
        """
        import json

        rec = {l: {ll: 0 for ll in [''] + cls.labels} for l in cls.labels}
        # itertuples yields lightweight named tuples; this avoids building a
        # full Series per row as positional ``df.iloc[p]`` lookups would.
        for item in df.itertuples(index=False):
            primary_label, secondary_labels = cls.get_item_labels(item)
            if not secondary_labels:
                rec[primary_label][''] += 1
            else:
                for l in secondary_labels:
                    rec[primary_label][l] += 1

        if sort_res:
            for k in rec:
                rec[k] = sort_dict(rec[k])

        rec_string = json.dumps(rec, ensure_ascii=False, indent=2)
        print('num of level1 labels')
        print(rec_string)

        # For each primary label, list the other labels ordered by how often
        # they appear as a secondary label, least frequent first (ties keep
        # the order of ``labels``).
        rank = {}
        for k in rec:
            vals = sorted(cls.labels, key=lambda x: rec[k][x])
            rank[k] = [v for v in vals if k != v]
        rank_string = json.dumps(rank, ensure_ascii=False, indent=2)
        print('\nrank of level1 labels')
        print(rank_string)
        return None
    

# Print the level-1 label co-occurrence counts and ranking for the training split.
analyse_level1_label(train_df)    
    

num of level1 labels
{
  "Temporal": {
    "": 579,
    "Expansion": 54,
    "Contingency": 19,
    "Comparison": 13,
    "Temporal": 0
  },
  "Comparison": {
    "": 1840,
    "Expansion": 31,
    "Temporal": 20,
    "Comparison": 2,
    "Contingency": 1
  },
  "Contingency": {
    "": 3133,
    "Expansion": 128,
    "Temporal": 12,
    "Contingency": 6,
    "Comparison": 3
  },
  "Expansion": {
    "": 6652,
    "Temporal": 63,
    "Contingency": 39,
    "Comparison": 32,
    "Expansion": 6
  }
}

rank of level1 labels
{
  "Temporal": [
    "Comparison",
    "Contingency",
    "Expansion"
  ],
  "Comparison": [
    "Contingency",
    "Temporal",
    "Expansion"
  ],
  "Contingency": [
    "Comparison",
    "Temporal",
    "Expansion"
  ],
  "Expansion": [
    "Comparison",
    "Contingency",
    "Temporal"
  ]
}
