In [1]:
import pandas as pd

In [2]:
# Load the dispute records collected from the providers. The file is in
# JSON Lines layout (one record per line), hence lines=True.
df = pd.read_json(
    '../data/disputes-content.jsonl.gz',
    lines=True,
)
# Preview the first rows of the frame as the cell output.
df.head()

Unnamed: 0,source,number,domains,complainants,respondents,url,status,date,text,language
0,CIETAC-ODRC,CN-2401600,[mideacapitalholdings.com],"[MIDEA GROUP CO., LTD]",[Benjamin Smith],http://odr.org.cn/superadmin/downLoadFile.acti...,转移域名,2024-03-06,1 \n \nADMINISTRATIVE PANEL DECISION \nCase No...,en
1,CIIDRC,1005-CDRP,[omerdeserres.ca],[N/A],[Unknown],https://ciidrc.org/wp-content/uploads/2022/12/...,transferred,2007-11-19,CANADIAN INTERNET REGISTRATION AUTHORITY \nDOM...,en
2,CIIDRC,1031-CDRP,"[clearlylasik.ca, clearly-lasik.ca]",[N/A],[Unknown],https://ciidrc.org/wp-content/uploads/2022/12/...,Dismissed,2008-01-11,1 \nIn the Matter of a Complaint Pursuant to ...,en
3,CIIDRC,1034-CDRP,[zantac.ca],[N/A],[Unknown],https://ciidrc.org/wp-content/uploads/2022/12/...,transferred,2008-03-11,1 \nIN THE MATTER OF A COMPLAINT PURSUANT TO ...,en
4,CIIDRC,1082-CDRP,"[staplesonlinerebate.ca, staplesonlinerebates....",[N/A],[Unknown],https://ciidrc.org/wp-content/uploads/2022/12/...,transferred,2008-07-07,"CANADIAN INTERNET REGISTRATION, AM 110RITY \nD...",en


## Overall statistics about the disputes

In [3]:
# Headline figure: how many distinct dispute numbers the collection holds.
print(f'{df["number"].nunique()} disputes collected.')
print('Per provider:')
# Distinct dispute numbers per provider, largest provider first.
(
    df.groupby('source')[['number']]
    .nunique()
    .sort_values(by='number', ascending=False)
)

116341 disputes collected.
Per provider:


Unnamed: 0_level_0,number
source,Unnamed: 1_level_1
WIPO,70112
FORUM,38314
CAC,4497
HKIAC,1709
CIETAC-ODRC,1266
IDRC,226
CIIDRC,187
AIAC,30


## Disputes with proceedings

In [4]:
# Only rows whose `text` field is present count as having proceedings.
print(f'{df.loc[df["text"].notna(), "number"].nunique()} disputes with proceedings.')
print('Per provider:')
# Same per-provider breakdown as above, limited to rows with a non-null `text`.
(
    df.loc[df['text'].notna()]
    .groupby('source')[['number']]
    .nunique()
    .sort_values(by='number', ascending=False)
)

99148 disputes with proceedings.
Per provider:


Unnamed: 0_level_0,number
source,Unnamed: 1_level_1
WIPO,56985
FORUM,34438
CAC,4497
HKIAC,1618
CIETAC-ODRC,1192
IDRC,205
CIIDRC,184
AIAC,29


## Disputes with English proceedings

In [5]:
# Count the distinct disputes whose `language` column is "en".
# The message names the English subset explicitly so this figure cannot be
# confused with the "disputes with proceedings" total printed by the previous cell.
print(f'{df[df.language == "en"].number.nunique()} disputes with English proceedings.')
print('Per provider:')
# Per-provider breakdown of the English subset, largest provider first.
df[df.language == "en"].groupby('source').agg({'number': 'nunique'}).sort_values('number', ascending = False)

90150 disputes with proceedings.
Per provider:


Unnamed: 0_level_0,number
source,Unnamed: 1_level_1
WIPO,50192
FORUM,34306
CAC,4477
HKIAC,660
CIETAC-ODRC,296
CIIDRC,162
IDRC,39
AIAC,18


## Checking the annotated documents

In [6]:
import json
import gzip
import spacy
from spacy import displacy

# Path to the annotated corpus, relative to the notebook in the same way as the
# disputes file read at the top; an absolute, machine-specific path cannot be
# resolved on another checkout.
# NOTE(review): assumes the corpus sits in the same ../data directory as
# disputes-content.jsonl.gz -- confirm against the repository layout.
ANNOTATED_CORPUS_PATH = '../data/annotated-udrp-corpus.jsonl.gz'

docs = []
# Blank English pipeline used to turn each annotated text into a Doc.
nlp = spacy.blank('en')

# Each line of the file is one JSON record with a `text` string and a `label`
# list of (start, end, label) character-offset annotations.
# The encoding is given explicitly so decoding does not depend on the platform default.
with gzip.open(ANNOTATED_CORPUS_PATH, 'rt', encoding='utf-8') as fin:
    for line in fin:
        data = json.loads(line)
        doc = nlp(data['text'])
        # char_span yields None when the offsets do not line up with token
        # boundaries. Discard those spans *before* testing for emptiness, so a
        # document whose annotations all fail to align is not kept with zero entities.
        ents = [
            span
            for span in (
                doc.char_span(start, end, label = label)
                for start, end, label in data['label']
            )
            if span
        ]
        if ents:
            doc.ents = ents
            docs.append(doc)
# Note: more documents were annotated than were used for the study.
print(f'Found {len(docs)} annotated documents')



Found 407 annotated documents


## Visualizing a random annotated document
The entities mentioned in the paper map to the following short names in the annotated documents:

```json
{
    "e1 -> Filing date": "FILING_DATE",
    "e2 -> Registrar request date": "REGISTRAR_REQ_DATE",
    "e3 -> Registrar response date": "REGISTRAR_REQ_RESP",
    "e4 -> Commencement date": "COMMENCEMENT_DATE",
    "e5 -> Panel appointment date": "APPT_DATE",
    "e6 -> Publication date": "PUB_DATE",
    "e7 -> Complainant(s)": "COMPLAINANT",
    "e8 -> Complainant(s) address(es)": "COMP_LOC",
    "e9 -> Complainants’ counsel": "REPR_ORG",
    "e10 -> Respondent(s)": "RESPONDENT",
    "e11 -> Respondent(s)’ address(es)": "RESP_LOC",
    "e12 -> Respondent default": "NO_RESPONSE",
    "e13 -> Registrar(s)": "REGISTRAR",
    "e14 -> Panelist(s)": "PANELIST",
    "e15 -> Trademark(s)": "TRADEMARK"
}
```

In [9]:
from random import choice
# Pick one annotated document at random and visualize its entity spans
# inline in the notebook with displacy.
sample_doc = choice(docs)
displacy.render(sample_doc, style='ent', jupyter=True)