# Sanity check and anomaly detection

In [1]:
from google.colab import drive
drive.mount('/gdrive')

%cd /gdrive/MyDrive/cp-EDA-Anomaly

ModuleNotFoundError: No module named 'google'

In [2]:
import pandas as pd
import re
import numpy as np
entity = pd.read_csv('entity.csv')
nameentity_name = pd.read_csv('nameentity_name.csv')
partnership = pd.read_csv('PARTNERSHIP.csv')

In [3]:
data = pd.read_json('ner_curation_data.json')
content_dic = {}
for i in range(data.shape[0]):
    key = str(int(data['document_id'][i][:6])) + '/' +str(data['sentence_id'][i])
    content_dic[key] = data['sentence_text'][i]


## 1. Circular technology check

In [None]:
document_id = []
sentence_id = []
error_message = []
error_entity = ['PACT','CHEMICAL','FAC','PRODUCT','GOVERNMENT']
for i in range(len(entity['document_id'])):
    if entity['named_entity'][i] in error_entity and not pd.isna(entity['circular_tech'][i]):
        document_id.append(entity['document_id'][i])
        sentence_id.append(entity['sentence_id'][i])
        error_message.append('This is not possible circular technology for '+entity['name'][i]+'.')

In [None]:
cir_error = pd.DataFrame({'document_id':document_id, 'sentence_id':sentence_id,'error_message':error_message})
cir_error.head()

Unnamed: 0,document_id,sentence_id,error_message
0,220,4,This is not possible circular technology for ChemCycling.
1,330,4,This is not possible circular technology for NREL.
2,380,5,This is not possible circular technology for Makkah Municipality.
3,380,5,This is not possible circular technology for Makkah Municipality.
4,555,6,This is not possible circular technology for US Army Research Laboratory.


## 2. Entity check

### 2.1 Regex check

In [None]:
document_id = []
sentence_id = []
error_message = []

In [None]:
# Keywords for named entity
regex_academia = re.compile(r"institute|university", re.IGNORECASE) 
regex_association = re.compile(r"alliance|association", re.IGNORECASE)
regex_facility = re.compile(r"plant$|refinery$|facility$", re.IGNORECASE)
regex_govt = re.compile(r"authority|commission|government|ministry|parliament|federal|federation", re.IGNORECASE)
regex_pact = re.compile(r"commitment$|program$|pact$", re.IGNORECASE)
regex_VPP = re.compile(r"VPP")
regex_ATEPW = re.compile(r"Alliance To End Plastic Waste",re.IGNORECASE)

In [None]:
def find_errors():
    for i, name in enumerate(entity['name']):
        if regex_academia.search(name) != None:
            if entity.iloc[i]['named_entity'] != 'ACADEMIA':
                document_id.append(entity.iloc[i]['document_id'])
                sentence_id.append(entity.iloc[i]['sentence_id'])
                error_message.append(name + ' was ' + entity.iloc[i]['named_entity'] + ' , should be ACADEMIA')
        if regex_association.search(name) != None:
            if entity.iloc[i]['named_entity'] != 'ASSOCIATION':
                document_id.append(entity.iloc[i]['document_id'])
                sentence_id.append(entity.iloc[i]['sentence_id'])
                error_message.append(name + ' was ' + entity.iloc[i]['named_entity'] + ' , should be ASSOCIATION')
        if regex_facility.search(name) != None:
            if entity.iloc[i]['named_entity'] != 'FAC':
                document_id.append(entity.iloc[i]['document_id'])
                sentence_id.append(entity.iloc[i]['sentence_id'])
                error_message.append(name + ' was ' + entity.iloc[i]['named_entity'] + ' , should be FAC')
        if regex_govt.search(name) != None:
            if entity.iloc[i]['named_entity'] != 'GOVERNMENT':
                document_id.append(entity.iloc[i]['document_id'])
                sentence_id.append(entity.iloc[i]['sentence_id'])
                error_message.append(name + ' was ' + entity.iloc[i]['named_entity'] + ' , should be GOVERNMENT')
        if regex_pact.search(name) != None:
            if entity.iloc[i]['named_entity'] != 'PACT':
                document_id.append(entity.iloc[i]['document_id'])
                sentence_id.append(entity.iloc[i]['sentence_id'])
                error_message.append(name + ' was ' + entity.iloc[i]['named_entity'] + ' , should be PACT')
        if regex_VPP.search(name) != None:
            if entity.iloc[i]['named_entity'] != 'COMPANY':
                document_id.append(entity.iloc[i]['document_id'])
                sentence_id.append(entity.iloc[i]['sentence_id'])
                error_message.append(name + ' was ' + entity.iloc[i]['named_entity'] + ' , should be COMPANY')
        if regex_ATEPW.search(name) != None:
            if entity.iloc[i]['named_entity'] != 'ASSOCIATION':
                document_id.append(entity.iloc[i]['document_id'])
                sentence_id.append(entity.iloc[i]['sentence_id'])
                error_message.append(name + ' was ' + entity.iloc[i]['named_entity'] + ' , should be ASSOCIATION')

In [None]:
find_errors()
pd.options.display.max_colwidth=600
df_error = pd.DataFrame({'document_id':document_id, 'sentence_id':sentence_id,'error_message':error_message})
df_error.head()

Unnamed: 0,document_id,sentence_id,error_message
0,75,1,"British Plastics Federation was ASSOCIATION , should be GOVERNMENT"
1,75,2,"British Plastics Federation was ASSOCIATION , should be GOVERNMENT"
2,114,4,"Institute for Supply Management was NGO , should be ACADEMIA"
3,136,4,"Alliance Engineering was COMPANY , should be ASSOCIATION"
4,186,2,"European Resilient Flooring Manufacturers ' Institute was ASSOCIATION , should be ACADEMIA"


### 2.2 Frequency check

In [None]:
new_name = pd.DataFrame(nameentity_name[nameentity_name['std']<0.333])
new_name = new_name.reset_index()
typelist = ['ACADEMIA', 'ASSOCIATION', 'CHEMICAL', 'COMPANY','FAC', 'GOVERNMENT','NGO', 'PACT', 'PRODUCT']
new_name['Most_frequent'] = new_name[typelist].idxmax(axis = 1)
new_name.head()

Unnamed: 0,index,name,ACADEMIA,ASSOCIATION,CHEMICAL,COMPANY,FAC,GOVERNMENT,NGO,PACT,PRODUCT,std,Most_frequent
0,106,The Recycling Partnership,0,1,0,0,0,0,2,0,0,0.235702,NGO
1,137,ACC,0,17,0,1,0,0,0,0,0,0.31304,ASSOCIATION
2,267,Alliance to End Plastic Waste,0,13,0,0,0,0,3,0,0,0.270232,ASSOCIATION
3,334,Argonne National Laboratory,1,0,0,0,1,0,0,0,0,0.220479,ACADEMIA
4,535,Bioindustrial Innovation Canada,0,0,0,4,0,0,1,0,0,0.266667,COMPANY


In [None]:
dic = {}
for i in range(len(entity['document_id'])):
    if entity['name'][i] not in dic:
        dic[entity['name'][i]] = []
        dic[entity['name'][i]].append(str(entity['document_id'][i])+'/'+str(entity['sentence_id'][i])+'/'+entity['named_entity'][i])
    else:
        dic[entity['name'][i]].append(str(entity['document_id'][i])+'/'+str(entity['sentence_id'][i])+'/'+entity['named_entity'][i])


In [None]:
document_id = []
sentence_id = []
error_message = []
for i in range(len(new_name['name'])):
    list_new = dic[new_name['name'][i]]
    for j in range(len(list_new)):
        info = list_new[j].split('/')
        if info[2] != new_name['Most_frequent'][i]:
            document_id.append(int(info[0]))
            sentence_id.append(int(info[1]))
            error_message.append('The most frequent named entity for '+new_name['name'][i]+' is '+new_name['Most_frequent'][i]+' but here it is '+info[2])

In [None]:
frequency_error = pd.DataFrame({'document_id':document_id, 'sentence_id':sentence_id,'error_message':error_message})
frequency_error.head()

Unnamed: 0,document_id,sentence_id,error_message
0,303,1,The most frequent named entity for The Recycling Partnership is NGO but here it is ASSOCIATION
1,445,3,The most frequent named entity for ACC is ASSOCIATION but here it is COMPANY
2,388,2,The most frequent named entity for Alliance to End Plastic Waste is ASSOCIATION but here it is NGO
3,388,4,The most frequent named entity for Alliance to End Plastic Waste is ASSOCIATION but here it is NGO
4,388,5,The most frequent named entity for Alliance to End Plastic Waste is ASSOCIATION but here it is NGO


### 2.3 Chemical check

In [None]:
regex_chemical = re.compile(r"(high( |- | -|-| - )density|low( |- | -|-| - ))density|linear( |- | -|-| - )low|ultra( |- | -|-| - )pure|carbon( |- | -|-| - )fiber|polylactic( |- | -|-| - )acid|post( |- | -|-| - )consumer|recycled( |- | -|-| - )poly",re.IGNORECASE)
document_id = []
sentence_id = []
error_message = []
for key in content_dic.keys():
  if regex_chemical.search(content_dic[key]) != None:
    document_id.append(key.split('/')[0])
    sentence_id.append(key.split('/')[1])
    error_message.append('Check chemicals in this sentence')


In [None]:
chemical_error = pd.DataFrame({'document_id':document_id, 'sentence_id':sentence_id,'error_message':error_message})
chemical_error.head()

Unnamed: 0,document_id,sentence_id,error_message
0,2,2,Check chemicals in this sentence
1,2,4,Check chemicals in this sentence
2,2,5,Check chemicals in this sentence
3,9,2,Check chemicals in this sentence
4,9,3,Check chemicals in this sentence


In [None]:
chemical_error.shape

(313, 3)

## 3. Partnership check

### 3.1 Subject check

In [None]:
academia_partnership = ['FUNDED\_RESEARCH', 'INITIATIVE', 'JOINT\_DEVELOPMENT', 'OTHER', 'PLEDGE', 'SUBSIDIARY','SUPPORT']
association_partnership = ['INITIATIVE', 'JOINT\_DEVELOPMENT', 'OTHER', 'PLEDGE', 'SUPPORT']
chemical_partnership = ['INVOLVED\_IN']
company_partnership = ['BUSINESS\_COLLABORATION', 'CORPORATE\_VENTURING', 'DISTRIBUTION\_PARTNERSHIP', 'FUNDED\_RESEARCH', 'INITIATIVE', 'JOINT\_DEVELOPMENT', 'JOINT\_VENTURE', 'MERGERS\_AND\_ACQUISITIONS', 'OTHER', 'PLEDGE', 'SUBSIDIARY', 'SUBSIDIARY\_JOINT\_VENTURE', 'SUPPORT']
facility_partnership = ['LAUNCHED\_BY']
government_partnership = ['FUNDED\_RESEARCH', 'INITIATIVE', 'JOINT\_DEVELOPMENT', 'OTHER', 'SUPPORT']
ngo_partnership = ['INITIATIVE', 'JOINT\_DEVELOPMENT', 'OTHER', 'PLEDGE', 'SUPPORT']
product_partnership = ['LAUNCHED\_BY']

In [None]:
def checkacademia(row):
    if not row['partnership'] in academia_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkassociation(row):
    if not row['partnership'] in association_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkchemical(row):
    if not row['partnership'] in chemical_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkcompany(row):
    if not row['partnership'] in company_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkfacility(row):
    if not row['partnership'] in facility_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkgovernment(row):
    if not row['partnership'] in government_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkngo(row):
    if not row['partnership'] in ngo_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkproduct(row):
    if not row['partnership'] in product_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)

In [None]:
def check_subject_partnership(data):
    partnership_error = pd.DataFrame(columns = ['document_id','sentence_id', 'error_message'], index = range(len(data['document_id'])))
    for i in range(len(data['document_id'])):
        if data.iloc[i]['subject_named_entity'] == 'ACADEMIA':
            partnership_error.iloc[i]['document_id'] = checkacademia(data.iloc[i])[0]
            partnership_error.iloc[i]['sentence_id'] = checkacademia(data.iloc[i])[1]
            partnership_error.iloc[i]['error_message'] = checkacademia(data.iloc[i])[2]
        if data.iloc[i]['subject_named_entity'] == 'ASSOCIATION':
            partnership_error.iloc[i]['document_id'] = checkassociation(data.iloc[i])[0]
            partnership_error.iloc[i]['sentence_id'] = checkassociation(data.iloc[i])[1]
            partnership_error.iloc[i]['error_message'] = checkassociation(data.iloc[i])[2]
        if data.iloc[i]['subject_named_entity'] == 'CHEMICAL':
            partnership_error.iloc[i]['document_id'] = checkchemical(data.iloc[i])[0]
            partnership_error.iloc[i]['sentence_id'] = checkchemical(data.iloc[i])[1]
            partnership_error.iloc[i]['error_message'] = checkchemical(data.iloc[i])[2]
        if data.iloc[i]['subject_named_entity'] == 'FACILITY':
            partnership_error.iloc[i]['document_id'] = checkfacility(data.iloc[i])[0]
            partnership_error.iloc[i]['sentence_id'] = checkfacility(data.iloc[i])[1]
            partnership_error.iloc[i]['error_message'] = checkfacility(data.iloc[i])[2]
        if data.iloc[i]['subject_named_entity'] == 'GOVERNMENT':
            partnership_error.iloc[i]['document_id'] = checkgovernment(data.iloc[i])[0]
            partnership_error.iloc[i]['sentence_id'] = checkgovernment(data.iloc[i])[1]
            partnership_error.iloc[i]['error_message'] = checkgovernment(data.iloc[i])[2]
        if data.iloc[i]['subject_named_entity'] == 'NGO':
            partnership_error.iloc[i]['document_id'] = checkngo(data.iloc[i])[0]
            partnership_error.iloc[i]['sentence_id'] = checkngo(data.iloc[i])[1]
            partnership_error.iloc[i]['error_message'] = checkngo(data.iloc[i])[2]
        if data.iloc[i]['subject_named_entity'] == 'PRODUCT':
            partnership_error.iloc[i]['document_id'] = checkproduct(data.iloc[i])[0]
            partnership_error.iloc[i]['sentence_id'] = checkproduct(data.iloc[i])[1]
            partnership_error.iloc[i]['error_message'] = checkproduct(data.iloc[i])[2]
        if  data.iloc[i]['subject_named_entity'] == 'PACT':
            partnership_error.iloc[i]['document_id'] = data.iloc[i]['document_id']
            partnership_error.iloc[i]['sentence_id'] = data.iloc[i]['sentence_id']
            partnership_error.iloc[i]['error_message'] = 'This is not possible partnership from ' + data.iloc[i]['subject_name']+' to '+ data.iloc[i]['object_name']+'.'
    return partnership_error

In [None]:
subject_error = check_subject_partnership(partnership)
subject_error = subject_error[subject_error['error_message'] != 'NULL'].dropna()
subject_error.head()

Unnamed: 0,document_id,sentence_id,error_message
75,7,72,This is not possible partnership from HVO to Repsol.
164,25,75,This is not possible partnership from Arizona Innovation Challenge to Renewlogy.
429,142,4,This is not possible partnership from Saudi G20 Presidency to Circular Carbon Economy.
873,186,5,This is not possible partnership from Horizon 2020 to Circular Flooring.
1048,207,1,This is not possible partnership from rPET to Berry M&H.


In [None]:
subject_error.shape

(36, 3)

### 3.2 Object check

In [None]:
academia_object_partnership = ['FUNDED\_RESEARCH', 'INVOLVED\_IN', 'JOINT\_DEVELOPMENT', 'LAUNCHED\_BY', 'OTHER', 'PLEDGE', 'SUBSIDIARY','SUPPORT']
association_object_partnership = ['INITIATIVE', 'JOINT\_DEVELOPMENT', 'OTHER', 'PLEDGE', 'SUPPORT','INVOLVED\_IN','LAUNCHED\_BY']
chemical_object_partnership = []
company_object_partnership = ['BUSINESS\_COLLABORATION', 'CORPORATE\_VENTURING', 'DISTRIBUTION\_PARTNERSHIP', 'FUNDED\_RESEARCH', 'JOINT\_DEVELOPMENT', 'JOINT\_VENTURE', 'MERGERS\_AND\_ACQUISITIONS', 'OTHER',  'SUBSIDIARY', 'SUBSIDIARY\_JOINT\_VENTURE', 'SUPPORT', 'INVOLVED\_IN', 'LAUNCHED\_BY']
facility_object_partnership = []
government_object_partnership = ['JOINT\_DEVELOPMENT', 'LAUNCHED\_BY', 'OTHER', 'SUPPORT']
ngo_object_partnership = ['FUNDED\_RESEARCH', 'INITIATIVE', 'JOINT\_DEVELOPMENT', 'LAUNCHED\_BY', 'OTHER', 'PLEDGE', 'SUPPORT']
pact_object_partnership = ['INITIATIVE', 'PLEDGE', 'SUPPORT']
product_object_partnership = []

In [None]:
def checkacademia(row):
    if not row['partnership'] in academia_object_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkassociation(row):
    if not row['partnership'] in association_object_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkchemical(row):
    if not row['partnership'] in chemical_object_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkcompany(row):
    if not row['partnership'] in company_object_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkfacility(row):
    if not row['partnership'] in facility_object_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkgovernment(row):
    if not row['partnership'] in government_object_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkngo(row):
    if not row['partnership'] in ngo_object_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkpact(row):
    if not row['partnership'] in pact_object_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)
def checkproduct(row):
    if not row['partnership'] in product_partnership:
        a = [row['document_id'],row['sentence_id'],'This is not possible partnership from '+row['subject_name']+' to '+ row['object_name']+'.']
    else:
        a = [row['document_id'],row['sentence_id'],'NULL']
    return(a)

In [None]:
def check_object_partnership(data):
    object_partnership_error = pd.DataFrame(columns = ['document_id','sentence_id', 'error_message'], index = range(len(data['document_id'])))
    for i in range(len(data['document_id'])):
        if data.iloc[i]['object_named_entity'] == 'ACADEMIA':
            object_partnership_error.iloc[i]['document_id'] = checkacademia(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkacademia(data.iloc[i])[1]
            object_partnership_error.iloc[i]['error_message'] = checkacademia(data.iloc[i])[2]
        if data.iloc[i]['object_named_entity'] == 'ASSOCIATION':
            object_partnership_error.iloc[i]['document_id'] = checkassociation(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkassociation(data.iloc[i])[1]
            object_partnership_error.iloc[i]['error_message'] = checkassociation(data.iloc[i])[2]
        if data.iloc[i]['object_named_entity'] == 'CHEMICAL':
            object_partnership_error.iloc[i]['document_id'] = checkchemical(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkchemical(data.iloc[i])[1]
            object_partnership_error.iloc[i]['error_message'] = checkchemical(data.iloc[i])[2]
        if data.iloc[i]['object_named_entity'] == 'COMPANY':
            object_partnership_error.iloc[i]['document_id'] = checkcompany(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkcompany(data.iloc[i])[1]
            object_partnership_error.iloc[i]['error_message'] = checkcompany(data.iloc[i])[2]
        if data.iloc[i]['object_named_entity'] == 'FACILITY':
            object_partnership_error.iloc[i]['document_id'] = checkfacility(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkfacility(data.iloc[i])[1]
            object_partnership_error.iloc[i]['object_error_message'] = checkfacility(data.iloc[i])[2]
        if data.iloc[i]['object_named_entity'] == 'GOVERNMENT':
            object_partnership_error.iloc[i]['document_id'] = checkgovernment(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkgovernment(data.iloc[i])[1]
            object_partnership_error.iloc[i]['error_message'] = checkgovernment(data.iloc[i])[2]
        if data.iloc[i]['object_named_entity'] == 'NGO':
            object_partnership_error.iloc[i]['document_id'] = checkngo(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkngo(data.iloc[i])[1]
            object_partnership_error.iloc[i]['error_message'] = checkngo(data.iloc[i])[2]
        if data.iloc[i]['object_named_entity'] == 'PRODUCT':
            object_partnership_error.iloc[i]['document_id'] = checkproduct(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkproduct(data.iloc[i])[1]
            object_partnership_error.iloc[i]['error_message'] = checkproduct(data.iloc[i])[2]
        if  data.iloc[i]['object_named_entity'] == 'PACT':
            object_partnership_error.iloc[i]['document_id'] = checkpact(data.iloc[i])[0]
            object_partnership_error.iloc[i]['sentence_id'] = checkpact(data.iloc[i])[1]
            object_partnership_error.iloc[i]['error_message'] = checkpact(data.iloc[i])[2]
    return object_partnership_error

In [None]:
object_error = check_object_partnership(partnership)
object_error = object_error[object_error['error_message'] != 'NULL'].dropna()
object_error.head()

Unnamed: 0,document_id,sentence_id,error_message
36,7,18,This is not possible partnership from Orlen Lietuva to Lithuanian Ministry of Energy.
273,89,8,This is not possible partnership from rPP to PureCycle.
300,112,6,This is not possible partnership from MMRDA to Marine Debris Partnership.
381,138,5,This is not possible partnership from Morssinkhof Group to CuRe Technology.
382,138,5,This is not possible partnership from Cumapol to CuRe Technology.


In [None]:
object_error.shape

(55, 3)

### 3.3 Special check

In [None]:
document_id = []
sentence_id = []
error_message = []
regex_funded_research = re.compile(r"fund|invest|grant",re.IGNORECASE)
for i in range(partnership.shape[0]):
  if (partnership['object_named_entity'][i] == 'ACADEMIA' and partnership['subject_named_entity'][i] == 'COMPANY') or (partnership['object_named_entity'][i] == 'COMPANY' and partnership['subject_named_entity'][i] == 'ACADEMIA'):
    loc = str(partnership['document_id'][i])+'/'+str(partnership['sentence_id'][i])
    if regex_funded_research.search(content_dic[loc]) != None:
      document_id.append(partnership['document_id'][i])
      sentence_id.append(partnership['sentence_id'][i])
      error_message.append('The partnership between '+partnership['subject_name'][i]+' and '+partnership['object_name'][i]+' should be funded research.')


In [None]:
funded_research_error = pd.DataFrame({'document_id':document_id, 'sentence_id':sentence_id,'error_message':error_message})
funded_research_error

Unnamed: 0,document_id,sentence_id,error_message
0,75,8,The partnership between Amcor and Michigan Sta...
1,179,4,The partnership between P&G. and Fraunhofer CC...
2,179,4,The partnership between P&G. and Fraunhofer UM...
3,245,7,The partnership between Georgia Tech and Novel...
4,257,5,The partnership between P&G. and Fraunhofer CC...
5,257,5,The partnership between P&G. and Fraunhofer UM...
6,257,5,The partnership between Fraunhofer CCPE and SA...
7,257,5,The partnership between Fraunhofer UMSICHT and...
8,314,4,The partnership between P&G. and Fraunhofer UM...
9,480,3,The partnership between SOCAR Turkey R&D and I...


## 4. Combination

In [None]:
error1 = pd.concat([cir_error, df_error, frequency_error, subject_error, object_error,funded_research_error,chemical_error])
error1['document_id'] = pd.to_numeric(error1['document_id'])
error1['sentence_id'] = pd.to_numeric(error1['sentence_id'])
error1 = error1.sort_values(by=['document_id'])
error1['document_id'] = error1['document_id'].apply(str)
error1['sentence_id'] = error1['sentence_id'].apply(str)
error1 = error1.reset_index()
error1.head()

Unnamed: 0,index,document_id,sentence_id,error_message
0,0,2,2,Check chemicals in this sentence
1,1,2,4,Check chemicals in this sentence
2,2,2,5,Check chemicals in this sentence
3,75,7,72,This is not possible partnership from HVO to Repsol.
4,36,7,18,This is not possible partnership from Orlen Lietuva to Lithuanian Ministry of Energy.


In [None]:
dic2 = {}
for i in range(len(error1['document_id'])):
    merge_id = error1['document_id'][i]+'/'+error1['sentence_id'][i]
    if merge_id not in dic2.keys():
        dic2[merge_id] = []
        dic2[merge_id].append(error1['error_message'][i])
    else:
        dic2[merge_id].append(error1['error_message'][i])

In [None]:
error = pd.DataFrame(dic2.items(), columns=['ds', 'error_message'])
error['document_id'] = error.apply(lambda row: row.ds.split('/')[0], axis=1)
error['sentence_id'] = error.apply(lambda row: row.ds.split('/')[1], axis=1)
error = error.drop('ds', axis = 1)
#error.to_csv('error.csv')
error.head()

Unnamed: 0,error_message,document_id,sentence_id
0,[Check chemicals in this sentence],2,2
1,[Check chemicals in this sentence],2,4
2,[Check chemicals in this sentence],2,5
3,[This is not possible partnership from HVO to Repsol.],7,72
4,[This is not possible partnership from Orlen Lietuva to Lithuanian Ministry of Energy.],7,18


In [None]:
error.groupby('document_id').count().shape

(280, 2)