In [99]:
import pandas as pd
import numpy as np
import re


In [100]:
# Load the raw journal records (one row per paper) and preview the first
# three rows. Columns used later in this notebook: 'authors', 'title',
# 'journal_name'.
journal_df = pd.read_csv('data/journals.csv')
journal_df.head(3)

Unnamed: 0,authors,title,abstract,journal_name
0,Yu-Xiao Wang; Yue Xin; Jun-Yi Yin; Xiao-Jun Hu...,Revealing the architecture and solution proper...,Macrolepiota albuminosa (Berk.) Pegler is abun...,Food Chemistry
1,Yu-Xiao Wang; Yue Xin; Xiao-Jun Huang; Jun-Yi ...,A branched galactoglucan with flexible chains ...,A homogeneous galactoglucan was purified from ...,Food Chemistry
2,Yu-Xiao Wang; Ting Zhang; Jun-Yi Yin; Xiao-Jun...,Structural characterization and rheological pr...,A homogeneous beta-glucan (JHMP-70) was obtain...,Food Hydrocolloids


## Journal and title mapping DataFrames

In [101]:
# --- Journal lookup table -------------------------------------------------
# Every distinct journal name receives an integer ID, numbered from 0 in
# order of first appearance in journal_df.
journal_names = journal_df['journal_name'].unique()
journal_ids = range(len(journal_names))
journal_mapping_df = pd.DataFrame({'Journal': journal_names, 'Journal_ID': journal_ids})

# --- Title lookup table ---------------------------------------------------
# Each row of journal_df is one title; its Title_ID is simply the row's
# position. Journal_ID is resolved by looking the row's journal name up in
# the journal table built above.
title_ids = range(len(journal_df))
title_mapping_df = pd.DataFrame({'Title': journal_df['title']})
title_mapping_df['Title_ID'] = title_ids
title_mapping_df['Journal_ID'] = journal_df['journal_name'].map(
    journal_mapping_df.set_index('Journal')['Journal_ID']
)

# Persist both lookup tables without the pandas index column.
journal_mapping_df.to_csv('data/data_model/journal_mapping.csv', index=False)
title_mapping_df.to_csv('data/data_model/title_mapping.csv', index=False)

In [102]:
# Preview the title -> (Title_ID, Journal_ID) mapping.
title_mapping_df.head()

Unnamed: 0,Title,Title_ID,Journal_ID
0,Revealing the architecture and solution proper...,0,0
1,A branched galactoglucan with flexible chains ...,1,0
2,Structural characterization and rheological pr...,2,1
3,Dispersive micro-solid-phase extraction of aca...,3,2
4,The identification of biotransformation pathwa...,4,0


In [103]:
# Build one (author, title) record per author credited on each paper.
#
# The 'authors' column holds a single string per paper with names separated
# by '; '. Authors and titles are walked in lockstep with zip() so the pairing
# never depends on journal_df's index labels (the previous positional-counter
# lookup `journal_df['title'][idx]` is only correct for a default RangeIndex).
rows = []
for author_field, title in zip(journal_df['authors'], journal_df['title']):
    # A missing authors cell is loaded as NaN (a float), which has no
    # .split(); such papers contribute no author rows.
    if not isinstance(author_field, str):
        continue

    for author in author_field.split('; '):
        author = author.strip()
        # Ignore blank names left behind by stray separators.
        if author:
            rows.append([author, title])


authors_df = pd.DataFrame(rows, columns=['author', 'title'])


In [104]:
# Preview the exploded author table: one row per (author, title) pair.
authors_df.head(20)

Unnamed: 0,author,title
0,Yu-Xiao Wang,Revealing the architecture and solution proper...
1,Yue Xin,Revealing the architecture and solution proper...
2,Jun-Yi Yin,Revealing the architecture and solution proper...
3,Xiao-Jun Huang,Revealing the architecture and solution proper...
4,Jun-Qiao Wang,Revealing the architecture and solution proper...
5,Jie-Lun Hu,Revealing the architecture and solution proper...
6,Fang Geng,Revealing the architecture and solution proper...
7,Shao-Ping Nie,Revealing the architecture and solution proper...
8,Yu-Xiao Wang,A branched galactoglucan with flexible chains ...
9,Yue Xin,A branched galactoglucan with flexible chains ...


In [105]:
# authors_df.to_csv('data/data_model/authors.csv',index=False)

In [106]:

# Count how many (author, title) rows each author has and keep only the
# authors that appear more than once.
repeating_authors = (
    authors_df
    .groupby('author')
    .size()
    .reset_index(name='count')
    .loc[lambda counts: counts['count'] > 1]
)
# Display the most frequent authors first (the stored frame stays unsorted).
repeating_authors.sort_values(by='count', ascending=False)

Unnamed: 0,author,count
14459,"Petkova, M.",89
16181,"Sanz, Y.",89
10796,"Kouba, M.",89
15222,"Ramos, F.",89
2134,"Bampidis, V.",89
...,...,...
9233,"Illathu Madhavamenon, K.",2
9235,"Imaizumi, T.",2
9285,"Isoda, H.",2
9287,"Issa-Issa, H.",2


## **Titles to NER mapping**

In [47]:
# Load the NER output: one row per title, where 'Entity' and 'Entity_Type'
# each hold a comma-separated list (split on ',' by the cells below).
ner_df = pd.read_csv('data/data_model/ner_outputs.csv')
ner_df.head(3)

Unnamed: 0,Title,Entity,Entity_Type
0,Revealing the architecture and solution proper...,"Macrolepiota albuminosa (Berk.) Pegler,polysac...","MUSHROOM ,POLYSACCHARIDE,MUSHROOM ,CARBOHYDRAT..."
1,A branched galactoglucan with flexible chains ...,"galactoglucan,polysaccharides,Macrolepiota alb...","POLYSACCHARIDE,POLYSACCHARIDE,MUSHROOM ,CHEMIC..."
2,Structural characterization and rheological pr...,"beta-glucan,JHMP-70,Hypsizygus marmoreus,ethan...","PROTEIN,MUSHROOM ,TOXIN,CHEMICAL,PROPERTY,APPL..."


## **Entity type frequency**

In [109]:
# Sanity check on the first NER row: the number of comma-separated entity
# types should equal the number of comma-separated entities.
tuple(len(ner_df.iloc[0][column].split(',')) for column in ('Entity_Type', 'Entity'))

(25, 25)

In [91]:
# Build a per-title frequency table of entity types:
#   Title | <one count column per known entity type> | total_ent_types
ent_types = pd.read_csv('data/data_model/entity_types.csv')
ent_type_list = list(ent_types['Entity Type'])

# Collect plain Python rows and build the DataFrame once at the end.
# Appending with `df.loc[len(df)] = ...` re-allocates the frame on every
# iteration, which makes the loop quadratic in the number of titles.
freq_rows = []
for title, type_field in zip(ner_df['Title'], ner_df['Entity_Type']):
    # A missing Entity_Type cell is loaded as NaN (a float) and cannot be
    # split; treat it as "no entities" so the title still gets an all-zero row.
    present = type_field.split(',') if isinstance(type_field, str) else []

    # One counter per known entity type, in ent_type_list order.
    frequency = dict.fromkeys(ent_type_list, 0)

    # Matching is exact (no whitespace normalisation); labels outside the
    # known list are ignored.
    # NOTE(review): some labels in the NER output appear to carry trailing
    # whitespace (e.g. 'MUSHROOM ') - confirm entity_types.csv uses the same
    # spelling, otherwise those labels are silently not counted.
    for entity_type in present:
        if entity_type in frequency:
            frequency[entity_type] += 1

    # Total number of recognised entity-type mentions for this title.
    total_ent_types = sum(frequency.values())

    freq_rows.append([title] + list(frequency.values()) + [total_ent_types])

entity_type_freq_df = pd.DataFrame(
    freq_rows,
    columns=['Title'] + ent_type_list + ['total_ent_types'],
)

entity_type_freq_df.head()


Unnamed: 0,Title,ORG,AGE GROUP,PLANT SPECIES,CONDITION,DRINK,LOC,BACTERIUM,CARBOHYDRATES,OBSERVATION,...,TECHNIQUE,MATERIAL,POLYSACCHARIDE,ANIMAL MODEL,MUSHROOM,APPLICATION,CHEMICAL,BIOLOGICAL PATHWAY,MONOSACCHARIDE,total_ent_types
0,Revealing the architecture and solution proper...,1,0,0,0,0,0,0,2,0,...,2,0,5,0,7,0,0,0,0,25
1,A branched galactoglucan with flexible chains ...,1,0,0,0,0,1,0,0,1,...,2,0,3,0,2,2,2,0,2,23
2,Structural characterization and rheological pr...,1,0,0,0,0,0,0,1,0,...,3,0,1,0,2,4,2,0,1,30
3,Dispersive micro-solid-phase extraction of aca...,2,0,1,0,2,0,0,0,0,...,5,2,0,0,0,0,11,0,0,38
4,The identification of biotransformation pathwa...,1,0,0,1,0,0,0,0,0,...,6,2,0,0,0,0,11,0,0,32


## **Entity Frequency**

In [115]:
# Build a lookup table of distinct entities: entity_id | entity | entity_type.
# An entity keeps the type it had the first time it was seen.
entity_dict = {}
skipped_rows = 0  # rows whose entity/type lists could not be paired up

for entities, types in zip(ner_df['Entity'], ner_df['Entity_Type']):
    # Missing cells are loaded as NaN (a float) and cannot be split.
    if not (isinstance(entities, str) and isinstance(types, str)):
        skipped_rows += 1
        continue

    entities_list = entities.split(',')
    type_list = types.split(',')

    # The two lists are paired by position. When their lengths differ,
    # positional pairing is unreliable (presumably an entity or type itself
    # contains a comma - TODO confirm), and indexing type_list by entity
    # position would raise IndexError. Skip such rows instead of guessing.
    if len(entities_list) != len(type_list):
        skipped_rows += 1
        continue

    for entity, entity_type in zip(entities_list, type_list):
        # setdefault keeps the first type recorded for an entity.
        entity_dict.setdefault(entity, entity_type)

if skipped_rows:
    print(f'Skipped {skipped_rows} rows with missing or misaligned entity/type lists')

# Create entity_df from entity_dict, with 1-based sequential IDs.
entity_df = pd.DataFrame({'entity': list(entity_dict.keys()), 'entity_type': list(entity_dict.values())})
entity_df['entity_id'] = range(1, len(entity_df) + 1)
entity_df = entity_df[['entity_id', 'entity', 'entity_type']]
# Call head() - a bare `entity_df.head` only displays the bound method.
entity_df.head()

25 25
23 23
31 30
Error processing entities and types: Lengths of entities and types do not match
38 38
33 32
Error processing entities and types: Lengths of entities and types do not match
48 48
19 19
25 25
19 18
Error processing entities and types: Lengths of entities and types do not match
33 31
Error processing entities and types: Lengths of entities and types do not match
39 39
11 11
46 46
33 33
38 38
50 50
34 33
Error processing entities and types: Lengths of entities and types do not match
22 22
35 35
8 8
22 21
Error processing entities and types: Lengths of entities and types do not match
9 9
13 13
23 23
26 26
13 13
32 31
Error processing entities and types: Lengths of entities and types do not match
28 28
20 20
27 27
23 23
50 49
Error processing entities and types: Lengths of entities and types do not match
11 11
10 10
16 16
50 50
21 21
17 17
40 39
Error processing entities and types: Lengths of entities and types do not match
42 42
17 17
32 32
23 23
29 29
30 30
28 28
24 24
33

KeyboardInterrupt: 

In [93]:
# Persist the per-title entity-type frequency table (without the index column).
entity_type_freq_df.to_csv('data/data_model/entity_type_frequency.csv',index=False)