In [1]:
from transformers import pipeline
import tqdm
from utils.data_cleaning import get_text_before_introduction
from utils.file_manager import get_folder_content

# reload modules
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load a NER pipeline from Hugging Face
ner_pipeline = pipeline(
    "ner", 
    model="dbmdz/bert-large-cased-finetuned-conll03-english", 
    tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english"
    )

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
file_path = "../Markdown"
txt_files = get_folder_content(file_path, '.mmd')

# Take only the first 10 files for testing
txt_files = txt_files[:10]

In [6]:
descriptions = {}
for file in txt_files:
    with open(file, "r") as file:
        content = file.read()

    # this just gets title, authors, abstract, etc. and not the introduction
    text = get_text_before_introduction(content)

    # we create a dictionary with the file name as key and the description as value for all papers
    descriptions[file] = {}
    descriptions[file]["description"] = text

In [7]:
# Use the NER pipeline to extract entities from the description

country_names_list = []
uni_names_list = []

# Loop over all descriptions
for key in tqdm.tqdm(descriptions.keys()):
    
    description = descriptions[key]["description"]
    # if description is none, skip
    if description is None:
        continue

    # Use the NER pipeline to extract entities from the description
    entities = ner_pipeline(description)
    
    # Filter out entities classified as persons (which might include authors)
    # author_names = [entity['word'] for entity in entities if entity['entity'] == 'I-PER']
    # descriptions[key]["authors"] = {"authors": author_names}

    # # Filter out entities classified as organizations, which might include universities and institutions
    university_names = [entity['word'] for entity in entities if entity['entity'] == 'B-ORG' or entity['entity'] == 'I-ORG']

    descriptions[key]["universities"] = {"universities": university_names}
    uni_names_list.extend(university_names)

    country_names = [entity['word'] for entity in entities if entity['entity'] in ['B-LOC', 'I-LOC', 'B-GPE']] #, 'I-GPE']]
    descriptions[key]["countries"] = {"countries": country_names}
    
    country_names_list.extend(country_names)

100%|██████████| 10/10 [00:02<00:00,  4.02it/s]


In [8]:
len(country_names_list)

33

In [9]:
from collections import Counter

# Use Counter to count occurrences
counts = Counter(country_names_list)

# Sort items by count in descending order
sorted_counts = sorted(counts.items(), key=lambda item: item[1], reverse=True)

# Print sorted counts
for element, count in sorted_counts:
    print(f"{element}: {count}")

China: 7
Hang: 4
##zhou: 4
Dal: 3
##ian: 3
Singapore: 2
Belle: 1
##vue: 1
WA: 1
##ua: 1
Lia: 1
##oning: 1
Province: 1
San: 1
Diego: 1
Zurich: 1


## Notes

Our preliminary results show that the researchers from China and USA represent a majority of users of the foundation models, which the number of Chinese researchers even surpassing the US researchers.