In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the codeparrot-clean-valid dataset (it exposes a single "train" split).
ds = load_dataset("codeparrot/codeparrot-clean-valid", split="train")
# Keep only the first 100 files so the NER pipelines below run quickly.
# `select` accepts a range directly, so no intermediate index list is needed.
ds = ds.select(range(100))

Using custom data configuration codeparrot--codeparrot-clean-valid-826c6fd8b27e5523
Found cached dataset json (/Users/loubnabenallal/.cache/huggingface/datasets/codeparrot___json/codeparrot--codeparrot-clean-valid-826c6fd8b27e5523/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


## Person name detection using an NER model

We will test two models for detecting person names in a subset of our Python dataset: [bert-base-NER](https://huggingface.co/dslim/bert-base-NER), and [CodeBERT fine-tuned](https://huggingface.co/mrm8488/codebert-base-finetuned-stackoverflow-ner) on a StackOverflow NER dataset.

### BERT-Base-NER

In [62]:
# Tokenizer and token-classification head for the general-purpose English NER model.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# "simple" aggregation makes the pipeline return grouped entities
# (dicts with an "entity_group" key) rather than one prediction per token.
nlp_bert = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

In [149]:
# Toy Python file used to sanity-check both NER models: its comments mention three
# person names (Marc Dupont, Elizabeth Johnson, Taylor Jonahthan) and one @-handle.
example = "\n# @python3.9\n#@ejonath\n# This work is contributed by Marc Dupont and  Elizabeth Johnson  and is part of the Satellite project\n# Python program for implementation of Bubble Sort\n\ndef bubbleSort(arr):\n    n = len(arr)\n    # Traverse through all array elements\n    # Credits to Taylor Jonahthan for this section\n    for i in range(n):\n\n        # Last i elements are already in place\n        for j in range(0, n-i-1):\n\n            # traverse the array from 0 to n-i-1\n            # Swap if the element found is greater\n            # than the next element\n            if arr[j] > arr[j+1]:\n                arr[j], arr[j+1] = arr[j+1], arr[j]"

In [150]:
# Render the snippet with its newlines expanded, as it would look in a source file.
print(example)


# @python3.9
#@ejonath
# This work is contributed by Marc Dupont and  Elizabeth Johnson  and is part of the Satellite project
# Python program for implementation of Bubble Sort

def bubbleSort(arr):
    n = len(arr)
    # Traverse through all array elements
    # Credits to Taylor Jonahthan for this section
    for i in range(n):

        # Last i elements are already in place
        for j in range(0, n-i-1):

            # traverse the array from 0 to n-i-1
            # Swap if the element found is greater
            # than the next element
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]


In [151]:
ner_results = nlp_bert(example)

# Show only the spans that bert-base-NER labels as persons ("PER").
for entity in ner_results:
    if entity["entity_group"] != "PER":
        continue
    print(entity)

{'entity_group': 'PER', 'score': 0.9996812, 'word': 'Marc Dupont', 'start': 54, 'end': 65}
{'entity_group': 'PER', 'score': 0.9995078, 'word': 'Elizabeth Johnson', 'start': 71, 'end': 88}
{'entity_group': 'PER', 'score': 0.9963441, 'word': 'Taylor Jonahthan', 'start': 276, 'end': 292}


### CodeBert StackOverflow NER

In [65]:
# CodeBERT checkpoint fine-tuned for NER on StackOverflow data.
tokenizer_so = AutoTokenizer.from_pretrained("mrm8488/codebert-base-finetuned-stackoverflow-ner")
model_so = AutoModelForTokenClassification.from_pretrained("mrm8488/codebert-base-finetuned-stackoverflow-ner")

# Same "simple" aggregation as the BERT pipeline so both return grouped entities.
nlp_so = pipeline(
    task="ner",
    model=model_so,
    tokenizer=tokenizer_so,
    aggregation_strategy="simple",
)

In [152]:
ner_results_so = nlp_so(example)

# This model has no "PER" label; "User_Name" is the entity group used here
# as the stand-in for person names.
for entity in ner_results_so:
    if entity["entity_group"] != "User_Name":
        continue
    print(entity)

{'entity_group': 'User_Name', 'score': 0.9836406, 'word': 'e', 'start': 16, 'end': 17}
{'entity_group': 'User_Name', 'score': 0.6776985, 'word': 'jonath', 'start': 17, 'end': 23}
{'entity_group': 'User_Name', 'score': 0.98596036, 'word': ' Marc', 'start': 54, 'end': 58}
{'entity_group': 'User_Name', 'score': 0.8043251, 'word': ' Elizabeth', 'start': 71, 'end': 80}
{'entity_group': 'User_Name', 'score': 0.969354, 'word': ' Taylor', 'start': 276, 'end': 282}


In [71]:
# Same CodeBERT model without an aggregation strategy: the pipeline then returns
# one prediction per token under an "entity" key (e.g. "B-User_Name", "I-User_Name").
nlp_so_no_agg = pipeline(task="ner", model=model_so, tokenizer=tokenizer_so)
res = nlp_so_no_agg(example)

for token_pred in res:
    # Substring match so that both the B- and I- prefixed labels are kept.
    if "User_Name" not in token_pred["entity"]:
        continue
    print(token_pred)

{'entity': 'B-User_Name', 'score': 0.98323464, 'index': 15, 'word': 'ĠMarc', 'start': 44, 'end': 48}
{'entity': 'I-User_Name', 'score': 0.75678843, 'index': 16, 'word': 'ĠDup', 'start': 49, 'end': 52}
{'entity': 'B-User_Name', 'score': 0.6773796, 'index': 20, 'word': 'ĠElizabeth', 'start': 61, 'end': 70}
{'entity': 'B-User_Name', 'score': 0.9776358, 'index': 75, 'word': 'ĠTaylor', 'start': 266, 'end': 272}
{'entity': 'I-User_Name', 'score': 0.42680034, 'index': 76, 'word': 'ĠJonah', 'start': 273, 'end': 278}


### Apply the models to some Python files

In [102]:
def detect_names_bert(example):
    """Tag person names in one dataset row with the bert-base-NER pipeline.

    Args:
        example: Dataset row; its "content" field is passed to `nlp_bert`.

    Returns:
        A dict with a single "names" key holding the entity dicts whose
        "entity_group" is "PER" (empty list when none are found).
    """
    person_entities = []
    for entity in nlp_bert(example["content"]):
        if entity["entity_group"] == "PER":
            person_entities.append(entity)
    return {"names": person_entities}

def detect_names_so(example):
    """Tag user names in one dataset row with the CodeBERT StackOverflow NER pipeline.

    Args:
        example: Dataset row; its "content" field is passed to `nlp_so`.

    Returns:
        A dict with a single "names" key holding the entity dicts whose
        "entity_group" is "User_Name" (empty list when none are found).
    """
    user_entities = []
    for entity in nlp_so(example["content"]):
        if entity["entity_group"] == "User_Name":
            user_entities.append(entity)
    return {"names": user_entities}

#### Detection with BERT-Base-NER

In [77]:
# Run bert-base-NER on every file; the dict returned by `detect_names_bert`
# becomes a new "names" column in the mapped dataset.
ds_bert = ds.map(detect_names_bert)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [97]:
# Indices of the files in which bert-base-NER found at least one person name.
output = [idx for idx, row in enumerate(ds_bert) if row["names"]]
print(f"Names detected in {len(output)} elements which are {output}")

Names detected in 21 elements which are [1, 8, 14, 17, 23, 34, 37, 39, 45, 46, 49, 55, 69, 70, 72, 76, 88, 89, 92, 93, 98]


In [94]:
# For every file with detections, print the raw entity dicts and then just the words.
for i, sample in enumerate(ds_bert):
    detections = sample["names"]
    if not detections:
        continue
    print(f"Sample {i}:\n {detections}")
    words = [entity["word"] for entity in detections if entity]
    print(f"Names are {words}\n")

Sample 1:
 [{'end': 197, 'entity_group': 'PER', 'score': 0.9986525774002075, 'start': 195, 'word': 'Mi'}, {'end': 199, 'entity_group': 'PER', 'score': 0.8829349875450134, 'start': 197, 'word': '##ki'}, {'end': 209, 'entity_group': 'PER', 'score': 0.9712410569190979, 'start': 199, 'word': '##o Hirabaya'}]
Names are ['Mi', '##ki', '##o Hirabaya']

Sample 8:
 [{'end': 31, 'entity_group': 'PER', 'score': 0.9820157885551453, 'start': 23, 'word': 'Luc Saff'}]
Names are ['Luc Saff']

Sample 14:
 [{'end': 359, 'entity_group': 'PER', 'score': 0.9671221971511841, 'start': 358, 'word': 'C'}, {'end': 467, 'entity_group': 'PER', 'score': 0.875209629535675, 'start': 466, 'word': 'C'}, {'end': 849, 'entity_group': 'PER', 'score': 0.8578362464904785, 'start': 848, 'word': 'C'}]
Names are ['C', 'C', 'C']

Sample 17:
 [{'end': 28, 'entity_group': 'PER', 'score': 0.9939444661140442, 'start': 26, 'word': 'Ce'}, {'end': 42, 'entity_group': 'PER', 'score': 0.9917498230934143, 'start': 28, 'word': '##dric Be

#### Detection with CodeBERT-Stackoverflow-NER

In [103]:
# Run the CodeBERT StackOverflow NER model on every file; the dict returned by
# `detect_names_so` becomes a new "names" column in the mapped dataset.
ds_so = ds.map(detect_names_so)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [104]:
# Indices of the files in which the CodeBERT model found at least one user name.
output = [idx for idx, row in enumerate(ds_so) if row["names"]]
print(f"Names detected in {len(output)} elements which are {output}")

Names detected in 26 elements which are [1, 8, 13, 15, 17, 21, 23, 26, 34, 37, 38, 45, 46, 49, 52, 55, 70, 72, 76, 84, 88, 92, 93, 94, 96, 98]


In [123]:
# For every file with detections, print the raw entity dicts and then just the words.
for i, sample in enumerate(ds_so):
    detections = sample["names"]
    if not detections:
        continue
    print(f"Sample {i}:\n {detections}")
    words = [entity["word"] for entity in detections if entity]
    print(f"Names are {words}\n")

Sample 1:
 [{'end': 200, 'entity_group': 'User_Name', 'score': 0.6459169983863831, 'start': 195, 'word': ' Mikio'}]
Names are [' Mikio']

Sample 8:
 [{'end': 31, 'entity_group': 'User_Name', 'score': 0.7196319699287415, 'start': 23, 'word': ' Luc Saff'}]
Names are [' Luc Saff']

Sample 13:
 [{'end': 65, 'entity_group': 'User_Name', 'score': 0.718497633934021, 'start': 61, 'word': ' Kids'}]
Names are [' Kids']

Sample 15:
 [{'end': 732, 'entity_group': 'User_Name', 'score': 0.8268138766288757, 'start': 731, 'word': ' d'}, {'end': 738, 'entity_group': 'User_Name', 'score': 0.22002415359020233, 'start': 732, 'word': 'ulwich'}]
Names are [' d', 'ulwich']

Sample 17:
 [{'end': 29, 'entity_group': 'User_Name', 'score': 0.892275333404541, 'start': 26, 'word': ' Ced'}, {'end': 48, 'entity_group': 'User_Name', 'score': 0.7896466255187988, 'start': 45, 'word': 'ced'}, {'end': 51, 'entity_group': 'User_Name', 'score': 0.43889954686164856, 'start': 48, 'word': 'ric'}]
Names are [' Ced', 'ced', 'ri

In [124]:
# Inspect the schema, including the nested "names" column added by the NER step.
ds_so.features

{'repo_name': Value(dtype='string', id=None),
 'path': Value(dtype='string', id=None),
 'copies': Value(dtype='string', id=None),
 'size': Value(dtype='string', id=None),
 'content': Value(dtype='string', id=None),
 'license': Value(dtype='string', id=None),
 'hash': Value(dtype='int64', id=None),
 'line_mean': Value(dtype='float64', id=None),
 'line_max': Value(dtype='int64', id=None),
 'alpha_frac': Value(dtype='float64', id=None),
 'autogenerated': Value(dtype='bool', id=None),
 'names': [{'end': Value(dtype='int64', id=None),
   'entity_group': Value(dtype='string', id=None),
   'score': Value(dtype='float32', id=None),
   'start': Value(dtype='int64', id=None),
   'word': Value(dtype='string', id=None)}]}

In [156]:
def name_column(example):
    """Extract the detected words from a row's "names" entities.

    Args:
        example: Dataset row with a "names" field holding a list of entity
            dicts, each carrying a "word" key. The list may be empty or None.

    Returns:
        A dict with a single "person_names" key: the "word" of every non-empty
        entity, in order, or an empty list when nothing was detected.
    """
    detections = example["names"]
    if not detections:
        return {"person_names": []}
    # Falsy entries (None / empty dict) are skipped rather than indexed.
    return {"person_names": [entity["word"] for entity in detections if entity]}

# Add the flat "person_names" column and drop the metadata columns that are not
# needed for reviewing the detections.
ds_so_new = ds_so.map(
    name_column,
    remove_columns=[
        "repo_name",
        "copies",
        "hash",
        "line_mean",
        "line_max",
        "alpha_frac",
        "autogenerated",
    ],
)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [159]:
# Publish the CodeBERT detections to the Hub as the dataset "names_detection_codebert".
ds_so_new.push_to_hub("names_detection_codebert")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

In [160]:
# Add the flat "person_names" column and drop the metadata columns that are not
# needed for reviewing the detections.
ds_bert_new = ds_bert.map(
    name_column,
    remove_columns=[
        "repo_name",
        "copies",
        "hash",
        "line_mean",
        "line_max",
        "alpha_frac",
        "autogenerated",
    ],
)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [161]:
# Publish the bert-base-NER detections to the Hub as the dataset "names_detection_bert_ner".
ds_bert_new.push_to_hub("names_detection_bert_ner")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]