In [2]:
import pandas as pd
import spacy

### Load data

In [4]:
# Load the training set and keep only the French rows.
# .copy() detaches the filtered frame from the frame returned by read_csv, so
# that adding a column to `data` later does not raise SettingWithCopyWarning.
data = pd.read_csv('../Dataset/train.csv')
data = data[data['lang'] == 'fr'].copy()

In [16]:
# Show the first sentence of the filtered frame.
data['sent'].iloc[0]

'elle  porte  le  nom  de  la  romancière  américaine  susan  sontag  (  1933  2004  )  . '

### Information Extraction Pipeline

In [6]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Checkpoint shared by the tokenizer and the token-classification model.
NER_CHECKPOINT = "Davlan/xlm-roberta-base-ner-hrl"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)
# Token-level NER pipeline built from the two objects above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)


Downloading:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

[{'entity': 'B-PER', 'score': 0.9998417, 'index': 1, 'word': '▁Na', 'start': 0, 'end': 2}, {'entity': 'I-PER', 'score': 0.88056296, 'index': 2, 'word': 'der', 'start': 2, 'end': 5}, {'entity': 'I-PER', 'score': 0.999816, 'index': 3, 'word': '▁Jo', 'start': 5, 'end': 8}, {'entity': 'I-PER', 'score': 0.9998022, 'index': 4, 'word': 'kha', 'start': 8, 'end': 11}, {'entity': 'I-PER', 'score': 0.99975294, 'index': 5, 'word': 'dar', 'start': 11, 'end': 14}, {'entity': 'B-LOC', 'score': 0.99962485, 'index': 8, 'word': '▁Syria', 'start': 24, 'end': 30}]


In [17]:
# Sample sentence (tokens are separated by double spaces) used to try out the NER pipeline.
example = "elle  porte  le  nom  de  la  romancière  américaine  susan  sontag  (  1933  2004  )  . "
# One dict per tagged token, with 'entity', 'score', 'index', 'word', 'start' and 'end' keys.
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.99925464, 'index': 12, 'word': '▁sus', 'start': 53, 'end': 57}, {'entity': 'I-PER', 'score': 0.90367925, 'index': 13, 'word': 'an', 'start': 57, 'end': 59}, {'entity': 'I-PER', 'score': 0.9986822, 'index': 14, 'word': '▁son', 'start': 60, 'end': 64}, {'entity': 'I-PER', 'score': 0.99791914, 'index': 15, 'word': 'tag', 'start': 64, 'end': 67}]


In [19]:
# Merge the token-level B-/I- predictions in `ner_results` into entity spans
# and print the text of each span.
# A span opens at a B- token and its end is pushed forward by every I- token
# that follows, until the next B- token (or the end of the list).
for idx, token in enumerate(ner_results):
    if token['entity'][0] != 'B':
        continue
    start, end = token['start'], token['end']
    for follower in ner_results[idx + 1:]:
        prefix = follower['entity'][0]
        if prefix == 'B':
            break
        if prefix == 'I':
            end = follower['end']
    # No need to skip ahead manually: the I- tokens consumed above are ignored
    # by the outer loop because they do not start with 'B'.
    # The slice is printed unstripped, so it may begin with the space that the
    # tokenizer attaches to the first sub-word.
    print(example[start:end])

{'entity': 'B-PER', 'score': 0.99925464, 'index': 12, 'word': '▁sus', 'start': 53, 'end': 57}
 susan  sontag


In [20]:
#import the pipeline class


from InformationExtraction import InformationExtractionPipeline


# example spacy extractor function
NER = spacy.load("en_core_web_lg")
def tag_extraction_from_spacy(sen, model = NER):
    """Return the text of the PERSON, ORG and GPE entities found in a sentence.

    Args:
        sen: Sentence to annotate.
        model: Callable that annotates `sen` and exposes the detected entities
            through an `ents` attribute; defaults to the spaCy model loaded above.

    Returns:
        List of entity strings, in the order they appear in `annotated.ents`.
    """
    # NOTE(review): the default model is an English one while this notebook
    # filters the data to French sentences - confirm this is intended.
    wanted_labels = ('PERSON', 'ORG', 'GPE')
    extracted_names = []
    for ent in model(sen).ents:
        if ent.label_ in wanted_labels:
            extracted_names.append(ent.text)
    return extracted_names


def tag_extraction_from_LM(sen, model = nlp):
    """Extract entity strings from a sentence with a token-classification pipeline.

    The model output is expected to be a list of dicts, one per tagged token,
    each carrying an 'entity' tag (B-xxx / I-xxx) and 'start' / 'end' character
    offsets into `sen`. A span opens at every B- token and is extended by the
    I- tokens that follow it, up to the next B- token.

    Args:
        sen: Sentence to annotate.
        model: Callable returning the token-level predictions for `sen`;
            defaults to the `nlp` pipeline built earlier in the notebook.

    Returns:
        List of entity strings sliced from `sen`, stripped of surrounding whitespace.
    """
    # Use the predictions for *this* sentence; the previous version iterated
    # the global `ner_results` and therefore ignored `sen` entirely.
    annotated = model(sen)
    extracted_names = []
    for idx, token in enumerate(annotated):
        if token['entity'][0] != 'B':
            continue
        start, end = token['start'], token['end']
        for follower in annotated[idx + 1:]:
            prefix = follower['entity'][0]
            if prefix == 'B':
                break
            if prefix == 'I':
                end = follower['end']
        extracted_names.append(sen[start:end].strip())

    return extracted_names


# example extractor function that uses training labels 
sent_to_tag = dict(zip(data['sent'],data['labels']))
def tag_extraction_from_tags(sent, sent_to_tag=sent_to_tag):
    """Extract entity strings from a sentence using its gold BIO labels.

    Both the sentence and its label string are split on whitespace and paired
    up position by position. An entity starts at a token whose tag contains
    'B-' and runs until the next 'O' tag or the next 'B-' tag, so that two
    adjacent entities are returned separately instead of being merged.

    Args:
        sent: Sentence to look up; must be a key of `sent_to_tag`.
        sent_to_tag: Mapping from sentence to its whitespace-separated tag string.

    Returns:
        List of entities, each one the entity's tokens joined by a single space.

    Raises:
        KeyError: If `sent` is not present in `sent_to_tag`.
    """
    tokens_with_tags = list(zip(sent.split(), sent_to_tag[sent].split()))
    entity_list = []
    for i, (first_token, tag) in enumerate(tokens_with_tags):
        if 'B-' not in tag:
            continue
        entity = [first_token]
        for token, next_tag in tokens_with_tags[i + 1:]:
            # A new 'B-' tag opens the next entity; it is picked up by the outer loop.
            if next_tag == 'O' or 'B-' in next_tag:
                break
            entity.append(token)
        entity_list.append(" ".join(entity))

    return entity_list



In [24]:
# Sanity check of the label-based extractor; the sentence must be a key of sent_to_tag.
tag_extraction_from_tags("elle  porte  le  nom  de  la  romancière  américaine  susan  sontag  (  1933  2004  )  . ")

['susan sontag']

In [25]:
# Create the pipeline object.
#   extractor: entity extractor function that returns all the entities of a sentence
#   max_sen:   number of sentences to add for each detected entity
#   lang:      language code, needed for the Wikipedia API
#   saveJson:  whether to save the extracted information as a JSON file
#              (saves time if the pipeline has to be run again)
#   loadJson:  set to True to reuse a previously saved JSON file
#   jsonPath:  path of the saved JSON file
infoPipeline = InformationExtractionPipeline(
    extractor=tag_extraction_from_tags,
    max_sen=2,
    lang='fr',
    loadJson=False,
    jsonPath='wiki-info-fr-train.json',
    saveJson=True,
)

In [None]:
# Call the pipeline with the sentences as a plain Python list.
sentences = data['sent'].tolist()
augmented = infoPipeline(sentences)

  1%|▏                                     | 85/16547 [01:06<6:24:36,  1.40s/it]

HTTPSConnectionPool(host='fr.wikipedia.org', port=443): Read timed out. (read timeout=10.0)


 38%|█████████████                     | 6346/16547 [1:04:47<3:08:54,  1.11s/it]

HTTPSConnectionPool(host='fr.wikipedia.org', port=443): Read timed out. (read timeout=10.0)


 61%|███████████████████▉             | 10021/16547 [1:37:00<4:06:48,  2.27s/it]

HTTPSConnectionPool(host='fr.wikipedia.org', port=443): Read timed out. (read timeout=10.0)


 71%|███████████████████████▌         | 11787/16547 [1:52:49<1:08:46,  1.15it/s]

HTTPSConnectionPool(host='fr.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?action=query&prop=info&titles=Chancre_Du_Ch%C3%A2taigner&inprop=protection%7Ctalkid%7Cwatched%7Cwatchers%7Cvisitingwatchers%7Cnotificationtimestamp%7Csubjectid%7Curl%7Creadable%7Cpreload%7Cdisplaytitle&format=json&redirects=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f2b095cf880>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))


 74%|█████████████████████████▉         | 12282/16547 [1:58:27<22:15,  3.19it/s]

### Info Percentage

In [6]:
# Store the pipeline output next to the original sentences, then keep the rows
# whose augmented sentence differs from the original one.
data['augmented_sen'] = augmented
changed_mask = data['sent'].ne(data['augmented_sen'])
temp = data[changed_mask]


In [18]:
# Fraction of rows whose sentence differs after augmentation.
info_percent = len(temp) / len(data)
print(f"Info Percentage: {info_percent*100:.2f}%")

Info Percentage: 33.91%


### Save Augmented Data

In [19]:
# NOTE(review): the input was read from '../Dataset/train.csv' and the pipeline
# uses the label-based extractor, yet this writes under './Dataset/' to a file
# named 'dev-wiki-spacy' - confirm the directory and file name are intended.
output_csv = './Dataset/dev-wiki-spacy.csv'
data.to_csv(output_csv, index=False)