# Reconnaissance d'entités nommées avec spaCy

La documentation de l'API est accessible ici : https://spacy.io/api

## Imports

In [None]:
from collections import defaultdict
import sys
import spacy
from spacy.lang.fr.examples import sentences
import os

In [None]:
# Load the spaCy pipeline 'fr_core_news_lg' once; the cells below reuse `nlp`
# to process every file. The pipeline package must already be installed
# (e.g. `python -m spacy download fr_core_news_lg`).
nlp = spacy.load('fr_core_news_lg')

## Appliquer la reconnaissance d'entités nommées sur notre corpus

In [None]:
# Directory containing the text files to analyse
directory = "../data/txt_clean"

# os.listdir() returns entries in arbitrary order; sort them so the output
# is printed in the same order on every run and every machine
file_names = sorted(os.listdir(directory))

# Run named-entity recognition on each file and print what was found
for file_name in file_names:
    file_path = os.path.join(directory, file_name)

    # Skip sub-directories and other non-regular entries: open() would
    # raise on them and abort the whole loop
    if not os.path.isfile(file_path):
        continue

    # Read the whole file as UTF-8 text
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Process the text with the spaCy pipeline
    doc = nlp(text)

    # Format every detected entity as "text (LABEL)"
    entities = [f"{ent.text} ({ent.label_})" for ent in doc.ents]

    # Print one summary line per file
    if entities:
        print(f"Entities in {file_name}: {', '.join(entities)}")
    else:
        print(f"No entities found in {file_name}")


In [None]:
# Accumulators for the entity texts, one list per label of interest.
# They are reset here so that re-running this cell does not double-count.
loc_entities = []
person_entities = []
org_entities = []

# Map each entity label to the list that collects its occurrences;
# entities with any other label are ignored
entities_by_label = {
    "LOC": loc_entities,
    "PER": person_entities,
    "ORG": org_entities,
}

# os.listdir() returns entries in arbitrary order; sorting keeps the
# insertion order of the lists (and therefore the tie-breaking of any later
# frequency ranking) identical across runs and machines.
# `directory` is the corpus path defined in an earlier cell.
file_names = sorted(os.listdir(directory))

# Collect the entities of every file in the corpus
for file_name in file_names:
    file_path = os.path.join(directory, file_name)

    # Skip sub-directories and other non-regular entries: open() would
    # raise on them and abort the whole loop
    if not os.path.isfile(file_path):
        continue

    # Read the whole file as UTF-8 text
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    # Process the text with the spaCy pipeline
    doc = nlp(text)

    # Route each entity text to the list matching its label
    for ent in doc.ents:
        bucket = entities_by_label.get(ent.label_)
        if bucket is not None:
            bucket.append(ent.text)


In [None]:
# Show the distinct entity strings collected for each label, in the order
# LOC, PER, ORG (one set per printed line)
for collected in (loc_entities, person_entities, org_entities):
    print(set(collected))

In [None]:
from collections import Counter

# Frequency table of the collected entity strings, one per label
loc_counts = Counter(loc_entities)
per_counts = Counter(person_entities)
org_counts = Counter(org_entities)

# Keep the 20 most frequent entries of each table as (entity, count) pairs
top_20_loc_entities = loc_counts.most_common(20)
top_20_per_entities = per_counts.most_common(20)
top_20_org_entities = org_counts.most_common(20)

# Heading / ranking pairs, in the order they are printed; the PER and ORG
# headings start with a newline to leave a blank line between sections
rankings = (
    ("Top 20 LOC entities:", top_20_loc_entities),
    ("\nTop 20 PER entities:", top_20_per_entities),
    ("\nTop 20 ORG entities:", top_20_org_entities),
)

# Print each section as a heading followed by "entity: count" lines
for heading, ranking in rankings:
    print(heading)
    for entity, count in ranking:
        print(f"{entity}: {count}")
