In [None]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy.language import Language

def concatener_phrases_par_document(filepath, separator='\t'):
    """
    Concatène toutes les phrases pour chaque document dans un fichier CSV.

    :param filepath: Chemin du fichier CSV.
    :param separator: Séparateur utilisé dans le fichier CSV.
    :return: Un dictionnaire avec le document comme clé et les phrases concaténées comme valeur.S

    """
    # Charger les données depuis le fichier CSV
    dataframe = pd.read_csv(filepath, sep=separator)

    # Initialiser un dictionnaire pour stocker le texte concaténé
    combined_texts = {}

    # Grouper les données par document
    grouped_data = dataframe.groupby('document')

    # Parcourir chaque groupe et concaténer les textes
    for document, group in grouped_data:
        combined_text = ' '.join(group['content'].tolist())
        combined_texts[document] = combined_text

    return combined_texts

# Utilisation de la fonction
file_path = '/content/drive/MyDrive/Ter_ATAL/Dataset_TER/train.csv' #  le chemin de dataset
combined_texts = concatener_phrases_par_document(file_path,separator=',')
dataset = pd.read_csv(file_path)
# Afficher les résultats pour les premiers documents (exemple)
for doc, text in list(combined_texts.items()):
    print(f"Document: {doc}\nTexte Concaténé:\n{text}\n")


Document: Baldwin_v_Reese_retagged
Texte Concaténé:
*29 JUSTICE BREYER delivered the opinion of the Court. Before seeking a federal writ of habeas corpus, a state prisoner must exhaust available state remedies, 28 U. S. C. § 2254(b)(1), thereby giving the State the "`"opportunity to pass upon and correct" alleged violations of its prisoners' federal rights.'" Duncan v. Henry, 513 U. S. 364 , 365 (1995) (per curiam) (quoting Picard v. Connor, 404 U. S. 270 , 275 (1971)). To provide the State with the necessary "opportunity," the prisoner must "fairly present" his claim in each appropriate state court (including a state supreme court with powers of discretionary review), thereby alerting that court to the federal nature of the claim. Duncan, supra, at 365-366; O'Sullivan v. Boerckel, 526 U. S. 838 , 845 (1999). This case focuses upon the requirement of "fair presentation." Michael Reese, the respondent, appealed his state-court kidnaping and attempted sodomy convictions and sentences thr

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Sapcy

Doc d'expert



In [None]:
nlp = spacy.load("en_core_web_sm")


if "custom_sentence_boundary" in nlp.pipe_names:
    nlp.remove_pipe("custom_sentence_boundary")


print("Composants du pipeline:", nlp.pipe_names)

Composants du pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [None]:
# Fonction pour concaténer les mots d'une phrase et créer des étiquettes
def concatener_et_etiqueter_spacy(phrase):
    # Concaténation des mots de la phrase en un seul mot en supprimant les espaces
    mot_concatene = ''.join(phrase.split())

    # Création des étiquettes : 0 pour tous sauf le dernier caractère, 1 pour le dernier
    etiquettes = [0] * (len(mot_concatene) - 1) + [1]

    # Création d'une liste de tuples (caractère, étiquette)
    resultat = [(caractere, etiquettes[i]) for i, caractere in enumerate(mot_concatene)]

    return resultat

In [None]:
# document choisie de train :
# 1-ROSENBLATT v. BAER_MCL / 2- Sandoz Inc. v. Amgen Inc_MCL / 3 - opinion-G16-(Shapiro v. McManus, 136 S. Ct. 450).h
# 4 -Baldwin_v_Reese_retagged / 5-California v. Greenwood_retagged / 6 - opinion-G16-(California v. Superior Court of Cal.,

# Choisissez un identifiant de document
id_document_specifique = "ROSENBLATT v. BAER_MCL"

# Filtrez le dataset pour ce document spécifique
document_specifique = dataset[dataset['document'] == id_document_specifique]
#print(document_specifique['content'])
lista = [sentence for sentence in document_specifique['content']]
print(lista)
len(lista)


['MR. JUSTICE BRENNAN delivered the opinion of the Court.', "A jury in New Hampshire Superior Court awarded respondent damages in this civil libel action based on one of petitioner's columns in the Laconia Evening Citizen.", 'Respondent alleged that the column contained defamatory falsehoods concerning his performance as Supervisor of the Belknap Country Recreation Area, a facility owned and operated by Belknap County.', "In the interval between the trial and the decision of petitioner's appeal by the New Hampshire Supreme Court, we decided New York Times Co. v. Sullivan, 376 U. S. 254 .", 'We there held that consistent with the First and Fourteenth Amendments a State cannot award damages to a public official for defamatory falsehood relating to his official conduct unless the official proves actual malice\x97that the falsehood was published with knowledge of its falsity or with reckless disregard of whether it was true or false.', 'The New Hampshire Supreme Court affirmed the award, f

100

In [None]:
#Créer une fonction qui renvoie un liste de tuples sachant que chaque tuple represente un caractère et son étiquette associé

def liste_caractères_spacy(liste_phrases):
  liste_caractères = []
  for sentence in liste_phrases:
      concatener_et_etiqueter_spacy(sentence)
      liste_caractères += concatener_et_etiqueter_spacy(sentence)
  return liste_caractères

print(liste_caractères_spacy(document_specifique['content']))
len(liste_caractères_spacy(document_specifique['content']))


[('M', 0), ('R', 0), ('.', 0), ('J', 0), ('U', 0), ('S', 0), ('T', 0), ('I', 0), ('C', 0), ('E', 0), ('B', 0), ('R', 0), ('E', 0), ('N', 0), ('N', 0), ('A', 0), ('N', 0), ('d', 0), ('e', 0), ('l', 0), ('i', 0), ('v', 0), ('e', 0), ('r', 0), ('e', 0), ('d', 0), ('t', 0), ('h', 0), ('e', 0), ('o', 0), ('p', 0), ('i', 0), ('n', 0), ('i', 0), ('o', 0), ('n', 0), ('o', 0), ('f', 0), ('t', 0), ('h', 0), ('e', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('.', 1), ('A', 0), ('j', 0), ('u', 0), ('r', 0), ('y', 0), ('i', 0), ('n', 0), ('N', 0), ('e', 0), ('w', 0), ('H', 0), ('a', 0), ('m', 0), ('p', 0), ('s', 0), ('h', 0), ('i', 0), ('r', 0), ('e', 0), ('S', 0), ('u', 0), ('p', 0), ('e', 0), ('r', 0), ('i', 0), ('o', 0), ('r', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('a', 0), ('w', 0), ('a', 0), ('r', 0), ('d', 0), ('e', 0), ('d', 0), ('r', 0), ('e', 0), ('s', 0), ('p', 0), ('o', 0), ('n', 0), ('d', 0), ('e', 0), ('n', 0), ('t', 0), ('d', 0), ('a', 0), ('m', 0), ('a', 0),

14329

Segmentation Spacy

In [None]:
def seg_spacy(documents):
  liste_seg = []
  for sent in documents.sents:
     liste_seg.append(sent.text)
  return liste_seg

texte_concatene = combined_texts["ROSENBLATT v. BAER_MCL"]
doc = nlp(texte_concatene)
print(seg_spacy(doc))
len(seg_spacy(doc))




['MR.', 'JUSTICE BRENNAN delivered the opinion of the Court.', "A jury in New Hampshire Superior Court awarded respondent damages in this civil libel action based on one of petitioner's columns in the Laconia Evening Citizen.", 'Respondent alleged that the column contained defamatory falsehoods concerning his performance as Supervisor of the Belknap Country Recreation Area, a facility owned and operated by Belknap County.', "In the interval between the trial and the decision of petitioner's appeal by the New Hampshire Supreme Court, we decided New York Times Co. v. Sullivan, 376 U. S. 254 .", 'We there held that consistent with the First and Fourteenth Amendments a State cannot award damages to a public official for defamatory falsehood relating to his official conduct unless the official proves actual malice\x97that the falsehood was published with knowledge of its falsity or with reckless disregard of whether it was true or false.', 'The New Hampshire Supreme Court affirmed the award

113

In [None]:
print(liste_caractères_spacy(seg_spacy(doc)))
len(liste_caractères_spacy(seg_spacy(doc)))

[('M', 0), ('R', 0), ('.', 1), ('J', 0), ('U', 0), ('S', 0), ('T', 0), ('I', 0), ('C', 0), ('E', 0), ('B', 0), ('R', 0), ('E', 0), ('N', 0), ('N', 0), ('A', 0), ('N', 0), ('d', 0), ('e', 0), ('l', 0), ('i', 0), ('v', 0), ('e', 0), ('r', 0), ('e', 0), ('d', 0), ('t', 0), ('h', 0), ('e', 0), ('o', 0), ('p', 0), ('i', 0), ('n', 0), ('i', 0), ('o', 0), ('n', 0), ('o', 0), ('f', 0), ('t', 0), ('h', 0), ('e', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('.', 1), ('A', 0), ('j', 0), ('u', 0), ('r', 0), ('y', 0), ('i', 0), ('n', 0), ('N', 0), ('e', 0), ('w', 0), ('H', 0), ('a', 0), ('m', 0), ('p', 0), ('s', 0), ('h', 0), ('i', 0), ('r', 0), ('e', 0), ('S', 0), ('u', 0), ('p', 0), ('e', 0), ('r', 0), ('i', 0), ('o', 0), ('r', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('a', 0), ('w', 0), ('a', 0), ('r', 0), ('d', 0), ('e', 0), ('d', 0), ('r', 0), ('e', 0), ('s', 0), ('p', 0), ('o', 0), ('n', 0), ('d', 0), ('e', 0), ('n', 0), ('t', 0), ('d', 0), ('a', 0), ('m', 0), ('a', 0),

14329

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# tokenisation1 et tokenisation2 sont des listes de tuples (token, étiquette)
tokenisation1_spacy = liste_caractères_spacy(seg_spacy(doc))
tokenisation2_spacy = liste_caractères_spacy(document_specifique['content'])
# Extraire les étiquettes pour la comparaison
etiquettes1_spacy  = [int(etiquette) for _, etiquette in tokenisation1_spacy ]
etiquettes2_spacy  = [int(etiquette) for _, etiquette in tokenisation2_spacy ]

# Calcul des métriques
precision = precision_score(etiquettes1_spacy , etiquettes2_spacy )
rappel = recall_score(etiquettes1_spacy , etiquettes2_spacy)
score_f1 = f1_score(etiquettes1_spacy , etiquettes2_spacy)

print(f"Précision: {precision}, Rappel: {rappel}, Score F1: {score_f1}")

Précision: 0.83, Rappel: 0.7345132743362832, Score F1: 0.7793427230046949


# *Nltk*

Document d'experts

In [None]:
import nltk
nltk.download('punkt') # Décommentez cette ligne lors de la première exécution pour télécharger les ressources nécessaires

# Fonction pour concaténer les mots d'une phrase et créer des étiquettes
def concatener_et_etiqueter_nltk(phrase):
    # Utilisation de NLTK pour diviser la phrase en mots
    mots = nltk.word_tokenize(phrase)

    # Concaténation des mots en un seul mot en supprimant les espaces
    mot_concatene = ''.join(mots)

    # Création des étiquettes : 0 pour tous sauf le dernier caractère, 1 pour le dernier
    etiquettes = [0] * (len(mot_concatene) - 1) + [1]

    # Création d'une liste de tuples (caractère, étiquette)
    resultat = [(caractere, etiquettes[i]) for i, caractere in enumerate(mot_concatene)]

    return resultat


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
print(dataset['document'].unique())

['ROSENBLATT v. BAER_MCL' 'Sandoz Inc. v. Amgen Inc_MCL'
 'opinion-G16-(Sanford v. Kepner, 344 U.S. 13).html'
 'opinion-G16-(Shapiro v. McManus, 136 S. Ct. 450).h'
 'opinion-G16-(Taylor v. Sturgell, 553 U.S. 880)'
 'opinion-G16-(United States v. Mississippi Chemical'
 'Baldwin_v_Reese_retagged' 'California v. Greenwood_retagged'
 'opinion-G16-(California v. Superior Court of Cal.,'
 'Commissioner v. Groetzinger_MCL_retagged'
 'opinion-G16-(Florida Power _ Light Co. v. Electric'
 'opinion-G16-(Jenkins v. Georgia, 418 U.S. 153).htm']


In [None]:
# Choisissez un identifiant de document
id_document_specifique = "ROSENBLATT v. BAER_MCL"

# Filtrez le dataset pour ce document spécifique
document_specifique = dataset[dataset['document'] == id_document_specifique]
print(document_specifique['content'])

0     MR. JUSTICE BRENNAN delivered the opinion of t...
1     A jury in New Hampshire Superior Court awarded...
2     Respondent alleged that the column contained d...
3     In the interval between the trial and the deci...
4     We there held that consistent with the First a...
                            ...                        
95    Moreover, even if the claim falls within New Y...
96    Because the trial here was had before New York...
97    We remark only that, as is the case with quest...
98    The judgment is reversed and the case remanded...
99                                    It is so ordered.
Name: content, Length: 100, dtype: object


In [None]:
#Créer une fonction qui renvoie un liste de tuples sachant que chaque tuple represente un caractère et son étiquette associé
def liste_caractères_nltk(liste_phrases):
  liste_caractères = []
  for sentence in liste_phrases:
      concatener_et_etiqueter_spacy(sentence)
      liste_caractères += concatener_et_etiqueter_spacy(sentence)
  return liste_caractères

print(liste_caractères_nltk(document_specifique['content']))
len(liste_caractères_nltk(document_specifique['content']))

[('M', 0), ('R', 0), ('.', 0), ('J', 0), ('U', 0), ('S', 0), ('T', 0), ('I', 0), ('C', 0), ('E', 0), ('B', 0), ('R', 0), ('E', 0), ('N', 0), ('N', 0), ('A', 0), ('N', 0), ('d', 0), ('e', 0), ('l', 0), ('i', 0), ('v', 0), ('e', 0), ('r', 0), ('e', 0), ('d', 0), ('t', 0), ('h', 0), ('e', 0), ('o', 0), ('p', 0), ('i', 0), ('n', 0), ('i', 0), ('o', 0), ('n', 0), ('o', 0), ('f', 0), ('t', 0), ('h', 0), ('e', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('.', 1), ('A', 0), ('j', 0), ('u', 0), ('r', 0), ('y', 0), ('i', 0), ('n', 0), ('N', 0), ('e', 0), ('w', 0), ('H', 0), ('a', 0), ('m', 0), ('p', 0), ('s', 0), ('h', 0), ('i', 0), ('r', 0), ('e', 0), ('S', 0), ('u', 0), ('p', 0), ('e', 0), ('r', 0), ('i', 0), ('o', 0), ('r', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('a', 0), ('w', 0), ('a', 0), ('r', 0), ('d', 0), ('e', 0), ('d', 0), ('r', 0), ('e', 0), ('s', 0), ('p', 0), ('o', 0), ('n', 0), ('d', 0), ('e', 0), ('n', 0), ('t', 0), ('d', 0), ('a', 0), ('m', 0), ('a', 0),

14329

Segmentation de Nltk

In [None]:
from nltk.tokenize import sent_tokenize

def seg_nltk(textes):
    liste_seg = []
    # Segmenter le texte en phrases en utilisant NLTK
    phrases = sent_tokenize(textes)
    for phrase in phrases:
        liste_seg.append(phrase)
    return liste_seg
print(seg_nltk(texte_concatene))
len(seg_nltk(texte_concatene))


['MR. JUSTICE BRENNAN delivered the opinion of the Court.', "A jury in New Hampshire Superior Court awarded respondent damages in this civil libel action based on one of petitioner's columns in the Laconia Evening Citizen.", 'Respondent alleged that the column contained defamatory falsehoods concerning his performance as Supervisor of the Belknap Country Recreation Area, a facility owned and operated by Belknap County.', "In the interval between the trial and the decision of petitioner's appeal by the New Hampshire Supreme Court, we decided New York Times Co. v. Sullivan, 376 U. S. 254 .", 'We there held that consistent with the First and Fourteenth Amendments a State cannot award damages to a public official for defamatory falsehood relating to his official conduct unless the official proves actual malice\x97that the falsehood was published with knowledge of its falsity or with reckless disregard of whether it was true or false.', 'The New Hampshire Supreme Court affirmed the award, f

122

In [None]:
print(liste_caractères_nltk(seg_nltk(texte_concatene)))
len(liste_caractères_nltk(seg_nltk(texte_concatene)))

[('M', 0), ('R', 0), ('.', 0), ('J', 0), ('U', 0), ('S', 0), ('T', 0), ('I', 0), ('C', 0), ('E', 0), ('B', 0), ('R', 0), ('E', 0), ('N', 0), ('N', 0), ('A', 0), ('N', 0), ('d', 0), ('e', 0), ('l', 0), ('i', 0), ('v', 0), ('e', 0), ('r', 0), ('e', 0), ('d', 0), ('t', 0), ('h', 0), ('e', 0), ('o', 0), ('p', 0), ('i', 0), ('n', 0), ('i', 0), ('o', 0), ('n', 0), ('o', 0), ('f', 0), ('t', 0), ('h', 0), ('e', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('.', 1), ('A', 0), ('j', 0), ('u', 0), ('r', 0), ('y', 0), ('i', 0), ('n', 0), ('N', 0), ('e', 0), ('w', 0), ('H', 0), ('a', 0), ('m', 0), ('p', 0), ('s', 0), ('h', 0), ('i', 0), ('r', 0), ('e', 0), ('S', 0), ('u', 0), ('p', 0), ('e', 0), ('r', 0), ('i', 0), ('o', 0), ('r', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('a', 0), ('w', 0), ('a', 0), ('r', 0), ('d', 0), ('e', 0), ('d', 0), ('r', 0), ('e', 0), ('s', 0), ('p', 0), ('o', 0), ('n', 0), ('d', 0), ('e', 0), ('n', 0), ('t', 0), ('d', 0), ('a', 0), ('m', 0), ('a', 0),

14329

In [None]:
# tokenisation1 et tokenisation2 sont des listes de tuples (caractere, étiquette)
tokenisation_nltk1 = liste_caractères_nltk(seg_nltk(texte_concatene))
tokenisation_nltk2 = liste_caractères_nltk(document_specifique['content'])
# Extraire les étiquettes pour la comparaison
etiquettes_nltk1 = [int(etiquette) for _, etiquette in tokenisation_nltk1]
etiquettes_nltk2 = [int(etiquette) for _, etiquette in tokenisation_nltk2]

# Calcul des métriques
precision = precision_score(etiquettes_nltk1, etiquettes_nltk2)
rappel = recall_score(etiquettes_nltk1, etiquettes_nltk2)
score_f1 = f1_score(etiquettes_nltk1, etiquettes_nltk2)

print(f"Précision: {precision}, Rappel: {rappel}, Score F1: {score_f1}")

Précision: 0.83, Rappel: 0.680327868852459, Score F1: 0.7477477477477475


Spacy


In [None]:
def calcul_metrique_dataset_spacy(donnee):
  somme_precision = 0
  somme_rappel = 0
  somme_score_f1 = 0
  nb_documents = len(donnee['document'].unique())
  for titre in donnee['document'].unique():
    document_specifique12 = donnee[donnee['document'] == titre]
    liste_caractères_spacy(document_specifique12['content'])
    texte_concatene12 = combined_texts[titre]
    doc12 = nlp(texte_concatene12)
    liste_caractères_spacy(seg_spacy(doc12))

    # tokenisation1 et tokenisation2 sont des listes de tuples (token, étiquette)
    tokenisation1_spacy1 = liste_caractères_spacy(seg_spacy(doc12))
    tokenisation2_spacy1 = liste_caractères_spacy(document_specifique12['content'])
    # Extraire les étiquettes pour la comparaison
    etiquettes1_spacy1  = [int(etiquette) for _, etiquette in tokenisation1_spacy1 ]
    etiquettes2_spacy1  = [int(etiquette) for _, etiquette in tokenisation2_spacy1 ]

    # Calcul des métriques
    somme_precision += precision_score(etiquettes1_spacy1 , etiquettes2_spacy1 )
    somme_rappel += recall_score(etiquettes1_spacy1 , etiquettes2_spacy1)
    somme_score_f1 += f1_score(etiquettes1_spacy1 , etiquettes2_spacy1)

  return somme_precision/nb_documents,somme_rappel/nb_documents,somme_score_f1/nb_documents

In [None]:
print('Spacy')
precision,rappel,score_f1 = calcul_metrique_dataset_spacy(dataset)
print(f"Précision: {precision}, Rappel: {rappel}, Score F1: {score_f1}")

Spacy
Précision: 0.8676067776211984, Rappel: 0.742269741632747, Score F1: 0.7944537706246605


NLTK


In [None]:
def calcul_metrique_dataset_nltk(donnee1):
  somme_precision = 0
  somme_rappel = 0
  somme_score_f1 = 0
  nb_documents = len(donnee1['document'].unique())
  for titre in donnee1['document'].unique():
    document_specifique1 = donnee1[donnee1['document'] == titre]
    liste_caractères_nltk(document_specifique1['content'])
    texte_concatene1 = combined_texts[titre]
    liste_caractères_nltk(seg_nltk(texte_concatene1))

    # tokenisation1 et tokenisation2 sont des listes de tuples (token, étiquette)
    tokenisation1_nltk2 = liste_caractères_nltk(seg_nltk(texte_concatene1))
    tokenisation2_nltk2 = liste_caractères_nltk(document_specifique1['content'])
    # Extraire les étiquettes pour la comparaison
    etiquettes1_nltk1  = [int(etiquette) for _, etiquette in tokenisation1_nltk2 ]
    etiquettes2_nltk2 = [int(etiquette) for _, etiquette in tokenisation2_nltk2 ]

    # Calcul des métriques
    somme_precision += precision_score(etiquettes1_nltk1 , etiquettes2_nltk2)
    somme_rappel += recall_score(etiquettes1_nltk1 , etiquettes2_nltk2)
    somme_score_f1 += f1_score(etiquettes1_nltk1 , etiquettes2_nltk2)

  return somme_precision/nb_documents,somme_rappel/nb_documents,somme_score_f1/nb_documents

In [None]:
print('Nltk')
precision1,rappel1,score_f11 = calcul_metrique_dataset_nltk(dataset)
print(f"Précision: {precision1}, Rappel: {rappel1}, Score F1: {score_f11}")

Nltk
Précision: 0.826814836083842, Rappel: 0.6518944375653564, Score F1: 0.7234586528139105


In [None]:
# Afficher toutes les composantes du pipeline
print(nlp.pipe_names)


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


# Evaluation



In [None]:
# Utilisation de la fonction
file_path_evaluation= '/content/drive/MyDrive/Ter_ATAL/Dataset_TER/eval.csv' #  le chemin de dataset
combined_texts_eval = concatener_phrases_par_document(file_path_evaluation,separator=',')
dataset_eval = pd.read_csv(file_path_evaluation)
# Afficher les résultats pour les premiers documents (exemple)
for doc, text in list(combined_texts_eval.items()):
    print(f"Document: {doc}\nTexte Concaténé:\n{text}\n")


Document: opinion-G16-(Mathews v. Diaz, 426 U.S. 67.html
Texte Concaténé:
MR. JUSTICE STEVENS delivered the opinion of the Court. The question presented by the Secretary's appeal is whether Congress may condition an alien's eligibility for participation in a federal medical insurance program on continuous residence in the United States for a five-year period and admission for permanent residence. The District Court held that the first condition was unconstitutional and that it could not be severed from the second. Since we conclude that both conditions are constitutional, we reverse. Each of the appellees is a resident alien who was lawfully admitted to the United States less than five years ago. Appellees Diaz and Clara are Cuban refugees who remain in this country at the discretion of the Attorney General; appellee Espinosa has been admitted for permanent *70 residence. All three are over 65 years old and have been denied enrollment in the Medicare Part B supplemental medical insuran

SpaCy



In [None]:
def calcul_metrique_dataset_spacy_eval(donnee):
  somme_precision = 0
  somme_rappel = 0
  somme_score_f1 = 0
  nb_documents = len(donnee['document'].unique())
  for titre in donnee['document'].unique():
    document_specifique12 = donnee[donnee['document'] == titre]
    liste_caractères_spacy(document_specifique12['content'])
    texte_concatene12 = combined_texts_eval[titre]
    doc12 = nlp(texte_concatene12)
    liste_caractères_spacy(seg_spacy(doc12))

    # tokenisation1 et tokenisation2 sont des listes de tuples (token, étiquette)
    tokenisation1_spacy1 = liste_caractères_spacy(seg_spacy(doc12))
    tokenisation2_spacy1 = liste_caractères_spacy(document_specifique12['content'])
    # Extraire les étiquettes pour la comparaison
    etiquettes1_spacy1  = [int(etiquette) for _, etiquette in tokenisation1_spacy1 ]
    etiquettes2_spacy1  = [int(etiquette) for _, etiquette in tokenisation2_spacy1 ]

    # Calcul des métriques
    somme_precision += precision_score(etiquettes1_spacy1 , etiquettes2_spacy1 )
    somme_rappel += recall_score(etiquettes1_spacy1 , etiquettes2_spacy1)
    somme_score_f1 += f1_score(etiquettes1_spacy1 , etiquettes2_spacy1)

  return somme_precision/nb_documents,somme_rappel/nb_documents,somme_score_f1/nb_documents

In [None]:
precision,rappel,score_f1 = calcul_metrique_dataset_spacy_eval(dataset_eval)
print(f"Précision: {precision}, Rappel: {rappel}, Score F1: {score_f1}")

Précision: 0.8224750000460489, Rappel: 0.8104000844831104, Score F1: 0.8157915083053234


Nltk

In [None]:
def calcul_metrique_dataset_nltk_eval(donnee1):
  somme_precision = 0
  somme_rappel = 0
  somme_score_f1 = 0
  nb_documents = len(donnee1['document'].unique())
  for titre in donnee1['document'].unique():
    document_specifique1 = donnee1[donnee1['document'] == titre]
    liste_caractères_nltk(document_specifique1['content'])
    texte_concatene1 = combined_texts_eval[titre]
    liste_caractères_nltk(seg_nltk(texte_concatene1))

    # tokenisation1 et tokenisation2 sont des listes de tuples (token, étiquette)
    tokenisation1_nltk2 = liste_caractères_nltk(seg_nltk(texte_concatene1))
    tokenisation2_nltk2 = liste_caractères_nltk(document_specifique1['content'])
    # Extraire les étiquettes pour la comparaison
    etiquettes1_nltk1  = [int(etiquette) for _, etiquette in tokenisation1_nltk2 ]
    etiquettes2_nltk2 = [int(etiquette) for _, etiquette in tokenisation2_nltk2 ]

    # Calcul des métriques
    somme_precision += precision_score(etiquettes1_nltk1 , etiquettes2_nltk2)
    somme_rappel += recall_score(etiquettes1_nltk1 , etiquettes2_nltk2)
    somme_score_f1 += f1_score(etiquettes1_nltk1 , etiquettes2_nltk2)

  return somme_precision/nb_documents,somme_rappel/nb_documents,somme_score_f1/nb_documents

In [None]:
precision1,rappel1,score_f11 = calcul_metrique_dataset_nltk_eval(dataset_eval)
print(f"Précision: {precision1}, Rappel: {rappel1}, Score F1: {score_f11}")

Précision: 0.7984509760220249, Rappel: 0.7074743068076289, Score F1: 0.7449147358855391


# Test

In [None]:
# Utilisation de la fonction
file_path_test= '/content/drive/MyDrive/Ter_ATAL/Dataset_TER/test.csv' #  le chemin de dataset
combined_texts_test = concatener_phrases_par_document(file_path_test,separator=',')
dataset_test = pd.read_csv(file_path_test)
# Afficher les résultats pour les premiers documents (exemple)
for doc, text in list(combined_texts_test.items()):
    print(f"Document: {doc}\nTexte Concaténé:\n{text}\n")

Document: Press Enterprise v Sup. Court_retagged_MCL
Texte Concaténé:
CHIEF JUSTICE BURGER delivered the opinion of the Court. We granted certiorari to decide whether the guarantees of open public proceedings in criminal trials cover proceedings for the voir dire examination of potential jurors. Albert Greenwood Brown, Jr., was tried and convicted of the rape and murder of a teenage girl, and sentenced to death in California Superior Court. Before the voir dire examination of prospective jurors began, petitioner, Press-Enterprise Co., moved that the voir dire be open to the public and the press. Petitioner contended that the public had an absolute right to attend the trial, and asserted that the trial commenced with the voir dire proceedings. The State opposed petitioner's motion, arguing that if the press were present, juror responses would lack the candor necessary to assure a fair trial. The trial judge agreed and permitted petitioner to attend only the "general voir dire." He state

SpaCy

In [None]:
def calcul_metrique_dataset_spacy_test(donnee):
  somme_precision = 0
  somme_rappel = 0
  somme_score_f1 = 0
  nb_documents = len(donnee['document'].unique())
  for titre in donnee['document'].unique():
    document_specifique12 = donnee[donnee['document'] == titre]
    liste_caractères_spacy(document_specifique12['content'])
    texte_concatene12 = combined_texts_test[titre]
    doc12 = nlp(texte_concatene12)
    liste_caractères_spacy(seg_spacy(doc12))

    # tokenisation1 et tokenisation2 sont des listes de tuples (token, étiquette)
    tokenisation1_spacy1 = liste_caractères_spacy(seg_spacy(doc12))
    tokenisation2_spacy1 = liste_caractères_spacy(document_specifique12['content'])
    # Extraire les étiquettes pour la comparaison
    etiquettes1_spacy1  = [int(etiquette) for _, etiquette in tokenisation1_spacy1 ]
    etiquettes2_spacy1  = [int(etiquette) for _, etiquette in tokenisation2_spacy1 ]

    # Calcul des métriques
    somme_precision += precision_score(etiquettes1_spacy1 , etiquettes2_spacy1 )
    somme_rappel += recall_score(etiquettes1_spacy1 , etiquettes2_spacy1)
    somme_score_f1 += f1_score(etiquettes1_spacy1 , etiquettes2_spacy1)

  return somme_precision/nb_documents,somme_rappel/nb_documents,somme_score_f1/nb_documents

In [None]:
precision,rappel,score_f1 = calcul_metrique_dataset_spacy_test(dataset_test)
print(f"Précision: {precision}, Rappel: {rappel}, Score F1: {score_f1}")

Précision: 0.8440039118509967, Rappel: 0.8250657784672476, Score F1: 0.8332873267083792


# NLTK

In [None]:
def calcul_metrique_dataset_nltk_test(donnee1):
  somme_precision = 0
  somme_rappel = 0
  somme_score_f1 = 0
  nb_documents = len(donnee1['document'].unique())
  for titre in donnee1['document'].unique():
    document_specifique1 = donnee1[donnee1['document'] == titre]
    liste_caractères_nltk(document_specifique1['content'])
    texte_concatene1 = combined_texts_test[titre]
    liste_caractères_nltk(seg_nltk(texte_concatene1))

    # tokenisation1 et tokenisation2 sont des listes de tuples (token, étiquette)
    tokenisation1_nltk2 = liste_caractères_nltk(seg_nltk(texte_concatene1))
    tokenisation2_nltk2 = liste_caractères_nltk(document_specifique1['content'])
    # Extraire les étiquettes pour la comparaison
    etiquettes1_nltk1  = [int(etiquette) for _, etiquette in tokenisation1_nltk2 ]
    etiquettes2_nltk2 = [int(etiquette) for _, etiquette in tokenisation2_nltk2 ]

    # Calcul des métriques
    somme_precision += precision_score(etiquettes1_nltk1 , etiquettes2_nltk2)
    somme_rappel += recall_score(etiquettes1_nltk1 , etiquettes2_nltk2)
    somme_score_f1 += f1_score(etiquettes1_nltk1 , etiquettes2_nltk2)

  return somme_precision/nb_documents,somme_rappel/nb_documents,somme_score_f1/nb_documents

In [None]:
precision1,rappel1,score_f11 = calcul_metrique_dataset_nltk_test(dataset_test)
print(f"Précision: {precision1}, Rappel: {rappel1}, Score F1: {score_f11}")

Précision: 0.8163717663706634, Rappel: 0.7471099887766556, Score F1: 0.7766389612353634


# Personnaliser la segmentation avec SpaCy

In [None]:

from spacy.language import Language

@Language.component("custom_sentence_boundary")

def custom_sentence_boundary(doc):
    prev_token = None
    for token in doc[:-1]:  # Loop through tokens in the document
        # Rule 1: '?' followed by lowercase should not end the sentence
        if token.text == '?' and token.nbor().is_lower:
            doc[token.i+1].is_sent_start = False

        # Rule 2: ':' followed by an uppercase letter should end the sentence
        if token.text == ':' and token.nbor().is_upper:
            doc[token.i+1].is_sent_start = True

        # Rule 3: Handle abbreviations like 'Dr.', 'Mr.'
        if token.text in ('Dr.', 'Mr.') and not token.nbor().is_punct:
            token.is_sent_start = False

        # Rule 4: '.' followed by [number] does not end a sentence
        if token.text == '.' and token.nbor().text.startswith('[') and token.nbor().text[-1].isdigit():
            token.is_sent_start = False
            if token.nbor().text.endswith(']'):
                doc[token.nbor().i+1].is_sent_start = True

        # Rule 5: '.' followed by '"' does not end a sentence
        if token.text == '.' and token.nbor().text == '"':
            token.is_sent_start = False

        # Rule 6: '...' does not end a sentence
        if token.text == '...':
            token.is_sent_start = False

        # Rule 7: '.' followed by a number does not end a sentence
        if token.text == '.' and token.nbor().like_num:
            token.is_sent_start = False

        # Rule 8: Handling cases like '9 Ab'
        if token.like_num and token.nbor().is_title:
            token.is_sent_start = False

        # Rule 9: ')' followed by lowercase starts a new sentence
        if token.text == ')' and token.nbor().is_lower:
            doc[token.nbor().i].is_sent_start = True

        # Rule 10: '-' followed by lowercase does not end a sentence
        if token.text == '-' and token.nbor().is_lower:
            token.is_sent_start = False

        # Rule 11: Numbers followed by "We" starts a new sentence
        if token.like_num and token.nbor().text.startswith('We'):
            doc[token.nbor().i].is_sent_start = True

        # Rule 12: '.' followed by a number and then uppercase letter does not end a sentence
        if token.text == '.' and token.nbor().like_num and token.nbor().nbor().is_upper:
            token.is_sent_start = False

        prev_token = token

    return doc
nlp = spacy.load("en_core_web_sm")

# Add the custom component to the pipeline
nlp.add_pipe('custom_sentence_boundary', before='parser')
print("Composants du pipeline:", nlp.pipe_names)


Composants du pipeline: ['tok2vec', 'tagger', 'custom_sentence_boundary', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [None]:
text_spacy = combined_texts["ROSENBLATT v. BAER_MCL"]
doc = nlp(text_spacy)

# Display the segmented sentences
sentences = [sent.text for sent in doc.sents]
print(sentences)
len(sentences)

['MR.', 'JUSTICE BRENNAN delivered the opinion of the Court.', "A jury in New Hampshire Superior Court awarded respondent damages in this civil libel action based on one of petitioner's columns in the Laconia Evening Citizen.", 'Respondent alleged that the column contained defamatory falsehoods concerning his performance as Supervisor of the Belknap Country Recreation Area, a facility owned and operated by Belknap County.', "In the interval between the trial and the decision of petitioner's appeal by the New Hampshire Supreme Court, we decided New York Times Co. v. Sullivan, 376 U. S. 254 .", 'We there held that consistent with the First and Fourteenth Amendments a State cannot award damages to a public official for defamatory falsehood relating to his official conduct unless the official proves actual malice\x97that the falsehood was published with knowledge of its falsity or with reckless disregard of whether it was true or false.', 'The New Hampshire Supreme Court affirmed the award

105

In [None]:
print(liste_caractères_spacy(sentences))
len(liste_caractères_spacy(sentences))

[('M', 0), ('R', 0), ('.', 1), ('J', 0), ('U', 0), ('S', 0), ('T', 0), ('I', 0), ('C', 0), ('E', 0), ('B', 0), ('R', 0), ('E', 0), ('N', 0), ('N', 0), ('A', 0), ('N', 0), ('d', 0), ('e', 0), ('l', 0), ('i', 0), ('v', 0), ('e', 0), ('r', 0), ('e', 0), ('d', 0), ('t', 0), ('h', 0), ('e', 0), ('o', 0), ('p', 0), ('i', 0), ('n', 0), ('i', 0), ('o', 0), ('n', 0), ('o', 0), ('f', 0), ('t', 0), ('h', 0), ('e', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('.', 1), ('A', 0), ('j', 0), ('u', 0), ('r', 0), ('y', 0), ('i', 0), ('n', 0), ('N', 0), ('e', 0), ('w', 0), ('H', 0), ('a', 0), ('m', 0), ('p', 0), ('s', 0), ('h', 0), ('i', 0), ('r', 0), ('e', 0), ('S', 0), ('u', 0), ('p', 0), ('e', 0), ('r', 0), ('i', 0), ('o', 0), ('r', 0), ('C', 0), ('o', 0), ('u', 0), ('r', 0), ('t', 0), ('a', 0), ('w', 0), ('a', 0), ('r', 0), ('d', 0), ('e', 0), ('d', 0), ('r', 0), ('e', 0), ('s', 0), ('p', 0), ('o', 0), ('n', 0), ('d', 0), ('e', 0), ('n', 0), ('t', 0), ('d', 0), ('a', 0), ('m', 0), ('a', 0),

14329

In [None]:
def calcul_metrique_personnalise_spacy_test(data_test):
  somme_precision = 0
  somme_rappel = 0
  somme_score_f1 = 0
  nb_documents = len(data_test['document'].unique())
  for titre in data_test['document'].unique():
    document_specifique21 = data_test[data_test['document'] == titre]
    liste_caractères_spacy(document_specifique21['content'])
    texte_concatene21 = combined_texts_test[titre]
    doc21 = nlp(texte_concatene21)
    sentence_personnalise = [sent.text for sent in doc21.sents]
    liste_caractères_spacy(sentence_personnalise)

    # tokenisation1 et tokenisation2 sont des listes de tuples (token, étiquette)
    tokenisation1_spacy11 =  liste_caractères_spacy(sentence_personnalise)
    tokenisation2_spacy11 = liste_caractères_spacy(document_specifique21['content'])
    # Extraire les étiquettes pour la comparaison
    etiquettes1_spacy12  = [int(etiquette) for _, etiquette in tokenisation1_spacy11 ]
    etiquettes2_spacy12 = [int(etiquette) for _, etiquette in tokenisation2_spacy11 ]

    # Calcul des métriques
    somme_precision += precision_score(etiquettes1_spacy12 , etiquettes2_spacy12 )
    somme_rappel += recall_score(etiquettes1_spacy12 , etiquettes2_spacy12)
    somme_score_f1 += f1_score(etiquettes1_spacy12 , etiquettes2_spacy12)

  return somme_precision/nb_documents,somme_rappel/nb_documents,somme_score_f1/nb_documents

In [None]:
precision1,rappel1,score_f11 = calcul_metrique_personnalise_spacy_test(dataset_test)
print(f"Précision: {precision1}, Rappel: {rappel1}, Score F1: {score_f11}")

Précision: 0.7932049971690552, Rappel: 0.7226777670864187, Score F1: 0.7516027329287377


In [None]:
print("Composants du pipeline:", nlp.pipe_names)


Composants du pipeline: ['tok2vec', 'tagger', 'custom_sentence_boundary', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
