In [2]:
import pandas as pd

# Configure pandas so printed frames are not truncated
pd.options.display.max_columns = None   # show every column
pd.options.display.width = 1000         # widen the total printed width
pd.options.display.max_colwidth = None  # show the full content of each cell

# Load the data
data_path = '../data/clean/animal_images_cleaned.csv'
df = pd.read_csv(data_path)

# Print the first rows of the DataFrame as a sanity check
print(df.head(n=5))


   id                          visual                                                                                                                                          caption
0   1                     kodiak bear                                               Kodiak Bear (Ursus arctos middendorffi) in Kodiak National Wildlife Refuge, Alaska, United States.
1   2  Lamb   Animal Park Muggensturm                                                                                    Resting lamb, (Ovis aries); Animal Park Muggensturm, Germany.
2   3           Animal diversity bpng  A representation of the diversity of animals, seeking not to favour the large and familiar, so there is only one vertebrate here, for instance.
3   4                Animal diversity                                                                                                                                 Animal Diversity
4   5   Animal diversity compositepng                                                

In [3]:
from datasets import Dataset, load_from_disk

# Directory used to persist the dataset for later use
dataset_dir = '../dataset'

# Convert the DataFrame to a Dataset and save it to disk
Dataset.from_pandas(df).save_to_disk(dataset_dir)

# Reload the dataset from disk
dataset = load_from_disk(dataset_dir)

# Print the dataset summary, then the first five rows of the reloaded dataset
print(dataset, dataset[:5], sep='\n')


Saving the dataset (1/1 shards): 100%|██████████| 33/33 [00:00<00:00, 8096.16 examples/s] 

Dataset({
    features: ['id', 'visual', 'caption'],
    num_rows: 33
})
{'id': [1, 2, 3, 4, 5], 'visual': ['kodiak bear', 'Lamb   Animal Park Muggensturm', 'Animal diversity bpng', 'Animal diversity', 'Animal diversity compositepng'], 'caption': ['Kodiak Bear (Ursus arctos middendorffi) in Kodiak National Wildlife Refuge, Alaska, United States.', 'Resting lamb, (Ovis aries); Animal Park Muggensturm, Germany.', 'A representation of the diversity of animals, seeking not to favour the large and familiar, so there is only one vertebrate here, for instance.', 'Animal Diversity', 'Composite of animal species from different phyla']}





In [4]:
import spacy

# Load the spaCy model
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)

# Function that annotates a text with part-of-speech tags
def annotate_text(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp`` and tag each token.

    Args:
        text: The string to annotate. Non-string values (for example the
            float NaN that pandas uses for an empty CSV cell, or ``None``)
            and the empty string are accepted and yield no annotations.

    Returns:
        A list of ``(token.text, token.pos_)`` tuples, one per token, in
        document order. An empty list when there is nothing to annotate.
    """
    # The spaCy pipeline only accepts text input; a missing caption read from
    # a CSV arrives as a float NaN and would otherwise abort the whole
    # DataFrame.apply call. Treat anything that is not a non-empty string as
    # "no tokens".
    if not isinstance(text, str) or not text:
        return []

    doc = nlp(text)
    return [(token.text, token.pos_) for token in doc]

# Annotate the caption column (the most important and richest column of the DataFrame)
caption_annotations = df['caption'].apply(annotate_text)
df['annotated_caption'] = caption_annotations

# Print the results
annotation_preview = df[['caption', 'annotated_caption']].head()
print(annotation_preview)


                                                                                                                                           caption                                                                                                                                                                                                                                                                                                                                                                                            annotated_caption
0                                               Kodiak Bear (Ursus arctos middendorffi) in Kodiak National Wildlife Refuge, Alaska, United States.                                                                                                                      [(Kodiak, PROPN), (Bear, PROPN), ((, PUNCT), (Ursus, PROPN), (arctos, NOUN), (middendorffi, ADV), (), PUNCT), (in, ADP), (Kodiak, PROPN), (National, PROPN), (Wildlife, PROPN), (Refuge, PROPN),

In [5]:
# Save the annotated results to a new CSV file if needed
from pathlib import Path

annotations_path = Path('../data/annotations/annotated_animal_images.csv')

# DataFrame.to_csv does not create missing parent folders and fails if the
# target directory is absent, so make sure it exists before writing.
annotations_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: CSV has no list type; each 'annotated_caption' list of tuples is
# written as its string form and comes back as a plain string when re-read.
df_annot = df[['caption', 'annotated_caption']]
df_annot.to_csv(annotations_path, index=False)