**Aminata THIOUNE**

Groupe Augias

**<h1>MongoDB</h1>**

**MongoDB** est une base de données NoSQL orientée document qui a gagné en popularité en raison de sa flexibilité et de sa capacité à gérer des données non structurées à grande échelle. Contrairement aux bases de données relationnelles qui utilisent des tables et des lignes, MongoDB stocke les données sous forme de documents BSON (Binary JSON), permettant ainsi une structure de données plus souple et évolutive. Cela signifie que chaque document peut avoir un schéma différent, offrant une grande liberté lors de la modélisation des données.

L'un des principaux avantages de MongoDB est sa capacité à se scalabiliser horizontalement, ce qui permet de répartir les données sur plusieurs serveurs pour gérer des volumes de trafic importants. De plus, ses fonctionnalités intégrées, telles que l'indexation, la recherche textuelle et l'agrégation, facilitent l'accès et l'analyse des données. MongoDB est couramment utilisé dans des applications web modernes, des systèmes de gestion de contenu, et des plateformes d'e-commerce, où la rapidité et l'agilité dans la gestion des données sont essentielles.



##**Test**

In [None]:
!pip install pymongo

In [3]:
from pymongo import MongoClient

In [None]:
client = MongoClient('mongodb+srv://username:password@mongodb.3esaj.mongodb.net/?retryWrites=true&w=majority&appName=MongoDB')


In [5]:
# Créer une collection
database = client["tp_mongodb"]
collection = database["test"]

document_list = {"name" : "amina", "status" : "student"}, {"name" : "Dior", "status" : "student" }

# Pour insérer plusieurs documents dans la collection
# on peut utliser insert_one pour ajouter uun seul document
result = collection.insert_many(document_list)

print(result.acknowledged)


True


In [6]:
# Afficher la liste des collections
database.list_collection_names()

['test']

In [7]:
# Compter le nombre de documents dans la collection
count = collection.count_documents({})
print(count)

2


In [8]:
# Affcher les documents de la collection
# Utiliser find_one pour chercher un seul document de la collection
find = collection.find()
for document in find:
  print(document)

{'_id': ObjectId('67366dcacddcbbed4846a164'), 'name': 'amina', 'status': 'student'}
{'_id': ObjectId('67366dcacddcbbed4846a165'), 'name': 'Dior', 'status': 'student'}


In [9]:
# Retourne une estimation du nombre de documents dans la collection
count_2 = collection.estimated_document_count()
print(count_2)

2


In [10]:
# Affiche toutes les valeurs distinctes du champ "name" dans la collection
results_2 = collection.distinct("name")
for document in results_2:
    print(document)

Dior
amina


In [11]:
# Modifier un champs dans la collection
query_filter = { "name" : "amina" }
update_operation = { "$set" :
    { "name" : "Amy" }
}
result_3 = collection.update_one(query_filter, update_operation)
print(result_3.modified_count)

1


In [12]:
# Insérer un document dans la collection
result_4 =  collection.insert_one({"name" : "Fatou", "status" : "student"})
print(result_4.acknowledged)

True


In [13]:
# remplace complètement un document dans la collection avec un nouveau document sans modifierl'identifiant
query_filter = { "Age" : "19" }
replace_document = {"name" :"Dior", "Status" : "Student", "Age" : "19" }
result = collection.replace_one(query_filter, replace_document)
print(result.modified_count)

0


In [14]:
finds = collection.find()
for document in finds:
  print(document)

{'_id': ObjectId('67366dcacddcbbed4846a164'), 'name': 'Amy', 'status': 'student'}
{'_id': ObjectId('67366dcacddcbbed4846a165'), 'name': 'Dior', 'status': 'student'}
{'_id': ObjectId('67366dddcddcbbed4846a166'), 'name': 'Fatou', 'status': 'student'}


In [15]:
# Supprimer un document
query_filter = { "name" : "Fatou" }
result = collection.delete_one(query_filter)
print(result.deleted_count)

1


In [16]:
finds = collection.find()
for document in finds:
  print(document)

{'_id': ObjectId('67366dcacddcbbed4846a164'), 'name': 'Amy', 'status': 'student'}
{'_id': ObjectId('67366dcacddcbbed4846a165'), 'name': 'Dior', 'status': 'student'}


In [17]:
# Supprimer la base de données
client.drop_database('tp_mongodb')

##**Jointure sur des objets JSON**

---



In [None]:
import csv


def csv_to_json_first_method(csv_file):

    from json import dumps
    #create a dictionary
    data_dict = {}
    my_dict = {}
    with open(csv_file, encoding = 'latin1') as csvfile:
        my_reader = csv.DictReader(csvfile)
        print(my_reader.fieldnames)
        my_data = [my_row for my_row in my_reader]
        for my_row in my_data:
            #print(my_row)
            my_dict = {}
            my_dict[my_reader.fieldnames[0]] = my_row[my_reader.fieldnames[0]]
            my_dict[my_reader.fieldnames[1]] = my_row[my_reader.fieldnames[1]]
            data_dict[my_row[my_reader.fieldnames[2]]] = my_dict
    print("====================")
    my_my_dict = {}
    my_my_dict['test'] = data_dict
    print(my_my_dict)
    #for item in data_dict.items():
    #    print(item)
    #
    # convert both intermediary results to JSON object
    #
    y = dumps(my_my_dict)
    print("====================")
    print(y)
    print(type(y))
    print("====================")

    return y

def csv_to_json_second_method(csv_file):

    from json import dumps
    #create a dictionary
    data_dict = {}
    csv_rows = []
    #open a csv file handlerh
    with open(csv_file, encoding = 'latin1', newline='') as csv_file_handler:
        csv_reader = csv.DictReader(csv_file_handler)
        field = csv_reader.fieldnames
        for row in csv_reader:
            #print([{field[i]:row[field[i]] for i in range(len(field))}])
            csv_rows.extend([{field[i]:row[field[i]] for i in range(len(field))}])

    print("====================")
    data_dict['test'] = csv_rows
    #print(type(csv_rows))
    print(data_dict)
    #print(data_dict['test'][0])
    #print("====================")

    #
    # convert intermediary results to JSON object
    #
    z = dumps(data_dict)
    print("====================")
    print(z)
    print(type(z))
    print("====================")

    return z



def jointure(json1, json2):

    from json import loads
    from json import dumps

    # First, transform json objects to dictionaries

    d1_name = list(loads(json1))[0]
    #print(d1_name)
    d2_name = list(loads(json2))[0]
    #print(d2_name)

    d1 = loads(json1)[d1_name]
    d2 = loads(json2)[d2_name]

    #print(att_name,type(att_name))
    # Second, iterate through dictionaries
    d_res = {}
    for key1, val1 in d1.items():
        #print(key1, '==', val1)
        for key2, val2 in d2.items():
            #print(key1, '==', key2)
            #print([ord(c) for c in key1],key1,[ord(c) for c in att_name],att_name)
            if key1 == key2:
                d = {}
                d.update(val1)
                d.update(val2)
                #print(d)
                d_res[key1] = d
    my_my_dict = {}
    my_my_dict['test'] = d_res
    z = dumps(my_my_dict)

    return z

# Main program

json_one = csv_to_json_first_method("../data/test.csv")
json_two = csv_to_json_first_method("../data/test_1.csv")

d = jointure (json_one, json_two)
print(d)

['Test 2', 'Test 1', 'Prénom et nom']
{'test': {'Hervé P': {'Test 2': '12.00 / 18', 'Test 1': '10.00 / 16'}, 'Laurent H': {'Test 2': '12.00 / 18', 'Test 1': '15.00 / 16'}, 'Destin L': {'Test 2': '7.00 / 18', 'Test 1': '11.00 / 16'}, 'Guillaume C': {'Test 2': '4.00 / 18', 'Test 1': '10.00 / 16'}, 'Haytem D': {'Test 2': '7.00 / 18', 'Test 1': '12.00 / 16'}, 'Cam Chau N': {'Test 2': '9.00 / 18', 'Test 1': '6.00 / 16'}, 'Sarra Z': {'Test 2': '6.00 / 18', 'Test 1': '11.00 / 16'}, 'Romulus L': {'Test 2': '11.00 / 18', 'Test 1': '11.00 / 16'}, 'Thierno D': {'Test 2': '8.00 / 18', 'Test 1': '5.00 / 16'}, 'Rosenthal Preston R': {'Test 2': '13.00 / 18', 'Test 1': '13.00 / 16'}, 'Betty T': {'Test 2': '11.00 / 18', 'Test 1': '11.00 / 16'}, 'Mouloud B': {'Test 2': '17.00 / 18', 'Test 1': '13.00 / 16'}, 'Joseph L': {'Test 2': '11.00 / 18', 'Test 1': '11.00 / 16'}, 'Nataliya P': {'Test 2': '9.00 / 18', 'Test 1': '10.00 / 16'}}}
{"test": {"Herv\u00e9 P": {"Test 2": "12.00 / 18", "Test 1": "10.00 / 16"

##**JSON/MongoDB**

In [19]:
import json


def json_to_mongodb(json_file):
    # Charger le contenu du fichier JSON
    with open(json_file, 'r') as f:
        data = json.load(f)

    # Récupérer les éléments sous 'test', qui est un dictionnaire de dictionnaires
    items = data.get('test', {})

    # Créer une collection de mongodb
    collection = database["exo_mongo"]

    # Vérifier que 'items' est bien un dictionnaire
    if isinstance(items, dict):
        # Créer une liste de documents où chaque document inclut le nom
        documents = [{'name': name, 'notes': scores} for name, scores in items.items()]

        # Insérer les documents dans MongoDB
        collection.insert_many(documents)
    else:
        return "Le contenu sous 'test' n'est pas un dictionnaire."

    return collection



In [20]:
# Fonction pour sauvegarder des données JSON dans un fichier
def save_json_to_file(json_data, json_file):
    with open(json_file, 'w') as f:
        # Si `json_data` est une chaîne, la convertir en dictionnaire
        if isinstance(json_data, str):
            try:
                json_data = json.loads(json_data)
            except (TypeError, json.JSONDecodeError):
                pass
        json.dump(json_data, f, indent=4)


json_file = 'data.json'

save_json_to_file(json_one, json_file)

collection_2 = json_to_mongodb(json_file)


In [21]:
# Afficher la collection
result = collection_2.find()
for document in result:
  print(document)

{'_id': ObjectId('67366e1ecddcbbed4846a167'), 'name': 'Hervé P', 'notes': {'Test 2': '12.00 / 18', 'Test 1': '10.00 / 16'}}
{'_id': ObjectId('67366e1ecddcbbed4846a168'), 'name': 'Laurent H', 'notes': {'Test 2': '12.00 / 18', 'Test 1': '15.00 / 16'}}
{'_id': ObjectId('67366e1ecddcbbed4846a169'), 'name': 'Destin L', 'notes': {'Test 2': '7.00 / 18', 'Test 1': '11.00 / 16'}}
{'_id': ObjectId('67366e1ecddcbbed4846a16a'), 'name': 'Guillaume C', 'notes': {'Test 2': '4.00 / 18', 'Test 1': '10.00 / 16'}}
{'_id': ObjectId('67366e1ecddcbbed4846a16b'), 'name': 'Haytem D', 'notes': {'Test 2': '7.00 / 18', 'Test 1': '12.00 / 16'}}
{'_id': ObjectId('67366e1ecddcbbed4846a16c'), 'name': 'Cam Chau N', 'notes': {'Test 2': '9.00 / 18', 'Test 1': '6.00 / 16'}}
{'_id': ObjectId('67366e1ecddcbbed4846a16d'), 'name': 'Sarra Z', 'notes': {'Test 2': '6.00 / 18', 'Test 1': '11.00 / 16'}}
{'_id': ObjectId('67366e1ecddcbbed4846a16e'), 'name': 'Romulus L', 'notes': {'Test 2': '11.00 / 18', 'Test 1': '11.00 / 16'}}
{

In [22]:
# Supprimer la collection
database.drop_collection("exo_mongo")

{'nIndexesWas': 1,
 'ns': 'tp_mongodb.exo_mongo',
 'ok': 1.0,
 '$clusterTime': {'clusterTime': Timestamp(1731620390, 4),
  'signature': {'hash': b'\x12\xc1Z\xb4\xce\xe3\xd78D\x0e\xd5?\x11\xd5-\xb7:\xd7\xe6r',
   'keyId': 7374129255736672259}},
 'operationTime': Timestamp(1731620390, 4)}

##**Jointure de représentations JSON et interactions avec MongoDB**

In [None]:
"""
  1. Conversion CSV en JSON : Le code lit un fichier CSV, extrait les données des trois premières colonnes, et les organise dans un dictionnaire structuré, avant de le convertir en JSON.
  2. Jointure MongoDB : Il récupère deux documents à partir de MongoDB via leurs identifiants, puis effectue une jointure en fusionnant les clés communes des deux documents, en excluant les champs `_id`.
  3. Programme principal : Le programme insère deux fichiers CSV convertis en JSON dans MongoDB, effectue une jointure sur ces documents, puis affiche et nettoie la base de données.
"""

def csv_to_json_first_method(csv_file):

    from json import dumps
    #create a dictionary
    data_dict = {}
    my_dict = {}
    with open(csv_file, encoding = 'latin1') as csvfile:
        my_reader = csv.DictReader(csvfile)
        #print(my_reader.fieldnames)
        my_data = [my_row for my_row in my_reader]
        for my_row in my_data:
            #print(my_row)
            my_dict = {}
            my_dict[my_reader.fieldnames[0]] = my_row[my_reader.fieldnames[0]]
            my_dict[my_reader.fieldnames[1]] = my_row[my_reader.fieldnames[1]]
            data_dict[my_row[my_reader.fieldnames[2]]] = my_dict
    #print("====================")
    my_my_dict = {}
    my_my_dict['test'] = data_dict
  
    # convert both intermediary results to JSON object
    y = dumps(my_my_dict)
  
   

    return y



def jointure(mc,id1,id2):

    print(type(mc),id1,id2)
    doc1 = mc.find({'_id':id1})
    doc2 = mc.find({'_id':id2})

    # Second, iterate through dictionaries
    d_res = {}
    for d1 in doc1:
        d11 = list(d1.keys())
        res1 = d1
        #print('==',d11,'==')
    for d2 in doc2:
        d22 = list(d2.keys())
        res2 = d2
        #print('==',d22,'==')
    for d_111 in d11:
       for d_222 in d22:
           if d_111 != '_id' and d_222 != '_id':
               if d_111 == d_222:
                   d = {}
                   d.update(res1[d_111])
                   d.update(res2[d_222])
                   #print(d)
                   d_res[d_111] = d
                   #print("**",d_111,d_222,"**")
    my_my_dict = {}
    my_my_dict['test'] = d_res
    z = dumps(my_my_dict)

    # Save the join in the collection
    mc.insert_one(my_my_dict)

    return z

# Main program

if __name__ == "__main__":


    mydb = client["mydatabase"]
    mycol = mydb["mycollection"]


    from json import loads
    from json import dumps

    # First, transform json objects to dictionaries

    d1_name = list(loads(json_one))[0]
    #print(d1_name)
    d2_name = list(loads(json_two))[0]
    #print(d2_name)

    d1 = loads(json_one)[d1_name]
    d2 = loads(json_two)[d2_name]

    # store them into MongoDB

    #client.test_database.drop()
    post_id_one = mycol.insert_one(d1).inserted_id
    post_id_two = mycol.insert_one(d2).inserted_id

    # compute the join

    d = jointure (mycol,post_id_one,post_id_two)
    #print(d)

    # print the 3 documents in the collection
    from pprint import pprint

    cursor = mycol.find({})
    for document in cursor:
        pprint(document)

    # On fait du mÃ©nage
    mycol.drop()
    client.drop_database('mydatabase')

<class 'pymongo.synchronous.collection.Collection'> 67366e34cddcbbed4846a175 67366e34cddcbbed4846a176
{'Betty T': {'Test 1': '11.00 / 16', 'Test 2': '11.00 / 18'},
 'Cam Chau N': {'Test 1': '6.00 / 16', 'Test 2': '9.00 / 18'},
 'Destin L': {'Test 1': '11.00 / 16', 'Test 2': '7.00 / 18'},
 'Guillaume C': {'Test 1': '10.00 / 16', 'Test 2': '4.00 / 18'},
 'Haytem D': {'Test 1': '12.00 / 16', 'Test 2': '7.00 / 18'},
 'Hervé P': {'Test 1': '10.00 / 16', 'Test 2': '12.00 / 18'},
 'Joseph L': {'Test 1': '11.00 / 16', 'Test 2': '11.00 / 18'},
 'Laurent H': {'Test 1': '15.00 / 16', 'Test 2': '12.00 / 18'},
 'Mouloud B': {'Test 1': '13.00 / 16', 'Test 2': '17.00 / 18'},
 'Nataliya P': {'Test 1': '10.00 / 16', 'Test 2': '9.00 / 18'},
 'Romulus L': {'Test 1': '11.00 / 16', 'Test 2': '11.00 / 18'},
 'Rosenthal Preston R': {'Test 1': '13.00 / 16', 'Test 2': '13.00 / 18'},
 'Sarra Z': {'Test 1': '11.00 / 16', 'Test 2': '6.00 / 18'},
 'Thierno D': {'Test 1': '5.00 / 16', 'Test 2': '8.00 / 18'},
 '_id'

##**Intersections de représentations JSON et interactions avec MongoDB**

In [None]:
"""
  1. Conversion CSV en JSON : Le code lit un fichier CSV, extrait les données, les organise dans un dictionnaire structuré, puis les convertit en JSON.
  2. Intersection MongoDB : Il calcule l'intersection des clés communes entre deux documents MongoDB et fusionne leurs valeurs.
  3. Programme principal : Le programme insère deux fichiers CSV convertis en JSON dans MongoDB, effectue une intersection des documents, puis affiche et supprime les données.
"""


def csv_to_json_first_method(csv_file):

    from json import dumps
    #create a dictionary
    data_dict = {}
    my_dict = {}
    with open(csv_file, encoding = 'latin1') as csvfile:
        my_reader = csv.DictReader(csvfile)
        #print(my_reader.fieldnames)
        my_data = [my_row for my_row in my_reader]
        for my_row in my_data:
            #print("==",my_row,"==",len(my_row),type(my_row))
            my_dict = {}
            i = 0
            for my_key,my_val in my_row.items():
                if my_key == 'PrÃ©nom et nom':
                    data_dict[my_row[my_reader.fieldnames[i]]] = my_dict
                else:
                    my_dict[my_reader.fieldnames[i]] = my_row[my_reader.fieldnames[i]]
                i = i+1

          
    #print("====================")
    my_my_dict = {}
    my_my_dict['test'] = data_dict
    #
    # convert both intermediary results to JSON object
    #
    y = dumps(my_my_dict)

    return y

class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)

#
# Dictionnaries intersection
#
def dict_intersect(*dicts):
    comm_keys = dicts[0].keys()
    for d in dicts[1:]:
        # intersect keys first
        comm_keys &= d.keys()
    # then build a result dict with nested comprehension
    result = {key:[d[key] for d in dicts] for key in comm_keys}
    # if you choose the following representation, we keep less values. Why?
    #result = {key:{d[key] for d in dicts} for key in comm_keys}
    res = {}
    for key,val in result.items():
        #print(key,'==',val)
        data_str = json.dumps(val, cls=SetEncoder)
        #print(data_str)
        res[key] = data_str

    return res


def intersection(mc,id1,id2):

    #print(type(mc),id1,id2)
    doc1 = mc.find({'_id':id1})
    doc2 = mc.find({'_id':id2})

    # Second, iterate through dictionaries
    d_res = {}
    for d1 in doc1:
        d11 = list(d1.keys())
        res1 = d1
        #print('==',d11,'==')
    for d2 in doc2:
        d22 = list(d2.keys())
        res2 = d2
        #print('==',d22,'==')
    d_res = {}
    for d_111 in d11:
       for d_222 in d22:
           if d_111 != '_id' and d_222 != '_id':
               if d_111 == d_222:
                   #d.update(res1[d_111])
                   #d.update(res2[d_222])
                   d_res[d_111] = dict_intersect(res1[d_111],res2[d_222])
                   #print("**",d_111,d_222,"**")
    my_my_dict = {}
    my_my_dict['test'] = d_res

    z = dumps(my_my_dict)

    # Save the intersect in the collection
    mc.insert_one(my_my_dict)

    return z

# Main program

if __name__ == "__main__":

    mydb = client["mydatabase"]
    mycol = mydb["mycollection"]

   

    from json import loads
    from json import dumps

    # First, transform json objects to dictionaries

    d1_name = list(loads(json_one))[0]
    #print(d1_name)
    d2_name = list(loads(json_two))[0]
    #print(d2_name)

    d1 = loads(json_one)[d1_name]
    d2 = loads(json_two)[d2_name]
    #print(d1)
    #print(d2)

    # store them into MongoDB

    #client.test_database.drop()
    post_id_one = mycol.insert_one(d1).inserted_id
    post_id_two = mycol.insert_one(d2).inserted_id

    # compute the intersect
    d = intersection (mycol,post_id_one,post_id_two)

    # print the 3 documents in the collection
    from pprint import pprint

    cursor = mycol.find({})
    for document in cursor:
        pprint(document)

    # On fait du mÃ©nage
    mycol.drop()
    client.drop_database('mydatabase')

{'Betty T': {'Test 1': '11.00 / 16', 'Test 2': '11.00 / 18'},
 'Cam Chau N': {'Test 1': '6.00 / 16', 'Test 2': '9.00 / 18'},
 'Destin L': {'Test 1': '11.00 / 16', 'Test 2': '7.00 / 18'},
 'Guillaume C': {'Test 1': '10.00 / 16', 'Test 2': '4.00 / 18'},
 'Haytem D': {'Test 1': '12.00 / 16', 'Test 2': '7.00 / 18'},
 'Hervé P': {'Test 1': '10.00 / 16', 'Test 2': '12.00 / 18'},
 'Joseph L': {'Test 1': '11.00 / 16', 'Test 2': '11.00 / 18'},
 'Laurent H': {'Test 1': '15.00 / 16', 'Test 2': '12.00 / 18'},
 'Mouloud B': {'Test 1': '13.00 / 16', 'Test 2': '17.00 / 18'},
 'Nataliya P': {'Test 1': '10.00 / 16', 'Test 2': '9.00 / 18'},
 'Romulus L': {'Test 1': '11.00 / 16', 'Test 2': '11.00 / 18'},
 'Rosenthal Preston R': {'Test 1': '13.00 / 16', 'Test 2': '13.00 / 18'},
 'Sarra Z': {'Test 1': '11.00 / 16', 'Test 2': '6.00 / 18'},
 'Thierno D': {'Test 1': '5.00 / 16', 'Test 2': '8.00 / 18'},
 '_id': ObjectId('67366e43cddcbbed4846a178')}
{'Betty T': {'Test 1': '11.00 / 16', 'Test 2': '11.00 / 18'},


##**INDEX**

En MongoDB, un index est une structure de données spéciale qui accélère les requêtes sur certains champs en facilitant la recherche de documents. Cependant, la création et la gestion des index peuvent entraîner des coûts supplémentaires en termes de mémoire et de performances d'écriture. Un index trie les valeurs des champs spécifiés, permettant à MongoDB de localiser rapidement les documents pertinents sans devoir parcourir l'intégralité de la collection.


Les index sont stockés en mémoire (RAM) pour améliorer les performances, ce qui peut augmenter l'utilisation de la mémoire si plusieurs index sont créés. Des index mal optimisés ou inutiles peuvent aussi affecter négativement les performances globales, car leur maintenance devient coûteuse en ressources.

In [None]:
#
# Performance of MongoDB with indexes
#
import csv
from json import dumps
import time

def perf_mongo(csv_file, n):

    from pymongo import MongoClient

    mydb = client["mydatabase"]
    mycol = mydb["mycollection"]
    # Empty the collection
    mycol.drop()
    # Create an index for the collection
    mycol.create_index([ ('M', 1) ])

    with open(csv_file, encoding = 'utf-8') as csvfile:
        my_reader = csv.DictReader(csvfile,delimiter='\t')
        my_data = [my_row for my_row in my_reader]
        #print(my_data)
        pres = dup = 0
        print('Start performance eval over',n,'inputs')
        # get the start time
        st = time.process_time()
        for my_row in my_data[0:n]:
            # print(my_row['M'],type(my_row['M']))
            #
            # find and replace <=> test if key exists
            #
            mycol.replace_one({my_row['M']: 1},{my_row['M']:1},upsert=True,hint=[ ('M', 1) ])
        # get the end time
        et = time.process_time()
        # get execution time
        res = et - st
        print('CPU Execution time:', res, 'seconds')
        print('We found',n - mycol.count_documents({}),'duplicates in the input')
        print()
        print('Wall time (also known as clock time or wall-clock time) is simply the total time')
        print('elapsed during the measurement. Itâ€™s the time you can measure with a stopwatch.')
        print('It is the difference between the time at which a program finished its execution and')
        print('the time at which the program started. It also includes waiting time for resources.')
        print()
        print('CPU Time, on the other hand, refers to the time the CPU was busy processing')
        print('the programâ€™s instructions. The time spent waiting for other task to complete')
        print('(like I/O operations) is not included in the CPU time. It does not include')
        print('the waiting time for resources.')
#Step 1

perf_mongo("../data/DEMO.csv", 14000)

Start performance eval over 14000 inputs
CPU Execution time: 35.610781425999996 seconds
We found 1509 duplicates in the input

Wall time (also known as clock time or wall-clock time) is simply the total time
elapsed during the measurement. Itâ€™s the time you can measure with a stopwatch.
It is the difference between the time at which a program finished its execution and
the time at which the program started. It also includes waiting time for resources.

CPU Time, on the other hand, refers to the time the CPU was busy processing
the programâ€™s instructions. The time spent waiting for other task to complete
(like I/O operations) is not included in the CPU time. It does not include
the waiting time for resources.


##**AGREGATION**

L'**agrégation** en MongoDB est un processus permettant de transformer et d'analyser des ensembles de documents via une série d'étapes dans une pipeline. Ces étapes, comme `$match`, `$group`, `$project`, et `$sort`, permettent de filtrer, regrouper, modifier, et trier les données pour obtenir des résultats spécifiques ou des résumés, tels que des statistiques et des rapports.

In [26]:
import csv
from json import dumps
import time

def perf_mongo(csv_file, n):

    from pymongo import MongoClient

    mydb = client["mydatabase"  ]
    mycol = mydb["mycollection"]
    # Empty the collection
    mycol.drop()
    # Create an index for the collection
    mycol.create_index([ ('M', 1) ])

    with open(csv_file, encoding = 'utf-8') as csvfile:
        my_reader = csv.DictReader(csvfile,delimiter='\t')
        my_data = [my_row for my_row in my_reader]
        #print(my_data)
        pres = dup = 0
        print('Start performance eval over',n,'inputs')
        # get the start time
        st = time.process_time()
        for my_row in my_data[0:n]:
            # print(my_row['M'],type(my_row['SENS']))
            #
            # insert {"M": my_row['M'], "SENS": my_row['SENS'] }
            #
            mycol.insert_one({"M": my_row['M'], "SENS": my_row['SENS'] })
        # get the end time
        et = time.process_time()
        # get execution time
        res = et - st
        print('CPU Execution time:', res, 'seconds')
        print()
        print('Wall time (also known as clock time or wall-clock time) is simply the total time')
        print('elapsed during the measurement. Itâ€™s the time you can measure with a stopwatch.')
        print('It is the difference between the time at which a program finished its execution and')
        print('the time at which the program started. It also includes waiting time for resources.')
        print()
        print('CPU Time, on the other hand, refers to the time the CPU was busy processing')
        print('the programâ€™s instructions. The time spent waiting for other task to complete')
        print('(like I/O operations) is not included in the CPU time. It does not include')
        print('the waiting time for resources.')

        #
        # We search all entries for word 'Ã  cheval sur'
        #
        print('================================================')
        print(' Find all definition(s) for word "Ã  cheval sur" ')
        print('================================================')

        # Recherche d'un mot spécifique sans $where
        word_to_find = "à cheval sur"  # corriger l'encodage si nécessaire
        for doc in mycol.find({"M": word_to_find}):
            print(doc)


        #
        # We compute the number of definition for each word
        #
        agg_result = mycol.aggregate(
            [{
                "$group" :
                {"_id" : "$M",
                 "Total" : {"$sum" : 1}
                 }}
             ])
        print('================================')
        print(" Number of definitions per word ")
        print('================================')
        for i in agg_result:
            print(i)

        #
        # We merge all the definition of a word
        #
        agg_result = mycol.aggregate(
            [  { "$sort": { "M": 1 } },
               {
                   "$group":
                               {
                                   "_id": "$M",
                                   #"meaning": { "$push":  { "item": "$SENS" } }
                                   "meaning": { "$push":  "$SENS" }
                               }
               }
             ]
        )
        print('==============================')
        print("    Merging of definitions")
        print('==============================')
        for i in agg_result:
            print(i)


#Step 1

perf_mongo("../data/DEMO.csv", 100)

Start performance eval over 100 inputs
CPU Execution time: 0.09375 seconds

Wall time (also known as clock time or wall-clock time) is simply the total time
elapsed during the measurement. Itâ€™s the time you can measure with a stopwatch.
It is the difference between the time at which a program finished its execution and
the time at which the program started. It also includes waiting time for resources.

CPU Time, on the other hand, refers to the time the CPU was busy processing
the programâ€™s instructions. The time spent waiting for other task to complete
(like I/O operations) is not included in the CPU time. It does not include
the waiting time for resources.
 Find all definition(s) for word "Ã  cheval sur" 
{'_id': ObjectId('67366e6ccddcbbed4846a1b9'), 'M': 'à cheval sur', 'SENS': '(assis)à califourchon'}
{'_id': ObjectId('67366e6ccddcbbed4846a1ba'), 'M': 'à cheval sur', 'SENS': 'intransigeant sr qc'}
{'_id': ObjectId('67366e6ccddcbbed4846a1bb'), 'M': 'à cheval sur', 'SENS': 'ne pa

In [None]:
"""
# Recherche d'un mot spécifique sans $where
word_to_find = "à cheval sur"  # corriger l'encodage si nécessaire
for doc in mycol.find({"M": word_to_find}):
    print(doc)

reformulation faite pour que le code marche
"""

In [27]:
from pymongo import MongoClient



client.drop_database('mydatabase')

db = client["mydatabase"]

#Create 'Books' collection
db.Matiere1.insert_many([
    { 'Test_2':'12.00 / 18','Test_1':'10.00 / 16','Prenom_nom':'HervÃ© P' },
    { 'Test_2':'12.00 / 18','Test_1':'15.00 / 16','Prenom_nom':'Laurent H' },
    { 'Test_2':'7.00 / 18','Test_1':'11.00 / 16','Prenom_nom':'Destin L' },
    { 'Test_2':'4.00 / 18','Test_1':'10.00 / 16','Prenom_nom':'Guillaume C' },
    { 'Test_2':'7.00 / 18','Test_1':'12.00 / 16','Prenom_nom':'Haytem D' },
    { 'Test_2':'9.00 / 18','Test_1':'6.00 / 16','Prenom_nom':'Cam Chau N' },
    { 'Test_2':'6.00 / 18','Test_1':'11.00 / 16','Prenom_nom':'Sarra Z' },
    { 'Test_2':'11.00 / 18','Test_1':'11.00 / 16','Prenom_nom':'Romulus L' },
    { 'Test_2':'8.00 / 18','Test_1':'5.00 / 16','Prenom_nom':'Thierno D' },
    { 'Test_2':'13.00 / 18','Test_1':'13.00 / 16','Prenom_nom':'Rosenthal Preston R' },
    { 'Test_2':'11.00 / 18','Test_1':'11.00 / 16','Prenom_nom':'Betty T' },
    { 'Test_2':'17.00 / 18','Test_1':'13.00 / 16','Prenom_nom':'Mouloud B' },
    { 'Test_2':'11.00 / 18','Test_1':'11.00 / 16','Prenom_nom':'Joseph L' },
    { 'Test_2':'9.00 / 18','Test_1':'10.00 / 16','Prenom_nom':'Nataliya P' }
 ])

#Create 'Authors' collection
db.Matiere2.insert_many([
    { 'T_3':'10.00 / 18','T_2':'12.00 / 18','T_1':'10.00 / 16','Nom':'HervÃ© P' },
    { 'T_3':'7.00 / 18','T_2':'12.00 / 18','T_1':'15.00 / 16','Nom':'Laurent H' },
    { 'T_3':'8.00 / 18','T_2':'7.00 / 18','T_1':'11.00 / 16','Nom':'Destin L' },
    { 'T_3':'10.00 / 18','T_2':'4.00 / 18','T_1':'10.00 / 16','Nom':'Guillaume C' },
    { 'T_3':'10.00 / 18','T_2':'7.00 / 18','T_1':'12.00 / 16','Nom':'Haytem D' },
    { 'T_3':'12.00 / 18','T_2':'9.00 / 18','T_1':'6.00 / 16','Nom':'Cam Chau N' },
    { 'T_3':'15.00 / 18','T_2':'6.00 / 18','T_1':'11.00 / 16','Nom':'Sarra Z' },
    { 'T_3':'14.00 / 18','T_2':'11.00 / 18','T_1':'11.00 / 16','Nom':'Romulus L' },
    { 'T_3':'13.00 / 18','T_2':'8.00 / 18','T_1':'5.00 / 16','Nom':'Thierno D' },
    { 'T_3':'11.00 / 18','T_2':'13.00 / 18','T_1':'13.00 / 16','Nom':'Rosenthal Preston R' },
    { 'T_3':'5.00 / 18','T_2':'11.00 / 18','T_1':'11.00 / 16','Nom':'Betty T' },
    { 'T_3':'4.00 / 18','T_2':'17.00 / 18','T_1':'13.00 / 16','Nom':'Mouloud B' },
    { 'T_3':'8.00 / 18','T_2':'11.00 / 18','T_1':'11.00 / 16','Nom':'Joseph L' },
    { 'T_3':'9.00 / 18','T_2':'9.00 / 18','T_1':'10.00 / 16','Nom':'Nataliya P' }
 ])

# Join two collections using $lookup operator
agg_result = db.Matiere1.aggregate([
   {
      "$lookup":
         {
           "from": "Matiere2",
           "localField": "Prenom_nom",
           "foreignField": "Nom",
           "as": "Detail"
         }
   }
])

from pprint import pprint

print("Type agrÃ©gation:",type(agg_result))

print("=====  AgrÃ©gation  =====")
for i in agg_result:
    pprint(i)


Type agrÃ©gation: <class 'pymongo.synchronous.command_cursor.CommandCursor'>
=====  AgrÃ©gation  =====
{'Detail': [{'Nom': 'HervÃ© P',
             'T_1': '10.00 / 16',
             'T_2': '12.00 / 18',
             'T_3': '10.00 / 18',
             '_id': ObjectId('67366ebecddcbbed4846a1ed')}],
 'Prenom_nom': 'HervÃ© P',
 'Test_1': '10.00 / 16',
 'Test_2': '12.00 / 18',
 '_id': ObjectId('67366ebecddcbbed4846a1df')}
{'Detail': [{'Nom': 'Laurent H',
             'T_1': '15.00 / 16',
             'T_2': '12.00 / 18',
             'T_3': '7.00 / 18',
             '_id': ObjectId('67366ebecddcbbed4846a1ee')}],
 'Prenom_nom': 'Laurent H',
 'Test_1': '15.00 / 16',
 'Test_2': '12.00 / 18',
 '_id': ObjectId('67366ebecddcbbed4846a1e0')}
{'Detail': [{'Nom': 'Destin L',
             'T_1': '11.00 / 16',
             'T_2': '7.00 / 18',
             'T_3': '8.00 / 18',
             '_id': ObjectId('67366ebecddcbbed4846a1ef')}],
 'Prenom_nom': 'Destin L',
 'Test_1': '11.00 / 16',
 'Test_2': '7.00 /

In [None]:
import csv
import json

def csv_to_json_first_method(csv_file):

    from json import dumps
    #create a dictionary
    data_dict = {}
    my_dict = {}
    with open(csv_file, encoding = 'latin1') as csvfile:
        my_reader = csv.DictReader(csvfile)
        #print(my_reader.fieldnames)
        my_data = [my_row for my_row in my_reader]
        for my_row in my_data:
            #print("==",my_row,"==",len(my_row),type(my_row))
            my_dict = {}
            i = 0
            for my_key,my_val in my_row.items():
                if my_key == 'PrÃ©nom et nom':
                    data_dict[my_row[my_reader.fieldnames[i]]] = my_dict
                else:
                    my_dict[my_reader.fieldnames[i]] = my_row[my_reader.fieldnames[i]]
                i = i+1

    my_my_dict = {}
    my_my_dict['test'] = data_dict
    #
    # convert both intermediary results to JSON object
    #
    y = dumps(my_my_dict)

    return y

def csv_to_json_second_method(csv_file):

    from json import dumps
    #create a dictionary
    data_dict = {}
    csv_rows = []
    #open a csv file handlerh
    with open(csv_file, encoding = 'latin1', newline='') as csv_file_handler:
        csv_reader = csv.DictReader(csv_file_handler)
        field = csv_reader.fieldnames
        for row in csv_reader:
            #print([{field[i]:row[field[i]] for i in range(len(field))}])
            csv_rows.extend([{field[i]:row[field[i]] for i in range(len(field))}])

    z = dumps(csv_rows)
   
    return z



class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)

# Main program

if __name__ == "__main__":

    from pymongo import MongoClient

    mydb = client["mydatabase"]
    mycol_one = mydb["mycollection_one"]
    mycol_two = mydb["mycollection_two"]

    json_one = csv_to_json_second_method("../data/test_join.csv")
    json_two = csv_to_json_second_method("../data/test2_join.csv")

    from json import loads
    from json import dumps

    # First, transform json objects to dictionaries

    d1_name = list(loads(json_one))
    #print(d1_name)
    d2_name = list(loads(json_two))
    #print(d2_name)

    #for row in d1_name:
    #    mycol_one.insert_one(row)
    mycol_one.insert_many(d1_name)

    #for row in d2_name:
    #    mycol_two.insert_one(row)
    mycol_two.insert_many(d2_name)

    # compute the join with the lookup aggregation
    agg_result = mycol_one.aggregate([
        {
            '$lookup': {
                'from': 'mycol_two',
                'localField': 'Prenom_nom',
                'foreignField': 'Nom',
                'as': 'join_sample'
            }
        }
    ])

    # print the 3 documents in the collection
    from pprint import pprint

    print("Type agrÃ©gation:",type(agg_result))

    print("=====  AgrÃ©gation  =====")
    for i in agg_result:
        pprint(i)

    print("===== Collection 1 =====")
    for i in mycol_one.find({}):
        pprint(i)

    print("===== Collection 2 =====")
    for i in mycol_two.find({}):
        pprint(i)

    # On fait du mÃ©nage
    mycol_one.drop()
    mycol_two.drop()
    client.drop_database('mydatabase')

Type agrÃ©gation: <class 'pymongo.synchronous.command_cursor.CommandCursor'>
=====  AgrÃ©gation  =====
{'Prenom_nom': 'Hervé P',
 'Test_1': '10.00 / 16',
 'Test_2': '12.00 / 18',
 '_id': ObjectId('67366eefcddcbbed4846a1fb'),
 'join_sample': []}
{'Prenom_nom': 'Laurent H',
 'Test_1': '15.00 / 16',
 'Test_2': '12.00 / 18',
 '_id': ObjectId('67366eefcddcbbed4846a1fc'),
 'join_sample': []}
{'Prenom_nom': 'Destin L',
 'Test_1': '11.00 / 16',
 'Test_2': '7.00 / 18',
 '_id': ObjectId('67366eefcddcbbed4846a1fd'),
 'join_sample': []}
{'Prenom_nom': 'Guillaume C',
 'Test_1': '10.00 / 16',
 'Test_2': '4.00 / 18',
 '_id': ObjectId('67366eefcddcbbed4846a1fe'),
 'join_sample': []}
{'Prenom_nom': 'Haytem D',
 'Test_1': '12.00 / 16',
 'Test_2': '7.00 / 18',
 '_id': ObjectId('67366eefcddcbbed4846a1ff'),
 'join_sample': []}
{'Prenom_nom': 'Cam Chau N',
 'Test_1': '6.00 / 16',
 'Test_2': '9.00 / 18',
 '_id': ObjectId('67366eefcddcbbed4846a200'),
 'join_sample': []}
{'Prenom_nom': 'Sarra Z',
 'Test_1': '1

Le problème  réside dans l'utilisation des champs pour la jointure. Dans l'agrégation, on a spécifié localField comme Prenom_nom et foreignField comme Nom. Pour que la jointure réussisse, les valeurs dans ces deux champs doivent correspondre. Si les valeurs ne correspondent pas, l'attribut join_sample dans les résultats de l'agrégation sera vide ce qui est le cas de code


Voici une version corrigée du code :

In [29]:

import csv
import json
from pymongo import MongoClient
from pprint import pprint

def csv_to_json_second_method(csv_file):
    from json import dumps
    csv_rows = []
    with open(csv_file, encoding='latin1', newline='') as csv_file_handler:
        csv_reader = csv.DictReader(csv_file_handler)
        for row in csv_reader:
            csv_rows.append(row)
    return dumps(csv_rows)

# Main program
if __name__ == "__main__":

    mydb = client["mydatabase"]
    mycol_one = mydb["mycollection_one"]
    mycol_two = mydb["mycollection_two"]

    json_one = csv_to_json_second_method("../data/test_join.csv")
    json_two = csv_to_json_second_method("../data/test2_join.csv")

    d1_name = list(json.loads(json_one))
    d2_name = list(json.loads(json_two))

    mycol_one.insert_many(d1_name)
    mycol_two.insert_many(d2_name)

    # Print documents before aggregation
    print("===== Collection 1 =====")
    for i in mycol_one.find({}):
        pprint(i)

    print("===== Collection 2 =====")
    for i in mycol_two.find({}):
        pprint(i)

    # compute the join with the lookup aggregation
    agg_result = mycol_one.aggregate([
        {
            '$lookup': {
                'from': 'mycollection_two',  # Correction ici
                'localField': 'Prenom_nom',
                'foreignField': 'Nom',
                'as': 'join_sample'
            }
        }
    ])

    print("===== Agrégation =====")
    for i in agg_result:
        pprint(i)

    # Clean up
    mycol_one.drop()
    mycol_two.drop()
    client.drop_database('mydatabase')


===== Collection 1 =====
{'Prenom_nom': 'Hervé P',
 'Test_1': '10.00 / 16',
 'Test_2': '12.00 / 18',
 '_id': ObjectId('67366f0bcddcbbed4846a217')}
{'Prenom_nom': 'Laurent H',
 'Test_1': '15.00 / 16',
 'Test_2': '12.00 / 18',
 '_id': ObjectId('67366f0bcddcbbed4846a218')}
{'Prenom_nom': 'Destin L',
 'Test_1': '11.00 / 16',
 'Test_2': '7.00 / 18',
 '_id': ObjectId('67366f0bcddcbbed4846a219')}
{'Prenom_nom': 'Guillaume C',
 'Test_1': '10.00 / 16',
 'Test_2': '4.00 / 18',
 '_id': ObjectId('67366f0bcddcbbed4846a21a')}
{'Prenom_nom': 'Haytem D',
 'Test_1': '12.00 / 16',
 'Test_2': '7.00 / 18',
 '_id': ObjectId('67366f0bcddcbbed4846a21b')}
{'Prenom_nom': 'Cam Chau N',
 'Test_1': '6.00 / 16',
 'Test_2': '9.00 / 18',
 '_id': ObjectId('67366f0bcddcbbed4846a21c')}
{'Prenom_nom': 'Sarra Z',
 'Test_1': '11.00 / 16',
 'Test_2': '6.00 / 18',
 '_id': ObjectId('67366f0bcddcbbed4846a21d')}
{'Prenom_nom': 'Romulus L',
 'Test_1': '11.00 / 16',
 'Test_2': '11.00 / 18',
 '_id': ObjectId('67366f0bcddcbbed4846

In [None]:
import csv
import json

def csv_to_json_second_method(csv_file):

    from json import dumps
    #create a dictionary
    data_dict = {}
    csv_rows = []
    #open a csv file handlerh
    with open(csv_file, encoding = 'latin1', newline='') as csv_file_handler:
        csv_reader = csv.DictReader(csv_file_handler)
        field = csv_reader.fieldnames
        for row in csv_reader:
            #print([{field[i]:row[field[i]] for i in range(len(field))}])
            csv_rows.extend([{field[i]:row[field[i]] for i in range(len(field))}])


    z = dumps(csv_rows)
 
    return z



class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)

# Main program

if __name__ == "__main__":

    from pymongo import MongoClient

    mydb = client["mydatabase"]
    mycol_one = mydb["mycollection_one"]
    mycol_two = mydb["mycollection_two"]

    json_one = csv_to_json_second_method("../data/test_join.csv")
    json_two = csv_to_json_second_method("../data/test2_join.csv")


    from json import loads
    from json import dumps

    # First, transform json objects to dictionaries

    d1_name = list(loads(json_one))
    #print(d1_name)
    d2_name = list(loads(json_two))
    #print(d2_name)

    for row in d1_name:
        #print(type(row),"===",row)
        mydb.mycol_one.insert_one(row)
    #mycol_one.insert_many(list(json_one))

    for row in d2_name:
        mydb.mycol_two.insert_one(row)
    #mycol_two.insert_many(list(json_two))

    # compute the join with the lookup aggregation
    agg_result = mydb.mycol_one.aggregate([
        { "$match": { "$or": [ { "Prenom_nom" : "Betty T" }, { "Prenom_nom" : "Sarra Z" }] } },
        {
            '$lookup': {
                'from': 'mycol_two',
                'localField': 'Prenom_nom',
                'foreignField': 'Nom',
                'as': 'join_sample'
            }
        },
        { "$unwind": "$join_sample" },
        #{ "$group": { "_id": "$join_sample", "count": { "$sum": 1 } } }
        { "$project" : { "name":{"$toUpper":"$Prenom_nom"}, "N2":"$Test_2", "N1":"$Test_1", "N3":"$join_sample.T_3", "N3":"$join_sample.T_2","N1":"$join_sample.T_1"}  },
        { "$sort" : { "name" : 1 } }
    ])

    # print the 3 documents in the collection
    from pprint import pprint

    print("Type agrÃ©gation:",type(agg_result))

    print("=====  AgrÃ©gation  =====")
    for i in agg_result:
        pprint(i["_id"])
        print('Nom :',i["name"])
        print('Note 1 :',i["N2"])
        print('Note 2 :',i["N1"])
        print('Note 3 :',i["N3"])
        print('Note 4 :',i["N2"])
        print('Note 5 :',i["N1"])
        print('------------------------')

  
    # On fait du mÃ©nage
    mydb.mycol_one.drop()
    mydb.mycol_two.drop()
    client.drop_database('mydatabase')

Type agrÃ©gation: <class 'pymongo.synchronous.command_cursor.CommandCursor'>
=====  AgrÃ©gation  =====
ObjectId('67366f2acddcbbed4846a23d')
Nom : BETTY T
Note 1 : 11.00 / 18
Note 2 : 11.00 / 16
Note 3 : 11.00 / 18
Note 4 : 11.00 / 18
Note 5 : 11.00 / 16
------------------------
ObjectId('67366f2acddcbbed4846a239')
Nom : SARRA Z
Note 1 : 6.00 / 18
Note 2 : 11.00 / 16
Note 3 : 6.00 / 18
Note 4 : 6.00 / 18
Note 5 : 11.00 / 16
------------------------
