**Purpose**  
Build a dataset for exploring food product data.

**Prerequisites**
- [X] The script needs an Open Food Facts MongoDB database running.

In [None]:
from pymongo import MongoClient

In [None]:
# Open a client on the local MongoDB instance; both timeouts are 5 seconds.
client = MongoClient(
    'localhost',
    27017,
    connectTimeoutMS=5000,
    serverSelectionTimeoutMS=5000,
)

# Ask the server for its build information and print each entry.
msg = client.server_info()
if isinstance(msg, dict):
    print('MongoDB is running. Info:\n')
    for key, value in msg.items():
        print('-', key, ':', value)

In [None]:
# Handles on the 'off' database and its 'products' collection.
db = client.off
collection = db.products

# Base URL under which product images are served.
prefix = 'https://images.openfoodfacts.org/images/products/'

In [None]:
# Count the products whose countries_tags contain "en:france".
query = {
    'countries_tags': {'$in': ["en:france"]},
    # Optional extra filters: uncomment to restrict the count to products
    # for which the corresponding score is set.
    # "nutriscore_score":{'$ne':None},
    # "ecoscore_score":{'$ne':None},
    # "nova_group":{'$ne':None},
}

# Let the server count the matches instead of downloading every document
# into a list only to take its length.
print('number of product:', collection.count_documents(query))

In [None]:
# Fields to fetch for each product.
# inspired from https://github.com/openfoodfacts/openfoodfacts-server/blob/main/html/data/data-fields.md
projection = {
    'id': 1,
    'product_name': 1,
    'brands': 1,
    'categories': 1,
    'categories_en': 1,
    'categories_tags': 1,
    'compared_to_category': 1,
    'countries_tags': 1,
    'food_groups': 1,
    'food_groups_tags': 1,
    'scans_n': 1,
    'unique_scans_n': 1,
    'origins': 1,
    'origins_tags': 1,
    'origins_en': 1,
    'manufacturing_places': 1,
    'manufacturing_places_tags': 1,
    'stores': 1,
    'purchase_places': 1,
    'countries': 1,
    # ingredient
    'labels': 1,
    'labels_tags': 1,
    'ingredients_n': 1,
    'additives_n': 1,
    'nutrition_data_prepared_per': 1,
    'ingredients_text_en': 1,
    'ingredients_text_fr': 1,
    'ingredients_tags': 1,
    'additives_tags': 1,
    'ingredients_hierarchy': 1,
    'serving_size': 1,
    'serving_quantity': 1,
    'product_quantity': 1,
    'quantity': 1,
    # packaging
    'packaging_tags': 1,
    'popularity_tags': 1,
    # score
    'ecoscore_grade': 1,
    'ecoscore_score': 1,
    'nova_group': 1,
    'nova_groups_tags': 1,
    'nutriscore_score': 1,
    # nutriments
    'nutriments': 1,
}

# Products tagged "en:france" for which the three scores are all set
# (each score field must differ from None).
query = {
    'countries_tags': {'$in': ["en:france"]},
    "nutriscore_score": {'$ne': None},
    "ecoscore_score": {'$ne': None},
    "nova_group": {'$ne': None},
}

# Fetch at most 10,000 matching products, restricted to the projected fields.
cursor = collection.find(query, projection=projection, limit=10_000)

In [None]:
from typing import List, Dict, Any
from pandas import DataFrame

class OpenFoodFactParser():
    """Flatten product documents into a tabular ``DataFrame``.

    Each product is a mapping of field name to value. Nested values are
    serialised to strings so that every cell of the resulting table is a
    scalar.
    """

    def __init__(self, json_data:List[Dict[str, Any]]):
        """Store the product documents to convert.

        Args:
            json_data: Iterable of product mappings (one per product).
        """
        self.data = json_data

    def to_df(self) -> DataFrame:
        """Return a DataFrame with one row per product.

        Columns are the union of the fields found across products; a field
        missing from a product is left empty (NaN) in that row. An extra
        ``index`` column holds the product ``id`` (``None`` when absent).

        Returns:
            The flattened products; an empty DataFrame when there is no
            product.
        """
        rows = [self._flatten(product) for product in self.data]
        return DataFrame(rows)

    @staticmethod
    def _flatten(product: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one product mapping into a flat row of scalar values."""
        # .get() so that a product without an 'id' does not abort the export.
        row = {'index': product.get('id')}
        for key, value in product.items():
            if isinstance(value, list):
                # str() each element: lists may hold non-string items.
                row[key] = ';'.join(str(item) for item in value)
            elif isinstance(value, dict):
                row[key] = ';'.join(f'{k_}:{v_}' for k_, v_ in value.items())
            elif isinstance(value, (str, int, float)):
                row[key] = value
            # Any other value type (e.g. None) is skipped.
        return row