## __1. Download the Open Food Dataset__

In [1]:
import requests
import os

In [3]:
# Ensure data directory exists
os.makedirs("../data", exist_ok=True)

In [4]:
# Dataset URL and local save path
url = "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz"
local_path = "../data/products.csv.gz"

In [5]:
# Download with streaming in chuncks
response = requests.get(url, stream=True)
with open(local_path, "wb") as f:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            f.write(chunk)
print("Download complete! Saved to:", local_path)

Download complete! Saved to: ../data/products.csv.gz


In [6]:
import pandas as pd
import gzip

In [7]:
# Load the first 1000 rows using gzip
with gzip.open(local_path, 'rt', encoding='utf-8') as f:
    df = pd.read_csv(f, sep='\t', nrows=1000, on_bad_lines='skip')

In [10]:
# Preview a chunck of dataset
print(f"Sample loaded. Shape: {df.shape}")
df.head(10)

Sample loaded. Shape: (1000, 209)


Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,...,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g,carbohydrates-total_100g
0,54,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1582569031,2020-02-24T18:30:31Z,1733085204,2024-12-01T20:33:24Z,,1740205422,2025-02-22T06:23:42Z,...,,,,,,,,,,
1,63,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1673620307,2023-01-13T14:31:47Z,1746258398,2025-05-03T07:46:38Z,roboto-app,1746258398,2025-05-03T07:46:38Z,...,,,,,,,,,,
2,114,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1580066482,2020-01-26T19:21:22Z,1737247862,2025-01-19T00:51:02Z,smoothie-app,1743312145,2025-03-30T05:22:25Z,...,,,,,,,,,,
3,1,http://world-en.openfoodfacts.org/product/0000...,inf,1634745456,2021-10-20T15:57:36Z,1748552638,2025-05-29T21:03:58Z,fitcounter,1748552638,2025-05-29T21:03:58Z,...,,,,,,,,,,
4,105,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1572117743,2019-10-26T19:22:23Z,1738073570,2025-01-28T14:12:50Z,,1743653496,2025-04-03T04:11:36Z,...,,,,,,,,,,
5,2,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1722606455,2024-08-02T13:47:35Z,1748553804,2025-05-29T21:23:24Z,prepperapp,1748553804,2025-05-29T21:23:24Z,...,,,,,,,,,,
6,3,http://world-en.openfoodfacts.org/product/0000...,prepperapp,1716818343,2024-05-27T13:59:03Z,1748608673,2025-05-30T12:37:53Z,brio1981,1748608673,2025-05-30T12:37:53Z,...,,,,,,,,,,
7,4,http://world-en.openfoodfacts.org/product/0000...,elcoco,1560176426,2019-06-10T14:20:26Z,1748094869,2025-05-24T13:54:29Z,smoothie-app,1748094869,2025-05-24T13:54:29Z,...,,,,,,,,,,
8,475,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1714206330,2024-04-27T08:25:30Z,1714207074,2024-04-27T08:37:54Z,roboto-app,1740362538,2025-02-24T02:02:18Z,...,,,,,,,,,,
9,5,http://world-en.openfoodfacts.org/product/0000...,touchette,1605337720,2020-11-14T07:08:40Z,1746375623,2025-05-04T16:20:23Z,detrumpezvous,1746375623,2025-05-04T16:20:23Z,...,,,,,,,,,,


In [17]:
df.columns[:20]

Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'last_modified_by',
       'last_updated_t', 'last_updated_datetime', 'product_name',
       'abbreviated_product_name', 'generic_name', 'quantity', 'packaging',
       'packaging_tags', 'packaging_en', 'packaging_text', 'brands',
       'brands_tags'],
      dtype='object')

In [18]:
df['product_name']

0                  Limonade artisanale a la rose
1                                 CIABATTA OLIVE
2                                  Chocolate n 3
3      Weihnachtsgelee (selbstgemacht von Jutta)
4           Paleta gran reserva - Sierra nevada-
                         ...                    
995                                          NaN
996                          bevanda al mirtillo
997                                     Collagen
998                          impact protein whey
999                                        Toast
Name: product_name, Length: 1000, dtype: object

## __2. Upload the Raw File to S3 Bucket__

In [23]:
import boto3

In [24]:
# Define S3 bucket and object key
bucket_name = "nutriflow-pipeline"
object_key = "raw/products.csv.gz"
local_path = "../data/products.csv.gz"

In [28]:
# Upload to S3
s3 = boto3.client("s3")
s3.upload_file(local_path, bucket_name, object_key)

print(f"Raw dataset uploaded to: {bucket_name}/{object_key}")

Raw dataset uploaded to: nutriflow-pipeline/raw/products.csv.gz
