In [None]:
%pip install --upgrade pip 
%pip install kaggle

In [None]:
import os, json, yaml, requests, subprocess
from kaggle.api.kaggle_api_extended import KaggleApi

In [None]:
# Directory (relative to this notebook) holding the files that make up the dataset.
data_dir = "../data"

# Dataset metadata in Data Package layout. The notebook later serializes this dict
# to `datapackage.json` inside `data_dir`, and uses `resources[*].path` as the list
# of files that are allowed to stay in that directory.
kaggle = {
    "title": "Peer-to-Peer Boliviano (BOB) Exchange Data",
    "subtitle": "Github Actions ETL Pipeline",
    "description": "This project contains the ETL pipeline for the Peer-to-Peer Boliviano (BOB) Exchange Data. The data is collected from various sources and transformed into a clean format for analysis. \nThe pipeline includes data extraction, transformation, and loading processes, along with data quality checks.\n",
    # "<owner>/<slug>" identifier; the slug matches the "name" entry further down.
    "id": "andreschirinos/p2p-bob-exchange",
    "licenses": [
        {
            "name": "CC0-1.0",
            "title": "CC0 1.0",
            "path": "https://creativecommons.org/publicdomain/zero/1.0/",
        }
    ],
    "resources": [
        {
            "path": "advertiser.parquet",
            "description": "Advertiser data from the BOB exchange",
            # Column-level schema; "order" is the zero-based column position.
            "schema": {
                "fields": [
                    {
                        "name": "advertiser_userno",
                        "order": 0,
                        "description": "Unique identifier for the advertiser",
                        "type": "string",
                    },
                    {
                        "name": "advertiser_nickname",
                        "order": 1,
                        "description": "Nickname of the advertiser",
                        "type": "string",
                    },
                    {
                        "name": "advertiser_monthordercount",
                        "order": 2,
                        "description": "Number of orders placed by the advertiser in the last month",
                        "type": "number",
                    },
                    {
                        "name": "advertiser_monthfinishrate",
                        "order": 3,
                        "description": "Finish rate of the advertiser in the last month",
                        "type": "number",
                    },
                    {
                        "name": "advertiser_positiverate",
                        "order": 4,
                        "description": "Positive rate of the advertiser",
                        "type": "number",
                    },
                    {
                        "name": "advertiser_usertype",
                        "order": 5,
                        "description": "Type of the advertiser (e.g., user, merchant)",
                        "type": "string",
                    },
                    {
                        "name": "advertiser_usergrade",
                        "order": 6,
                        "description": "Grade of the advertiser",
                        "type": "string",
                    },
                    {
                        # Fixed: the name previously contained a stray space
                        # ("advertiser_u seridentity") and so matched no column.
                        "name": "advertiser_useridentity",
                        "order": 7,
                        "description": "Identity of the advertiser (e.g., MASS_MERCHANT, BLOCK_MERCHANT)",
                        "type": "string",
                    },
                    {
                        "name": "advertiser_viplevel",
                        "order": 8,
                        "description": "VIP level of the advertiser",
                        "type": "number",
                    },
                    {
                        "name": "advertiser_isblocked",
                        "order": 9,
                        "description": "Indicates if the advertiser is blocked",
                        "type": "boolean",
                    },
                    {
                        "name": "advertiser_activetimeinsecond",
                        "order": 10,
                        "description": "Active time of the advertiser in seconds",
                        "type": "number",
                    },
                    {
                        "name": "timestamp",
                        "order": 11,
                        "description": "Timestamp of the data collection",
                        "type": "datetime",
                    },
                    {
                        "name": "source",
                        "order": 12,
                        "description": "Source of the data (e.g, binance)",
                        "type": "string",
                    },
                ]
            },
            "name": "advertiser",
            "profile": "tabular-data-resource",
            "title": "Advertiser Table",
            # Fixed: was the placeholder "format"; the resource is a parquet file.
            "format": "parquet",
            "encoding": "utf-8",
        }
    ],
    "keywords": [
        "p2p",
        "exchange",
        "data",
        "pipeline",
        "etl",
        "bob",
        "bolivia",
        "cryptocurrency",
        "bitcoin",
        "blockchain",
    ],
    "name": "p2p-bob-exchange",
    "homepage": "https://sociest.org",
    "version": "1.0.0",
    "contributors": [{"title": "Andres Chirinos", "role": "author"}],
}

In [None]:
# Build the Kaggle API client used by the upload cell below and authenticate it.
# NOTE(review): credentials are not set anywhere in this notebook; presumably
# `authenticate()` picks them up from the environment or a kaggle.json file —
# confirm against the Kaggle API documentation.
api = KaggleApi()
api.authenticate()

In [None]:
# Prune `data_dir` so that it only contains files declared as dataset resources.
declared_files = set(entry["path"] for entry in kaggle["resources"])
all_files = set(os.listdir(data_dir))
files_to_remove = all_files.difference(declared_files)

for file_name in files_to_remove:
    file_path = os.path.join(data_dir, file_name)
    # Only regular files are deleted; directories and other entries are skipped.
    if not os.path.isfile(file_path):
        continue
    os.remove(file_path)
    print(f"Archivo eliminado: {file_name}")

In [None]:
# Serialize the dataset metadata dict as `datapackage.json` next to the data files.
metadata_path = os.path.join(data_dir, "datapackage.json")
with open(metadata_path, "w") as f:
    # Same JSON text as json.dump(), produced as one string and written in one call.
    f.write(json.dumps(kaggle))

In [None]:
# Publish `data_dir` to Kaggle: add a new version when the dataset already exists,
# otherwise create it.
#
# Only the existence probe sits inside the `try`, so that an HTTP error raised while
# uploading a new version is never mistaken for "dataset does not exist".
try:
    api.dataset_status(kaggle["id"])
    dataset_exists = True
except requests.exceptions.HTTPError as e:
    print(e)
    # `e.response` can be None when the error carries no HTTP response.
    status_code = getattr(e.response, "status_code", None)
    # 401/403 were already treated as "dataset missing"; 404 is included because it
    # states the same thing explicitly. Any other failure is re-raised so the
    # notebook fails loudly instead of finishing without having uploaded anything.
    if status_code not in (401, 403, 404):
        raise
    dataset_exists = False

if dataset_exists:
    print("Dataset already exists. Updating...")

    api.dataset_create_version(
        folder=data_dir,
        version_notes="Versión automática generada",
        # quiet=True,
        convert_to_csv=False,
        delete_old_versions=False,
        dir_mode="zip",
    )
else:
    print("Dataset not found. Creating a new one...")

    api.dataset_create_new(
        folder=data_dir,
        convert_to_csv=False,
        dir_mode="zip",
        public=True,
        # quiet=True,
    )

In [None]:
# Completion marker printed by the last cell ("Cargado" is Spanish for "Loaded").
print("Cargado")