In [None]:
# Upgrade pip, then install the notebook's third-party dependencies into the
# kernel's environment: pandas, the pyarrow and fastparquet Parquet engines,
# and the Kaggle API client.
# NOTE(review): no versions are pinned, so each run may resolve different releases.
%pip install --upgrade pip 
%pip install pandas pyarrow fastparquet kaggle

In [None]:
import ast
import json
import os
import re

import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

In [None]:
# Directory (relative to the notebook) where the raw CSV is read from and the
# parquet outputs are written to.
data_dir = "../data"

# Dataset metadata for the Kaggle dataset. Within this notebook only
# kaggle["id"] is read (to download the previously published advice table);
# the rest describes the dataset and the schema of advertiser.parquet.
kaggle = {
    "title": "Peer-to-Peer Boliviano (BOB) Exchange Data",
    "subtitle": "Github Actions ETL Pipeline",
    "description": "This project contains the ETL pipeline for the Peer-to-Peer Boliviano (BOB) Exchange Data. The data is collected from various sources and transformed into a clean format for analysis. \nThe pipeline includes data extraction, transformation, and loading processes, along with data quality checks.\n",
    "id": "andreschirinos/p2p-bob-exchange",
    "licenses": [
        {
            "name": "CC0-1.0",
            "title": "CC0 1.0",
            "path": "https://creativecommons.org/publicdomain/zero/1.0/",
        }
    ],
    "resources": [
        {
            "path": "advertiser.parquet",
            "description": "Advertiser data from the BOB exchange",
            "schema": {
                "fields": [
                    {
                        "name": "advertiser_userno",
                        "order": 0,
                        "description": "Unique identifier for the advertiser",
                        "type": "string",
                    },
                    {
                        "name": "advertiser_nickname",
                        "order": 1,
                        "description": "Nickname of the advertiser",
                        "type": "string",
                    },
                    {
                        "name": "advertiser_monthordercount",
                        "order": 2,
                        "description": "Number of orders placed by the advertiser in the last month",
                        "type": "number",
                    },
                    {
                        "name": "advertiser_monthfinishrate",
                        "order": 3,
                        "description": "Finish rate of the advertiser in the last month",
                        "type": "number",
                    },
                    {
                        "name": "advertiser_positiverate",
                        "order": 4,
                        "description": "Positive rate of the advertiser",
                        "type": "number",
                    },
                    {
                        "name": "advertiser_usertype",
                        "order": 5,
                        "description": "Type of the advertiser (e.g., user, merchant)",
                        "type": "string",
                    },
                    {
                        "name": "advertiser_usergrade",
                        "order": 6,
                        "description": "Grade of the advertiser",
                        "type": "string",
                    },
                    {
                        # Must match the DataFrame column name exactly; it
                        # previously contained a stray space.
                        "name": "advertiser_useridentity",
                        "order": 7,
                        "description": "Identity of the advertiser (e.g., MASS_MERCHANT, BLOCK_MERCHANT)",
                        "type": "string",
                    },
                    {
                        "name": "advertiser_viplevel",
                        "order": 8,
                        "description": "VIP level of the advertiser",
                        "type": "number",
                    },
                    {
                        "name": "advertiser_isblocked",
                        "order": 9,
                        "description": "Indicates if the advertiser is blocked",
                        "type": "boolean",
                    },
                    {
                        "name": "advertiser_activetimeinsecond",
                        "order": 10,
                        "description": "Active time of the advertiser in seconds",
                        "type": "number",
                    },
                    {
                        "name": "timestamp",
                        "order": 11,
                        "description": "Timestamp of the data collection",
                        "type": "datetime",
                    },
                    {
                        "name": "source",
                        "order": 12,
                        "description": "Source of the data (e.g, binance)",
                        "type": "string",
                    },
                ]
            },
            "name": "advertiser",
            "profile": "tabular-data-resource",
            "title": "Advertiser Table",
            # The resource is a .parquet file; the value used to be the
            # placeholder string "format".
            "format": "parquet",
            "encoding": "utf-8",
        }
    ],
    "keywords": [
        "p2p",
        "exchange",
        "data",
        "pipeline",
        "etl",
        "bob",
        "bolivia",
        "cryptocurrency",
        "bitcoin",
        "blockchain",
    ],
    "name": "p2p-bob-exchange",
    "homepage": "https://sociest.org",
    "version": "1.0.0",
    "contributors": [{"title": "Andres Chirinos", "role": "author"}],
}

In [None]:
# Location of the raw CSV extract that the rest of the notebook transforms.
input_file = os.path.join(data_dir, "raw-data.csv")

In [None]:
# Create the Kaggle API client and authenticate it; `api` is used further down
# to download the previously published advice table.
# NOTE(review): presumably credentials come from the environment or a local
# kaggle.json — confirm against the Kaggle client configuration.
api = KaggleApi()
api.authenticate()

In [None]:
# Load the raw extract into memory.
print(f"Transformando datos de {input_file}...")
df = pd.read_csv(input_file)

# Normalise the headers: trim surrounding whitespace, lower-case, and turn
# spaces and dots into underscores so columns are addressable as snake_case.
df.columns = df.columns.map(
    lambda name: name.strip().lower().replace(" ", "_").replace(".", "_")
)

# Advertiser

In [None]:
# Split out the advertiser table: every column whose name starts with
# "advertiser", plus the "timestamp" and "source" columns.
advertiser_cols = [
    col
    for col in df.columns
    if col.startswith("advertiser") or col in ["timestamp", "source"]
]

# Take an explicit copy: later cells add and overwrite columns on this frame,
# and doing that on a selection derived from `df` triggers pandas'
# SettingWithCopyWarning.
df_advertiser = df[advertiser_cols].copy()

In [None]:
# Declare the per-column converters for the advertiser table.
def _cast(dtype):
    """Return a converter that casts a Series to ``dtype`` with ``astype``."""
    return lambda col: col.astype(dtype)


def _fill_then_int(fill_value):
    """Return a converter that replaces missing values, then casts to int."""
    return lambda col: col.fillna(fill_value).astype(int)


# Maps column name -> callable taking the column Series and returning the
# converted Series. Columns without an entry are left to the caller's default.
custom_funcs = {
    "advertiser_userno": _cast(str),
    "advertiser_monthordercount": _cast(int),
    "advertiser_monthfinishrate": _cast(float),
    "advertiser_positiverate": _cast(float),
    "advertiser_usertype": _cast("category"),
    "advertiser_usergrade": _cast(int),
    "advertiser_useridentity": _cast("category"),
    "advertiser_badges": _cast(str),
    # Missing VIP level becomes 0; missing active time becomes -1.
    "advertiser_viplevel": _fill_then_int(0),
    "advertiser_isblocked": _cast(bool),
    "advertiser_activetimeinsecond": _fill_then_int(-1),

    # Values are interpreted as seconds since the Unix epoch.
    "timestamp": lambda col: pd.to_datetime(col, unit="s"),
    "source": _cast("category"),
}

In [None]:
# Apply the declared converters; columns without an entry in `custom_funcs`
# pass through unchanged.
default_func = lambda col: col

# Build a new frame with `assign` instead of writing through
# `df.loc[:, col] = ...`: under pandas 2.x that form sets the values in place
# and keeps the existing column dtype whenever the new values fit, which can
# silently discard casts such as "category". `assign` replaces each column
# outright, so the converted dtype is what ends up in the frame.
df_advertiser = df_advertiser.assign(
    **{
        col: custom_funcs.get(col, default_func)(df_advertiser[col])
        for col in df_advertiser.columns
    }
)

In [None]:
# Preview: for every advertiser_userno, the adv_advno values (as strings)
# joined into a single comma-separated string.
(
    df[["adv_advno", "advertiser_userno"]]
    .astype(str)
    .groupby("advertiser_userno")
    .agg(",".join)
)

In [None]:
# Attach to every advertiser row the comma-joined list of ad numbers that
# share its advertiser_userno. `transform` returns one value per original row,
# so the result is assigned to `df_advertiser` by index alignment.
df_advertiser["advices"] = (
    df[["adv_advno", "advertiser_userno"]]
    .astype(str)
    .groupby("advertiser_userno")["adv_advno"]
    .transform(lambda ad_numbers: ",".join(ad_numbers))
)

In [None]:
# Keep one row per advertiser (the first occurrence of each advertiser_userno).
# Rebind instead of using inplace=True: in-place mutation of a frame derived
# from `df` can raise SettingWithCopyWarning and makes the cell harder to
# reason about on re-runs.
df_advertiser = df_advertiser.drop_duplicates(subset=["advertiser_userno"])

In [None]:
# Persist the advertiser table next to the raw data, without the index.
df_advertiser.to_parquet(
    os.path.join(data_dir, "advertiser.parquet"),
    index=False,
)
print("advertiser Transformación terminada.")

# Advice

In [None]:
# Select the advice (ad) table: every column that does not start with
# "advertiser", plus advertiser_userno, which is kept as the link back to the
# advertiser table. (The former `and not col in []` clause was always true
# and has been dropped.)
advice_cols = [
    col
    for col in df.columns
    if not col.startswith("advertiser") or col == "advertiser_userno"
]

# Take an explicit copy: later cells overwrite and rename columns on this
# frame, which on a selection derived from `df` triggers pandas'
# SettingWithCopyWarning.
df_advice = df[advice_cols].copy()

In [None]:
# Declare the per-column converters for the advice table.
# Maps column name -> callable taking the column Series and returning the
# converted Series. Columns without an entry are left to the caller's default.
custom_funcs = {
    "adv_advno": lambda col: col.astype(str),
    "adv_classify": lambda col: col.astype("category"),
    "adv_tradetype": lambda col: col.astype("category"),
    "adv_asset": lambda col: col.astype("category"),
    "adv_fiatunit": lambda col: col.astype("category"),
    "adv_price": lambda col: col.astype(float),
    "adv_surplusamount": lambda col: col.astype(float),
    "adv_tradablequantity": lambda col: col.astype(float),
    "adv_maxsingletransamount": lambda col: col.astype(float),
    "adv_minsingletransamount": lambda col: col.astype(float),
    "adv_paytimelimit": lambda col: col.astype(int),
    "adv_takeradditionalkycrequired": lambda col: col.astype(bool),
    "adv_assetscale": lambda col: col.astype(int),
    "adv_fiatscale": lambda col: col.astype(int),
    "adv_pricescale": lambda col: col.astype(int),
    "adv_fiatsymbol": lambda col: col.astype("category"),
    "adv_istradable": lambda col: col.astype(bool),
    "adv_dynamicmaxsingletransamount": lambda col: col.astype(float),
    "adv_minsingletransquantity": lambda col: col.astype(float),
    "adv_maxsingletransquantity": lambda col: col.astype(float),
    "adv_dynamicmaxsingletransquantity": lambda col: col.astype(float),
    "adv_commissionrate": lambda col: col.astype(float),
    "adv_issafepayment": lambda col: col.astype(bool),

    # Each cell holds the text of a Python list of dicts; reduce it to the
    # comma-joined "identifier" values.
    # SECURITY: this used to call eval() on text read from the CSV, which
    # executes arbitrary expressions. ast.literal_eval only accepts literals
    # and raises ValueError/SyntaxError on anything else.
    "adv_trademethods": lambda col: col.apply(
        lambda raw: ",".join(
            [method["identifier"] for method in ast.literal_eval(raw)]
        )
    ),

    "advertiser_userno": lambda col: col.astype(str),

    # Values are interpreted as seconds since the Unix epoch.
    "timestamp": lambda col: pd.to_datetime(col, unit="s"),
    "source": lambda col: col.astype("category"),
}

In [None]:
# Apply the declared converters; columns without an entry in `custom_funcs`
# pass through unchanged.
default_func = lambda col: col

# Build a new frame with `assign` instead of writing through
# `df.loc[:, col] = ...`: under pandas 2.x that form sets the values in place
# and keeps the existing column dtype whenever the new values fit, which can
# silently discard casts such as "category". `assign` replaces each column
# outright, so the converted dtype is what ends up in the frame.
df_advice = df_advice.assign(
    **{
        col: custom_funcs.get(col, default_func)(df_advice[col])
        for col in df_advice.columns
    }
)

In [None]:
# Drop the leading "adv_" from column names. The pattern is anchored, so only
# a prefix is removed and "advertiser_userno" is left untouched.
stripped_columns = df_advice.columns.str.replace(pat="^adv_", repl="", regex=True)
df_advice.columns = stripped_columns

In [None]:
# Fetch the currently published advice.parquet from the Kaggle dataset into
# the data directory, forcing a re-download and showing progress output.
api.dataset_download_file(
    kaggle["id"],
    "advice.parquet",
    path=data_dir,
    force=True,
    quiet=False,
)

In [None]:
# Load the previously published advice table and append this run's rows
# after it, renumbering the index from zero.
df_last_advice = pd.read_parquet(os.path.join(data_dir, "advice.parquet"))
df_advice = pd.concat(
    [df_last_advice, df_advice],
    ignore_index=True,
)

In [None]:
# Overwrite advice.parquet in the data directory with the combined table,
# without the index.
df_advice.to_parquet(
    os.path.join(data_dir, "advice.parquet"),
    index=False,
)
print("advice Transformación terminada.")

# Trade Methods

In [None]:
# Keep only the ad number and its raw trade-methods payload, in the order the
# columns appear in `df`.
wanted_cols = ("adv_advno", "adv_trademethods")
trade_methods_cols = [col for col in df.columns if col in wanted_cols]
df_advice_trade_info = df[trade_methods_cols]

In [None]:
# Parse each ad's serialized list of trade-method dicts and give every method
# its own row (the index still points at the originating ad row).
# SECURITY: this used to call eval() on text read from the CSV, which executes
# arbitrary expressions; ast.literal_eval only accepts literals.
# Ads with an empty list explode to NaN, which json_normalize cannot flatten,
# so those rows are dropped.
df_advice_trade_table = (
    df_advice_trade_info["adv_trademethods"]
    .apply(ast.literal_eval)
    .explode()
    .dropna()
)

# Flatten the dicts into columns. Passing a plain list gives a fresh
# positional index, so the ad numbers are attached by position below.
df_trade_methods_table = pd.json_normalize(df_advice_trade_table.tolist())

df_trade_methods_table["adv_advno"] = df_advice_trade_info.loc[
    df_advice_trade_table.index, "adv_advno"
].values

df_trade_methods_table = df_trade_methods_table.drop_duplicates()


def _join_unique(values):
    """Join the distinct non-None values as a sorted, comma-separated string.

    Sorting makes the output reproducible between runs; joining a bare set
    would depend on its arbitrary iteration order.
    """
    return ",".join(sorted({str(value) for value in values if value is not None}))


# One row per trade-method identifier, with the distinct ad numbers and display
# attributes seen for it. Aggregating straight to strings avoids
# DataFrame.applymap, which is deprecated since pandas 2.1.
df_trade_methods = (
    df_trade_methods_table
    .groupby("identifier")
    .agg({
        "adv_advno": _join_unique,
        "tradeMethodName": _join_unique,
        "tradeMethodShortName": _join_unique,
        "tradeMethodBgColor": _join_unique,
    })
    .reset_index()
)

In [None]:
# Persist the trade-methods table in the data directory, without the index.
df_trade_methods.to_parquet(
    os.path.join(data_dir, "trade_methods.parquet"),
    index=False,
)
print("trade_methods Transformación terminada.")