In [1]:
# Install dependencies
%pip install --upgrade pip 
%pip install pandas pyarrow fastparquet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import json, re

# PARAMETERS
input_file = '../data/raw-data.csv'

In [3]:
# TRANSFORM
print(f"Transformando datos de {input_file}...")
df = pd.read_csv(input_file)

df.columns = [col.strip().lower().replace(' ', '_').replace('.','_') for col in df.columns]

Transformando datos de ../data/raw-data.csv...


# Advertiser

In [4]:
# Separa las tablas
advertiser_cols = [col for col in df.columns if col.startswith("advertiser") or col in ["timestamp", "source"]]
df_advertiser = df[advertiser_cols]

In [5]:
# Declara formatos
custom_funcs = {
    "advertiser_userno": lambda col: col.astype(str),
    "advertiser_monthordercount": lambda col: col.astype(int),
    "advertiser_monthfinishrate": lambda col: col.astype(float),
    "advertiser_positiverate": lambda col: col.astype(float),
    "advertiser_usertype": lambda col: col.astype("category"),
    "advertiser_usergrade": lambda col: col.astype(int),
    "advertiser_useridentity": lambda col: col.astype("category"),
    "advertiser_badges": lambda col: col.astype(str),
    "advertiser_viplevel": lambda col: col.fillna(0).astype(int),
    "advertiser_isblocked": lambda col: col.astype(bool),
    "advertiser_activetimeinsecond": lambda col: col.fillna(-1).astype(int),
    
    "timestamp": lambda col: pd.to_datetime(col, unit="s"),
    "source": lambda col: col.astype("category"),
}

In [6]:
# Pone los formatos
default_func = lambda col: col

for col in df_advertiser.columns:
    df_advertiser.loc[:, col] = custom_funcs.get(col, default_func)(df_advertiser[col])

In [8]:
df_advertiser.to_parquet('../data/advertiser.parquet', index=False)
print("advertiser Transformación terminada.")

advertiser Transformación terminada.


# Advice

In [9]:
advice_cols = [col for col in df.columns if (not col.startswith("advertiser") or col in ["advertiser_userno", "advertiser_nickname"]) and not col in []]
df_advice = df[advice_cols]

In [10]:
# Declara formatos
custom_funcs = {
    "adv_advno": lambda col: col.astype(str),
    "adv_classify": lambda col: col.astype("category"),
    "adv_tradetype": lambda col: col.astype("category"),
    "adv_asset": lambda col: col.astype("category"),
    "adv_fiatunit": lambda col: col.astype("category"),
    "adv_price": lambda col: col.astype(float),
    "adv_surplusamount": lambda col: col.astype(float),
    "adv_tradablequantity": lambda col: col.astype(float),
    "adv_maxsingletransamount": lambda col: col.astype(float),
    "adv_minsingletransamount": lambda col: col.astype(float),
    "adv_paytimelimit": lambda col: col.astype(int),
    "adv_takeradditionalkycrequired": lambda col: col.astype(bool),
    "adv_assetscale": lambda col: col.astype(int),
    "adv_fiatscale": lambda col: col.astype(int),
    "adv_pricescale": lambda col: col.astype(int),
    "adv_fiatsymbol": lambda col: col.astype("category"),
    "adv_istradable": lambda col: col.astype(bool),
    "adv_dynamicmaxsingletransamount": lambda col: col.astype(float),
    "adv_minsingletransquantity": lambda col: col.astype(float),
    "adv_maxsingletransquantity": lambda col: col.astype(float),
    "adv_dynamicmaxsingletransquantity": lambda col: col.astype(float),
    "adv_commissionrate": lambda col: col.astype(float),
    "adv_issafepayment": lambda col: col.astype(bool),
    
    "adv_trademethods": lambda col: col.apply(lambda x: [method['identifier'] for method in eval(x)]),
    
    "advertiser_userno": lambda col: col.astype(str),
    "advertiser_nickname": lambda col: col.astype(str),

    "timestamp": lambda col: pd.to_datetime(col, unit="s"),
    "source": lambda col: col.astype("category"),
}

In [11]:
# Pone los formatos
default_func = lambda col: col

for col in df_advice.columns:
    df_advice.loc[:, col] = custom_funcs.get(col, default_func)(df_advice[col])

In [12]:
df_advice.to_parquet('../data/advice.parquet', index=False)
print("advice Transformación terminada.")

advice Transformación terminada.


# Trade Methods

In [15]:
trade_methods_cols = [
    col
    for col in df.columns
    if (col in ["adv_advno", "adv_trademethods"]) and not col in []
]
df_advice_trade_info = df[trade_methods_cols]

In [24]:
df_advice_trade_table = df_advice_trade_info["adv_trademethods"].apply(eval).explode()

df_trade_methods_table = pd.json_normalize(df_advice_trade_table)

df_trade_methods_table["adv_advno"] = df_advice_trade_info.loc[df_advice_trade_table.index, "adv_advno"].values

df_trade_methods_table = df_trade_methods_table.drop_duplicates()

df_trade_methods = df_trade_methods_table.groupby("identifier").agg({
    "adv_advno": set,
    "tradeMethodName": set,
    "tradeMethodShortName": set,
    "tradeMethodBgColor": set
}).reset_index()

df_trade_methods = df_trade_methods.applymap(lambda x: {str(i) for i in x if i is not None} if isinstance(x, set) else x)
df_trade_methods = df_trade_methods.applymap(lambda x: ','.join(x) if isinstance(x, set) else x)

  df_trade_methods = df_trade_methods.applymap(lambda x: {str(i) for i in x if i is not None} if isinstance(x, set) else x)
  df_trade_methods = df_trade_methods.applymap(lambda x: ','.join(x) if isinstance(x, set) else x)


In [25]:
df_trade_methods

Unnamed: 0,identifier,adv_advno,tradeMethodName,tradeMethodShortName,tradeMethodBgColor
0,BANK,"12644843695965220864,12691825643026161664,1371...",Bank Transfer,Bank Transfer,#F0B90B
1,BancoDeBolivia,"12695193660822462464,12644843695965220864,1273...",Banco Nacional de Bolivia,,#26B460
2,BancoDeCredito,"12728505761121427456,13726025298694373376,1274...",Banco de Credito,,#FF7B00
3,BancoEconomico,"13726025298694373376,12747332864523661312,1271...",Banco Economico,,#E7142A
4,BancoFassil,11470102002332057600,Banco Fassil,Banco Fassil,#05A5A0
5,BancoGanadero,"13737177849829740544,12695193660822462464,1264...",Banco Ganadero,Banco Ganadero,#80BF00
6,BancoSantaCruz,"12728505761121427456,12747238631733075968,1274...",Banco Mercantil Santa Cruz,,#004A2C
7,BancoSolidario,"12712312109981478912,13718978970930118656,1373...",Banco Solidario,Banco Solidario,#00BDD2
8,BancoUnion,"11460602077182488576,12644843695965220864,1269...",Banco Union,Banco Union,#00AEEF
9,SoliPagos,"12715889572558929920,12689770603625492480,1269...",SoliPagos,,#03A9D7


In [26]:
df_trade_methods.to_parquet('../data/trade_methods.parquet', index=False)
print("trade_methods Transformación terminada.")

trade_methods Transformación terminada.
