# EXPORT KAGGLE DATA TO BIGQUERY

## IMPORTS

In [None]:
import os
# Kaggle API credentials are supplied through environment variables.
# Fill these in locally; never commit real values.
# NOTE(review): the kaggle client presumably reads these while it is being
# imported/authenticated, so keep the assignments above `import kaggle` -- confirm.
os.environ['KAGGLE_USERNAME'] = "" # KAGGLE USERNAME
os.environ['KAGGLE_KEY'] = "" # KAGGLE TOKEN
# Notebook shell magic: installs the kaggle package with pip.
!pip install kaggle
import kaggle
# Authenticate the API client that the download calls below go through.
kaggle.api.authenticate()

In [None]:
import findspark

# findspark must run before anything imports pyspark: it locates the Spark
# installation and makes its Python bindings importable.
findspark.init()

import pandas as pd
from ks_crypto.lib.spark_conn import create_yarn_connection
from pyspark.sql import functions as F, types as T

# Spark entry point used by the cells below to build DataFrames
# (presumably a YARN-backed SparkSession -- see ks_crypto.lib.spark_conn).
spark = create_yarn_connection()

In [None]:
# GCS bucket passed to the BigQuery writer as its temporary staging bucket.
BUCKET_NAME = "ks-crypto"

# Local directory the Kaggle archives are downloaded and unzipped into.
BASE_PATH = "/home/dat/"

# Each input CSV is paired with the BigQuery table it is exported to.

# Per-row feature columns.
FEATURES_FULL_PATH = BASE_PATH + "elliptic_bitcoin_dataset/elliptic_txs_features.csv"
FEATURES_FULL_TABLENAME = "kschool-crypto:ks_crypto_dataset.anon_address_features"

# Edge list (txId1 -> txId2).
EDGES_FULL_PATH = BASE_PATH + "elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv"
EDGES_FULL_TABLENAME = "kschool-crypto:ks_crypto_dataset.anon_transactions_edges"

# Class labels.
CLASSES_FULL_PATH = BASE_PATH + "elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
CLASSES_FULL_TABLENAME = "kschool-crypto:ks_crypto_dataset.anon_address_classes"

# Mapping from anonymised ids to transaction hashes.
DANON_FULL_PATH = BASE_PATH + "Result.csv"
DANON_FULL_TABLENAME = "kschool-crypto:ks_crypto_dataset.danon_transactions"

## 1. DOWNLOAD KAGGLE DATA 

In [None]:
# Kaggle dataset slugs to fetch. Every archive is downloaded into BASE_PATH
# and unzipped in place, which is where the *_FULL_PATH constants expect
# to find the CSV files.
# `dataset_name` stays bound to the last slug after the loop.
for dataset_name in (
    'ellipticco/elliptic-data-set',
    'alexbenzik/deanonymized-995-pct-of-elliptic-transactions',
):
    kaggle.api.dataset_download_files(dataset=dataset_name,
                                      path=BASE_PATH,
                                      unzip=True)

## 2. READ DATA

In [None]:
# The features CSV is read without a header row, so column names are
# assigned by position: id, timestep, then 93 `anon_trans_feat_*` and
# 72 `anon_agg_feat_*` columns.
features_pd = pd.read_csv(FEATURES_FULL_PATH, header=None)
features_pd.columns = (
    ['anon_address', 'timestep']
    + [f'anon_trans_feat_{i}' for i in range(93)]
    + [f'anon_agg_feat_{i}' for i in range(72)]
)

# Columns that get their own explicit type; every other column is cast to float.
double_blacklist = ['anon_address', 'timestep']
float_col_list = [
    F.col(name).cast(T.FloatType()).alias(name)
    for name in features_pd.columns
    if name not in double_blacklist
]

features_df = spark.createDataFrame(features_pd).select(
    F.col('anon_address').cast('string').alias('anon_address'),
    F.col('timestep').cast('int').alias('timestep'),
    *float_col_list
)

# Print the resulting schema as a quick visual check.
features_df.printSchema()

In [None]:
# Load the edge list and rename its two id columns to the names used in
# the exported table.
edges_pd = pd.read_csv(EDGES_FULL_PATH).rename(
    columns={
        'txId1': 'anon_input_address',
        'txId2': 'anon_output_address',
    }
)

# Both endpoints are stored as strings.
edges_df = spark.createDataFrame(edges_pd).select(
    F.col('anon_input_address').cast('string').alias('anon_input_address'),
    F.col('anon_output_address').cast('string').alias('anon_output_address'),
)

edges_df.printSchema()

In [None]:
# Load the class labels; the id column is renamed, `class` keeps its name.
classes_pd = pd.read_csv(CLASSES_FULL_PATH).rename(
    columns={
        'txId': 'anon_transaction_hash',
        'class': 'class',
    }
)

# Translate the coded labels: '1' -> 'illicit', '2' -> 'licit'.
# Any other value is passed through unchanged.
classes_df = spark.createDataFrame(classes_pd).select(
    F.col('anon_transaction_hash').cast('string'),
    (
        F.when(F.col('class') == '1', 'illicit')
        .when(F.col('class') == '2', 'licit')
        .otherwise(F.col('class'))
        .alias('class')
    ),
)

classes_df.printSchema()

In [None]:
# Load the mapping file and rename its columns to the names used in the
# exported table.
danon_pd = pd.read_csv(DANON_FULL_PATH).rename(
    columns={
        'txId': 'anon_transaction_hash',
        'transaction': 'transaction_hash',
    }
)

# Both columns are stored as strings.
danon_df = spark.createDataFrame(danon_pd).select(
    F.col('anon_transaction_hash').cast('string'),
    F.col('transaction_hash').cast('string'),
)

danon_df.printSchema()

## 3. EXPORT DATA

In [None]:
# Write the features DataFrame to its BigQuery table, staging the data
# through the temporary GCS bucket.
(
    features_df.write
    .format("bigquery")
    .option("table", FEATURES_FULL_TABLENAME)
    .option("temporaryGcsBucket", BUCKET_NAME)
    .save()
)

In [None]:
# Write the class labels DataFrame to its BigQuery table, staging the data
# through the temporary GCS bucket.
(
    classes_df.write
    .format("bigquery")
    .option("table", CLASSES_FULL_TABLENAME)
    .option("temporaryGcsBucket", BUCKET_NAME)
    .save()
)

In [None]:
# Write the edge list DataFrame to its BigQuery table, staging the data
# through the temporary GCS bucket.
(
    edges_df.write
    .format("bigquery")
    .option("table", EDGES_FULL_TABLENAME)
    .option("temporaryGcsBucket", BUCKET_NAME)
    .save()
)

In [None]:
# Write the id-to-hash mapping DataFrame to its BigQuery table, staging the
# data through the temporary GCS bucket.
(
    danon_df.write
    .format("bigquery")
    .option("table", DANON_FULL_TABLENAME)
    .option("temporaryGcsBucket", BUCKET_NAME)
    .save()
)