In [5]:
import os
import requests
import pandas as pd
from io import BytesIO
from zipfile import ZipFile

## Download zip file and convert to dataframe

### Future:
* Check whether the current round's data already exists by looking for the round number in the saved filename.
* Only download when the current round's data is not already present.
* Save the result with the round number in the filename.


In [6]:
def numerai_api_query(query, timeout=30):
    """POST a GraphQL query to the Numerai tournament API and return the decoded JSON.

    Parameters
    ----------
    query : dict
        JSON-serialisable request body, e.g. ``{'query': '{dataset}'}``.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests``.
        Without a timeout a stalled connection would block the kernel
        indefinitely.

    Returns
    -------
    The parsed JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    """
    numerai_api_url = 'https://api-tournament.numer.ai/'
    headers = {'Content-Type': 'application/json',
               'Accept': 'application/json'}
    with requests.Session() as session:
        response = session.post(url=numerai_api_url,
                                json=query,
                                headers=headers,
                                timeout=timeout)
        # Fail loudly on HTTP errors instead of trying to JSON-decode an
        # error page.
        response.raise_for_status()
        return response.json()

In [11]:
def get_current_round():
    """Return the highest round number reported by the Numerai API.

    Sends the ``{rounds {number}}`` query through ``numerai_api_query`` and
    takes the maximum of the returned ``number`` fields.

    Raises
    ------
    IndexError
        If the API returns an empty list of rounds.
    """
    rounds_query = {'query': '{rounds {number}}'}
    rounds = numerai_api_query(rounds_query)['data']['rounds']
    if not rounds:
        # Same exception type as indexing an empty list, but with a message
        # that says what actually went wrong.
        raise IndexError('Numerai API returned no rounds')
    # max() is enough here; sorting the whole list just to read its first
    # element is unnecessary work.
    return max(entry['number'] for entry in rounds)

In [12]:
# Ask the API for the highest round number it currently lists.
get_current_round()

91

In [13]:
def get_dataset_url():
    """Return the value of the ``dataset`` field from the Numerai API.

    Issues the ``{dataset}`` query via ``numerai_api_query`` and extracts
    ``['data']['dataset']`` from the decoded response.
    """
    response = numerai_api_query({'query': '{dataset}'})
    payload = response['data']
    return payload['dataset']


def download_dataset_as_df(dataset_url, timeout=60):
    """Download the dataset zip and return its two CSVs as one DataFrame.

    The archive is held in memory; ``numerai_training_data.csv`` and
    ``numerai_tournament_data.csv`` are read from it and concatenated,
    training rows first.

    Parameters
    ----------
    dataset_url : str
        URL of the zip archive (as returned by ``get_dataset_url``).
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests``.

    Returns
    -------
    pandas.DataFrame
        Training rows followed by tournament rows. Each part keeps its own
        row index, so index labels are repeated in the result.

    Raises
    ------
    requests.HTTPError
        If the download answers with a 4xx/5xx status code.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    KeyError
        If either expected CSV member is missing from the archive.
    """
    with requests.Session() as session:
        # No ``stream=True``: the whole body is needed in memory anyway, and
        # reading ``.content`` would cancel out streaming.
        response = session.get(dataset_url, timeout=timeout)
        # Stop on an HTTP error; otherwise the error body would be handed to
        # ZipFile and surface as a confusing BadZipFile.
        response.raise_for_status()
        dataset_download = response.content

    with ZipFile(BytesIO(dataset_download)) as dataset_zip:
        with dataset_zip.open('numerai_training_data.csv') as train_data:
            df_train = pd.read_csv(train_data)
        with dataset_zip.open('numerai_tournament_data.csv') as live_data:
            df_live = pd.read_csv(live_data)

    return pd.concat([df_train, df_live])

In [14]:
# Fetch the dataset download URL, then pull the round number out of it.
dataset_url = get_dataset_url()
# Element 3 of the '/'-split URL is the first path component after the host
# for a 'scheme://host/...' URL. Assumes the API keeps the round number
# there -- TODO confirm. The result is a string, not an int.
round_number = dataset_url.split('/')[3]
round_number

'91'

In [16]:
# Download the zip at dataset_url and load both of its CSVs into one DataFrame.
df = download_dataset_as_df(dataset_url)

In [17]:
# Summarise column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 637024 entries, 0 to 243410
Data columns (total 54 columns):
id           637024 non-null object
era          637024 non-null object
data_type    637024 non-null object
feature1     637024 non-null float64
feature2     637024 non-null float64
feature3     637024 non-null float64
feature4     637024 non-null float64
feature5     637024 non-null float64
feature6     637024 non-null float64
feature7     637024 non-null float64
feature8     637024 non-null float64
feature9     637024 non-null float64
feature10    637024 non-null float64
feature11    637024 non-null float64
feature12    637024 non-null float64
feature13    637024 non-null float64
feature14    637024 non-null float64
feature15    637024 non-null float64
feature16    637024 non-null float64
feature17    637024 non-null float64
feature18    637024 non-null float64
feature19    637024 non-null float64
feature20    637024 non-null float64
feature21    637024 non-null float64
feat

In [28]:
def save_dataset(df, round_num):
    """Write ``df`` to ``<project>/data/raw/<round_num>_numerai_raw.csv``.

    ``<project>`` is the parent of the current working directory. The target
    directory is created if it does not exist yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; written with ``DataFrame.to_csv`` defaults.
    round_num : str or int
        Round identifier used as the filename prefix.
    """
    # A notebook has no ``__file__``, so the project root is derived from the
    # working directory. Assumes the notebook runs from a direct subdirectory
    # of the project (e.g. ``notebooks/``).
    project_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    raw_data_path = os.path.join(project_dir, 'data', 'raw')
    # ``to_csv`` does not create missing folders, so make sure the target
    # directory exists before writing.
    os.makedirs(raw_data_path, exist_ok=True)
    raw_data_file = os.path.join(raw_data_path, '{}_numerai_raw.csv'.format(round_num))
    df.to_csv(raw_data_file)
    
    

In [30]:
# Persist the combined frame as a CSV named after the round.
save_dataset(df, round_number)


/home/sean/Projects/numerai/numerai/notebooks/..
/bin/sh: 1: ll: not found


In [31]:
!ls ~/Projects/numerai/numerai/data/raw/

91_numerai_raw.csv


In [32]:
# Rebuild the raw-data directory path from the working directory and list
# everything in it. No filtering is applied, so non-CSV entries are included.
project_dir = os.path.join(os.getcwd(), os.pardir)
raw_data_path = os.path.join(project_dir, 'data', 'raw')
files = os.listdir(raw_data_path)
    

In [33]:
# Display the collected directory listing.
files

['.gitkeep', '91_numerai_raw.csv']