Skip to content

Commit

Permalink
Merge pull request #1396 from CartoDB/to-dataframe-csv
Browse files Browse the repository at this point in the history
to_csv & to_dataframe
  • Loading branch information
oleurud committed Dec 24, 2019
2 parents a50f73b + 6422351 commit 0c1bf8d
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 33 deletions.
40 changes: 33 additions & 7 deletions cartoframes/data/observatory/catalog/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,8 +396,8 @@ def _join_geographies_geodataframes(geographies_gdf1, geographies_gdf2):
return join_gdf['id'].unique()

@check_do_enabled
def download(self, file_path, credentials=None):
"""Download dataset data as a local file. You need Data Observatory enabled in your CARTO
def to_csv(self, file_path, credentials=None):
"""Download dataset data as a local csv file. You need Data Observatory enabled in your CARTO
account, please contact us at support@carto.com for more information.
For premium datasets (those with `is_public_data` set to False), you need a subscription to the dataset.
Expand All @@ -410,10 +410,35 @@ def download(self, file_path, credentials=None):
the default credentials (if set with :py:meth:`set_default_credentials
<cartoframes.auth.set_default_credentials>`) will be used.
:raises CartoException: If you do not have a valid license for the dataset being downloaded.
:raises ValueError: If the credentials argument is not valid.
"""
_credentials = get_credentials(credentials)

if not self._is_subscribed(_credentials):
raise Exception('You are not subscribed to this Dataset yet. '
'Please, use the subscribe method first.')

self._download(_credentials, file_path)

@check_do_enabled
def to_dataframe(self, credentials=None):
"""Download dataset data as a pandas.DataFrame. You need Data Observatory enabled in your CARTO
account, please contact us at support@carto.com for more information.
For premium datasets (those with `is_public_data` set to False), you need a subscription to the dataset.
Check the subscription guides for more information.
Args:
credentials (:py:class:`Credentials <cartoframes.auth.Credentials>`, optional):
credentials of CARTO user account. If not provided,
the default credentials (if set with :py:meth:`set_default_credentials
<cartoframes.auth.set_default_credentials>`) will be used.
Returns:
os.path with the local file path with the file downloaded
pandas.DataFrame
:raises Exception: If you have not a valid license for the dataset being downloaded.
:raises CartoException: If you do not have a valid license for the dataset being downloaded.
:raises ValueError: If the credentials argument is not valid.
"""
_credentials = get_credentials(credentials)
Expand All @@ -422,7 +447,7 @@ def download(self, file_path, credentials=None):
raise Exception('You are not subscribed to this Dataset yet. '
'Please, use the subscribe method first.')

self._download(file_path, _credentials)
return self._download(_credentials)

@check_do_enabled
def subscribe(self, credentials=None):
Expand All @@ -436,8 +461,9 @@ def subscribe(self, credentials=None):
See :py:meth:`subscription_info <cartoframes.data.observatory.Dataset.subscription_info>` for more
info
Once you subscribe to a dataset, you can :py:attr:`Dataset.download` its data and use the
:obj:`Enrichment` functions. See the enrichment guides for more info.
Once you subscribe to a dataset, you can download its data by :py:attr:`Dataset.to_csv` or
:py:attr:`Dataset.to_dataframe` and use the :obj:`Enrichment` functions.
See the enrichment guides for more info.
You can check the status of your subscriptions by calling the
:py:meth:`subscriptions <cartoframes.data.observatory.Catalog.subscriptions>` method in the :obj:`Catalog` with
Expand Down
12 changes: 7 additions & 5 deletions cartoframes/data/observatory/catalog/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def _get_print_id(self):

return self.id

def _download(self, file_path, credentials):
def _download(self, credentials, file_path=None):
if not self._is_available_in('bq'):
raise CartoException('{} is not ready for Download. Please, contact us for more information.'.format(self))

Expand All @@ -125,10 +125,12 @@ def _download(self, file_path, credentials):
query = 'SELECT * FROM `{}`'.format(full_remote_table_name)
job = bq_client.query(query)

bq_client.download_to_file(job, file_path, column_names=column_names)

log.info('Data saved: {}.'.format(file_path))
log.info("To read it you can do: `pandas.read_csv('{}')`.".format(file_path))
if file_path:
bq_client.download_to_file(job, file_path, column_names=column_names)
log.info('Data saved: {}.'.format(file_path))
log.info("To read it you can do: `pandas.read_csv('{}')`.".format(file_path))
else:
return bq_client.download_to_dataframe(job)

def _is_available_in(self, platform=_PLATFORM_BQ):
return self.data['available_in'] and platform in self.data['available_in']
Expand Down
37 changes: 31 additions & 6 deletions cartoframes/data/observatory/catalog/geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,8 @@ def get_all(cls, filters=None, credentials=None):
return cls._entity_repo.get_all(filters, credentials)

@check_do_enabled
def download(self, file_path, credentials=None):
"""Download geography data as a local file. You need Data Observatory enabled in your CARTO
def to_csv(self, file_path, credentials=None):
"""Download geography data as a local csv file. You need Data Observatory enabled in your CARTO
account, please contact us at support@carto.com for more information.
For premium geographies (those with `is_public_data` set to False), you need a subscription to the geography.
Expand All @@ -213,8 +213,33 @@ def download(self, file_path, credentials=None):
the default credentials (if set with :py:meth:`set_default_credentials
<cartoframes.auth.set_default_credentials>`) will be used.
:raises CartoException: If you do not have a valid license for the dataset being downloaded.
:raises ValueError: If the credentials argument is not valid.
"""
_credentials = get_credentials(credentials)

if not self._is_subscribed(_credentials):
raise Exception('You are not subscribed to this Geography yet. '
'Please, use the subscribe method first.')

self._download(_credentials, file_path)

@check_do_enabled
def to_dataframe(self, credentials=None):
"""Download geography data as a pandas.DataFrame. You need Data Observatory enabled in your CARTO
account, please contact us at support@carto.com for more information.
For premium geographies (those with `is_public_data` set to False), you need a subscription to the geography.
Check the subscription guides for more information.
Args:
credentials (:py:class:`Credentials <cartoframes.auth.Credentials>`, optional):
credentials of CARTO user account. If not provided,
the default credentials (if set with :py:meth:`set_default_credentials
<cartoframes.auth.set_default_credentials>`) will be used.
Returns:
A string with the local file path with the file downloaded
pandas.DataFrame
:raises CartoException: If you do not have a valid license for the dataset being downloaded.
:raises ValueError: If the credentials argument is not valid.
Expand All @@ -225,7 +250,7 @@ def download(self, file_path, credentials=None):
raise Exception('You are not subscribed to this Geography yet. '
'Please, use the subscribe method first.')

self._download(file_path, _credentials)
return self._download(_credentials)

@check_do_enabled
def subscribe(self, credentials=None):
Expand All @@ -239,8 +264,8 @@ def subscribe(self, credentials=None):
See :py:meth:`subscription_info <cartoframes.data.observatory.Geography.subscription_info>` for more
info
Once you :py:attr:`Geography.subscribe` to a geography you can :py:attr:`Geography.download` its data and
use the enrichment functions. See the enrichment guides for more info.
Once you :py:attr:`Geography.subscribe` to a geography you can download its data by :py:attr:`Geography.to_csv`
or :py:attr:`Geography.to_dataframe` and use the enrichment functions. See the enrichment guides for more info.
You can check the status of your subscriptions by calling the
:py:meth:`subscriptions <cartoframes.data.observatory.Catalog.subscriptions>` method in the :obj:`Catalog` with
Expand Down
20 changes: 19 additions & 1 deletion tests/unit/data/client/test_bigquery_client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import csv
import pandas as pd

from unittest.mock import Mock, patch

Expand Down Expand Up @@ -39,7 +40,7 @@ def teardown_method(self):

@patch.object(BigQueryClient, 'get_table_column_names')
@patch.object(BigQueryClient, '_download_by_bq_storage_api')
def test_download_full(self, download_mock, column_names_mock):
def test_download_to_file_full(self, download_mock, column_names_mock):
data = [{'0': 'word', '1': 'word word'}]
columns = ['column1', 'column2']

Expand All @@ -60,3 +61,20 @@ def test_download_full(self, download_mock, column_names_mock):

assert rows[0] == columns
assert rows[1] == list(data[0].values())

# Checks that BigQueryClient.download_to_dataframe returns a DataFrame equal to
# one built from the rows handed back by the (patched) storage-API download.
# Stacked patch decorators are applied bottom-up, so the first mock argument
# (download_mock) replaces `_download_by_bq_storage_api` and the second
# (column_names_mock) replaces `get_table_column_names`.
@patch.object(BigQueryClient, 'get_table_column_names')
@patch.object(BigQueryClient, '_download_by_bq_storage_api')
def test_download_to_dataframe_full(self, download_mock, column_names_mock):
# A single row, keyed by the same names listed in `columns`.
data = [{'column1': 'word', 'column2': 'word word'}]
columns = ['column1', 'column2']

# NOTE(review): the return value is a Mock wrapping `columns`, not the list
# itself -- confirm this matches how the client consumes column names.
column_names_mock.return_value = Mock(return_value=columns)
# The patched download hands `data` back directly.
download_mock.return_value = data

# Reference frame built from the same rows and column order.
expected_df = pd.DataFrame(data, columns=columns)

bq_client = BigQueryClient(self.credentials)
job = QueryJobMock(data)
df = bq_client.download_to_dataframe(job)

# DataFrame.equals compares both the values and the column labels.
assert df.equals(expected_df)
15 changes: 7 additions & 8 deletions tests/unit/data/observatory/catalog/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,22 +302,22 @@ def test_dataset_download(self, mocked_bq_client, get_by_id_mock, get_all_mock):
credentials = Credentials('fake_user', '1234')

# Then
dataset.download('fake_path', credentials)
dataset.to_csv('fake_path', credentials)

@patch.object(DatasetRepository, 'get_all')
@patch.object(DatasetRepository, 'get_by_id')
@patch('cartoframes.data.observatory.catalog.entity._get_bigquery_client')
def test_dataset_download_not_subscribed(self, mocked_bq_client, get_by_id_mock, get_all_mock):
# Given
get_by_id_mock.return_value = test_dataset2
def test_dataset_not_subscribed_download_fails(self, mocked_bq_client, get_by_id_mock, get_all_mock):
# mock dataset
get_by_id_mock.return_value = test_dataset2 # is private
dataset = Dataset.get(test_dataset2.id)
get_all_mock.return_value = []
mocked_bq_client.return_value = BigQueryClientMock()
credentials = Credentials('fake_user', '1234')

# When
with pytest.raises(Exception) as e:
dataset.download('fake_path', credentials)
dataset.to_csv('fake_path', credentials)

# Then
assert str(e.value) == (
Expand All @@ -335,8 +335,7 @@ def test_dataset_download_not_subscribed_but_public(self, mocked_bq_client, get_
mocked_bq_client.return_value = BigQueryClientMock()
credentials = Credentials('fake_user', '1234')

# Then
dataset.download('fake_path', credentials)
dataset.to_csv('fake_path', credentials)

@patch.object(DatasetRepository, 'get_all')
@patch.object(DatasetRepository, 'get_by_id')
Expand All @@ -353,7 +352,7 @@ def test_dataset_download_without_do_enabled(self, mocked_bq_client, get_by_id_m

# When
with pytest.raises(Exception) as e:
dataset.download('fake_path', credentials)
dataset.to_csv('fake_path', credentials)

# Then
assert str(e.value) == (
Expand Down
34 changes: 28 additions & 6 deletions tests/unit/data/observatory/catalog/test_geography.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,29 @@ def test_geographies_are_exported_as_dataframe(self):
assert isinstance(sliced_geography, pd.Series)
assert sliced_geography.equals(expected_geography_df)

# Checks that Geography.to_csv fails with the "not ready for Download" message
# (the one raised by the availability check in entity._download), rather than
# with the "not subscribed" error.
# Stacked patch decorators are applied bottom-up, so the mock arguments arrive
# as: _get_bigquery_client, get_by_id, get_all.
@patch.object(GeographyRepository, 'get_all')
@patch.object(GeographyRepository, 'get_by_id')
@patch('cartoframes.data.observatory.catalog.entity._get_bigquery_client')
def test_geography_not_available_in_bq_download_fails(self, mocked_bq_client, get_by_id_mock, get_all_mock):
# Mock the geography: the patched get_by_id hands back test_geography2.
get_by_id_mock.return_value = test_geography2
geography = Geography.get(test_geography2.id)

# Mock subscriptions: get_all returns this geography.
# NOTE(review): presumably this makes the subscription check pass, so the
# failure below can only come from the availability check -- confirm.
get_all_mock.return_value = [geography]

# Mock the BigQuery client factory used by the entity module.
mocked_bq_client.return_value = BigQueryClientMock()

# Exercise to_csv with fake credentials and a fake target path.
credentials = Credentials('fake_user', '1234')

with pytest.raises(Exception) as e:
geography.to_csv('fake_path', credentials)

# The message must match the availability error, formatted with the geography.
error = '{} is not ready for Download. Please, contact us for more information.'.format(geography)
assert str(e.value) == error

@patch.object(GeographyRepository, 'get_all')
@patch.object(GeographyRepository, 'get_by_id')
@patch('cartoframes.data.observatory.catalog.entity._get_bigquery_client')
Expand All @@ -228,22 +251,22 @@ def test_geography_download(self, mocked_bq_client, get_by_id_mock, get_all_mock
credentials = Credentials('fake_user', '1234')

# Then
geography.download('fake_path', credentials)
geography.to_csv('fake_path', credentials)

@patch.object(GeographyRepository, 'get_all')
@patch.object(GeographyRepository, 'get_by_id')
@patch('cartoframes.data.observatory.catalog.entity._get_bigquery_client')
def test_geography_download_not_subscribed(self, mocked_bq_client, get_by_id_mock, get_all_mock):
# Given
get_by_id_mock.return_value = test_geography2 # is private
get_by_id_mock.return_value = test_geography2
geography = Geography.get(test_geography2.id)
get_all_mock.return_value = []
mocked_bq_client.return_value = BigQueryClientMock()
credentials = Credentials('fake_user', '1234')

# When
with pytest.raises(Exception) as e:
geography.download('fake_path', credentials)
geography.to_csv('fake_path', credentials)

# Then
assert str(e.value) == (
Expand All @@ -261,8 +284,7 @@ def test_geography_download_not_subscribed_but_public(self, mocked_bq_client, ge
mocked_bq_client.return_value = BigQueryClientMock()
credentials = Credentials('fake_user', '1234')

# Then
geography.download('fake_path', credentials)
geography.to_csv('fake_path', credentials)

@patch.object(GeographyRepository, 'get_all')
@patch.object(GeographyRepository, 'get_by_id')
Expand All @@ -279,7 +301,7 @@ def test_geography_download_without_do_enabled(self, mocked_bq_client, get_by_id

# When
with pytest.raises(Exception) as e:
geography.download('fake_path', credentials)
geography.to_csv('fake_path', credentials)

# Then
assert str(e.value) == (
Expand Down

0 comments on commit 0c1bf8d

Please sign in to comment.