Skip to content

Commit

Permalink
Support URLDataSource
Browse files Browse the repository at this point in the history
  • Loading branch information
ayemos committed Dec 25, 2017
1 parent 835fb09 commit 9fb2d13
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 1 deletion.
2 changes: 2 additions & 0 deletions akagi/contents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
from akagi.contents import s3_content
from akagi.contents import local_file_content
from akagi.contents import spreadsheet_content
from akagi.contents import url_content

from akagi.contents.s3_content import S3Content
from akagi.contents.local_file_content import LocalFileContent
from akagi.contents.spreadsheet_content import SpreadsheetContent
from akagi.contents.url_content import URLContent
26 changes: 26 additions & 0 deletions akagi/contents/url_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import re
import os
from six.moves.urllib import parse, request

from akagi.content import Content
from akagi.iterator import Iterator


class URLContent(Content):
    '''Content whose body is fetched from a remote URL.

    The URL string is parsed once at construction time; the remote
    resource is only requested when ``_body`` is accessed.
    '''

    is_local = False

    def __init__(self, urlstr, file_format='csv'):
        '''Parse *urlstr* and pick the iterator class for *file_format*.

        :param urlstr: URL of the remote resource, as a string.
        :param file_format: format name handed to
            ``Iterator.get_iterator_class``.
        '''
        self._url = parse.urlparse(urlstr)
        self.file_format = file_format
        self.iterator_class = Iterator.get_iterator_class(file_format)

    def __iter__(self):
        '''Return a new instance of the iterator class wrapping this content.'''
        return self.iterator_class(self)

    @property
    def key(self):
        '''Host and path of the URL joined into a single relative path.

        Every leading slash is stripped from the URL path, not only the
        first one: ``os.path.join`` discards all preceding components when
        a later component is absolute, so a path such as ``//a/b.csv``
        would otherwise yield a key without the host part.
        '''
        relative_path = re.sub(r'^/+', '', self._url.path)
        return os.path.join(self._url.netloc, relative_path)

    @property
    def _body(self):
        '''Open the URL and return the object produced by ``urlopen``.

        A new request is issued on every access, and this property does
        not close the returned response.
        '''
        return request.urlopen(self._url.geturl())
11 changes: 10 additions & 1 deletion akagi/data_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from akagi.log import logger

from akagi.contents import S3Content, LocalFileContent, SpreadsheetContent
from akagi.contents import S3Content, LocalFileContent, SpreadsheetContent, URLContent


def data_files_for_s3_prefix(bucket_name, prefix, file_format='csv'):
Expand All @@ -20,6 +20,10 @@ def data_files_for_dir(dir_path, file_format='csv'):
os.listdir(dir_path)]


def data_files_for_urls(urls, file_format='csv'):
    '''Build one DataFile per entry of *urls*, preserving iteration order.

    Each DataFile is created through ``DataFile.url`` with the given
    *file_format*.
    '''
    data_files = []
    for location in urls:
        data_files.append(DataFile.url(location, file_format))
    return data_files


class DataFile(object):
def __init__(self, content):
self._content = content
Expand All @@ -32,6 +36,11 @@ def s3(cls, bucket, key, file_format='csv'):
content = S3Content(bucket, key, file_format)
return DataFile(content)

@classmethod
def url(cls, url, file_format='csv'):
    '''Create a DataFile backed by a URLContent built from *url*.

    *file_format* is passed through to the URLContent constructor.
    '''
    return DataFile(URLContent(url, file_format))

@classmethod
def local_file(cls, path, file_format='csv'):
content = LocalFileContent(path, file_format)
Expand Down
2 changes: 2 additions & 0 deletions akagi/data_sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
from akagi.data_sources import s3_data_source
from akagi.data_sources import local_data_source
from akagi.data_sources import spreadsheet_data_source
from akagi.data_sources import url_data_source


from akagi.data_sources.redshift_data_source import RedshiftDataSource
from akagi.data_sources.s3_data_source import S3DataSource
from akagi.data_sources.local_data_source import LocalDataSource
from akagi.data_sources.spreadsheet_data_source import SpreadsheetDataSource
from akagi.data_sources.url_data_source import URLDataSource
20 changes: 20 additions & 0 deletions akagi/data_sources/url_data_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from akagi.data_source import DataSource

from akagi.data_file import data_files_for_urls


class URLDataSource(DataSource):
    '''URLDataSource represents a set of files at remote locations.
    '''

    @classmethod
    def for_urls(cls, urls, file_format='csv', no_cache=False):
        '''Alternate constructor; takes the same arguments as ``__init__``.

        Instantiates ``cls`` so that subclasses get instances of their
        own type.
        '''
        return cls(urls, file_format=file_format, no_cache=no_cache)

    def __init__(self, urls, file_format='csv', no_cache=False):
        '''Remember the URLs and the options used to read them.

        :param urls: iterable of URL strings, one per remote file.
        :param file_format: format name forwarded to
            ``data_files_for_urls``.
        :param no_cache: stored as ``_no_cache``.
        '''
        self._urls = urls
        self._file_format = file_format
        # The argument used to be accepted and then silently dropped; keep
        # the caller's value on the instance.
        # NOTE(review): whether the DataSource base class reads
        # ``_no_cache`` is not visible from this file -- confirm.
        self._no_cache = no_cache

    @property
    def data_files(self):
        '''List of data files, one per URL, rebuilt on every access.'''
        return data_files_for_urls(self._urls, self._file_format)

0 comments on commit 9fb2d13

Please sign in to comment.