gcsfs/core.py

# -*- coding: utf-8 -*-
"""
Google Cloud Storage pythonic interface
"""
from __future__ import print_function

import decorator

import array
from base64 import b64encode
import google.auth as gauth
from google.auth.transport.requests import AuthorizedSession
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.oauth2 import service_account
from hashlib import md5
import io
import json
import logging
import traceback
import os
import posixpath
import pickle
import re
import requests
import sys
import time
import warnings

from requests.exceptions import RequestException
from .utils import HtmlError
from .utils import is_retriable
from .utils import read_block

PY2 = sys.version_info.major == 2

logger = logging.getLogger(__name__)

@decorator.decorator
def _tracemethod(f, self, *args, **kwargs):
   logger.debug("%s(args=%s, kwargs=%s)", f.__name__, args, kwargs)
   if logger.isEnabledFor(logging.DEBUG-1):
       tb_io = io.StringIO()
       traceback.print_stack(file=tb_io)
       logger.log(logging.DEBUG - 1, tb_io.getvalue())

   return f(self, *args, **kwargs)

# client created 23-Sept-2017
not_secret = {"client_id": "586241054156-0asut23a7m10790r2ik24309flribp7j"
                           ".apps.googleusercontent.com",
              "client_secret": "w6VkI99jS6e9mECscNztXvQv"}
client_config = {'installed': {
    'client_id': not_secret['client_id'],
    'client_secret': not_secret['client_secret'],
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://accounts.google.com/o/oauth2/token"
}}
tfile = os.path.join(os.path.expanduser("~"), '.gcs_tokens')
ACLs = {"authenticatedread", "bucketownerfullcontrol", "bucketownerread",
        "private", "projectprivate", "publicread"}
bACLs = {"authenticatedRead", "private", "projectPrivate", "publicRead",
         "publicReadWrite"}
DEFAULT_PROJECT = os.environ.get('GCSFS_DEFAULT_PROJECT', '')
DEBUG = False

GCS_MIN_BLOCK_SIZE = 2 ** 18
DEFAULT_BLOCK_SIZE = 5 * 2 ** 20
READ_BLOCK_SIZE = 5 * 2 ** 20

if PY2:
    FileNotFoundError = IOError


def quote_plus(s):
    """
    Convert some URL elements to be HTTP-safe.

    Not the same as in urllib, because, for instance, parentheses and commas
    are passed through.

    Parameters
    ----------
    s: input URL/portion

    Returns
    -------
    corrected URL
    """
    s = s.replace('/', '%2F')
    s = s.replace(' ', '%20')
    return s


def norm_path(path):
    """Canonicalize path by split and rejoining."""
    # TODO Should canonical path include protocol?
    return "/".join(split_path(path))

def split_path(path):
    """
    Normalise GCS path string into bucket and key.

    Parameters
    ----------
    path : string
        Input path, like `gcs://mybucket/path/to/file`.
        Path is of the form: '[gs|gcs://]bucket[/key]'

    Returns
    -------
        (bucket, key) tuple

    Examples
    --------
    >>> split_path("gcs://mybucket/path/to/file")
    ['mybucket', 'path/to/file']
    >>> split_path("mybucket/path/to/file")
    ['mybucket', 'path/to/file']
    >>> split_path("gs://mybucket")
    ['mybucket', '']
    """
    if path.startswith('gcs://'):
        path = path[6:]
    if path.startswith('gs://'):
        path = path[5:]
    if path.startswith('/'):
        path = path[1:]
    if '/' not in path:
        return path, ""
    else:
        return path.split('/', 1)


def validate_response(r, path):
    """
    Check the requests object r, raise error if it's not ok.

    Parameters
    ----------
    r: requests response object
    path: associated URL path, for error messages
    """
    if not r.ok:
        m = str(r.content)
        error = None
        try:
            error = r.json()['error']
            msg = error['message']
        except:
            msg = str(r.content)

        if DEBUG:
            print(r.url, r.headers, sep='\n')
        if "Not Found" in m:
            raise FileNotFoundError(path)
        elif "forbidden" in m:
            raise IOError("Forbidden: %s\n%s" % (path, msg))
        elif "invalid" in m:
            raise ValueError("Bad Request: %s\n%s" % (path, msg))
        elif error:
            raise HtmlError(error)
        else:
            raise RuntimeError(m)


class GCSFileSystem(object):
    """
    Connect to Google Cloud Storage.

    The following modes of authentication are supported:
    - ``token=None``, GCSFS will attempt to guess your credentials in the
      following order: gcloud CLI default, gcsfs cached token, google compute
      metadata service, anonymous.
    - ``token='google_default'``, your default gcloud credentials will be used,
      which are typically established by doing ``gcloud login`` in a terminal.
    - ``token=='cache'``, credentials from previously successful gcsfs
      authentication will be used (use this after "browser" auth succeeded)
    - ``token='anon'``, no authentication is preformed, and you can only
      access data which is accessible to allUsers (in this case, the project and
      access level parameters are meaningless)
    - ``token='browser'``, you get an access code with which you can
      authenticate via a specially provided URL
    - if ``token='cloud'``, we assume we are running within google compute
      or google container engine, and query the internal metadata directly for
      a token.
    - you may supply a token generated by the
      [gcloud](https://cloud.google.com/sdk/docs/)
      utility; this is either a python dictionary, the name of a file
      containing the JSON returned by logging in with the gcloud CLI tool,
      or a Credentials object. gcloud typically stores its tokens in locations
      such as
      ``~/.config/gcloud/application_default_credentials.json``,
      `` ~/.config/gcloud/credentials``, or
      ``~\AppData\Roaming\gcloud\credentials``, etc.

    Parameters
    ----------
    project : string
        project_id to work under. Note that this is not the same as, but ofter
        very similar to, the project name.
        This is required in order
        to list all the buckets you have access to within a project and to
        create/delete buckets, or update their access policies.
        If ``token='google_default'``, the value is overriden by the default,
        if ``token='anon'``, the value is ignored.
    access : one of {'read_only', 'read_write', 'full_control'}
        Full control implies read/write as well as modifying metadata,
        e.g., access control.
    token: None, dict or string
        (see description of authentication methods, above)
    consistency: 'none', 'size', 'md5'
        Check method when writing files. Can be overridden in open().
    cache_timeout: float, seconds
        Cache expiration time in seconds for object metadata cache.
        Set cache_timeout <= 0 for no caching, None for no cache expiration.
    """
    scopes = {'read_only', 'read_write', 'full_control'}
    retries = 4  # number of retries on http failure
    base = "https://www.googleapis.com/storage/v1/"
    _singleton = [None]
    default_block_size = DEFAULT_BLOCK_SIZE

    def __init__(self, project=DEFAULT_PROJECT, access='full_control',
                 token=None, block_size=None, consistency='none', cache_timeout = 60 ):
        if access not in self.scopes:
            raise ValueError('access must be one of {}', self.scopes)
        if project is None:
            warnings.warn('GCS project not set - cannot list or create buckets')
        if block_size is not None:
            self.default_block_size = block_size
        self.project = project
        self.access = access
        self.scope = "https://www.googleapis.com/auth/devstorage." + access
        self.consistency = consistency
        self.token = token
        self.session = None
        self.connect(method=token)


        self._singleton[0] = self

        self.cache_timeout = cache_timeout
        self._listing_cache = {}

    @classmethod
    def current(cls):
        """ Return the most recently created GCSFileSystem

        If no GCSFileSystem has been created, then create one
        """
        if not cls._singleton[0]:
            return GCSFileSystem()
        else:
            return cls._singleton[0]

    @staticmethod
    def load_tokens():
        try:
            with open(tfile, 'rb') as f:
                tokens = pickle.load(f)
            # backwards compatability
            tokens = {k: (GCSFileSystem._dict_to_credentials(v)
                          if isinstance(v, dict) else v)
                      for k, v in tokens.items()}
        except IOError:
            tokens = {}
        GCSFileSystem.tokens = tokens

    def _connect_google_default(self):
        credentials, project = gauth.default()
        self.project = project
        self.session = AuthorizedSession(credentials)

    def _connect_cloud(self):
        credentials = gauth.compute_engine.Credentials()
        self.session = AuthorizedSession(credentials)

    def _connect_cache(self):
        project, access = self.project, self.access
        if (project, access) in self.tokens:
            credentials = self.tokens[(project, access)]
            self.session = AuthorizedSession(credentials)

    @staticmethod
    def _dict_to_credentials(token):
        """
        Convert old dict-style token.

        Does not preserve access token itself, assumes refresh required.
        """
        return Credentials(
            None, refresh_token=token['refresh_token'],
            client_secret=token['client_secret'],
            client_id=token['client_id'],
            token_uri='https://www.googleapis.com/oauth2/v4/token'
        )

    def _connect_token(self, token):
        """
        Connect using a concrete token

        Parameters
        ----------
        token: str, dict or Credentials
            If a str, try to load as a Service file, or next as a JSON; if
            dict, try to interpret as credentials; if Credentials, use directly.
        """
        if isinstance(token, str):
            if not os.path.exists(token):
                raise FileNotFoundError(token)
            try:
                # is this a "service" token?
                self._connect_service(token)
                return
            except:
                # some other kind of token file
                # will raise exception if is not json
                token = json.load(open(token))
        if isinstance(token, dict):
            credentials = GCSFileSystem._dict_to_credentials(token)
        elif isinstance(token, Credentials):
            credentials = token
        else:
            raise ValueError('Token format no understood')
        self.session = AuthorizedSession(credentials)

    def _connect_service(self, fn):
        # raises exception if file does not match expectation
        credentials = service_account.Credentials.from_service_account_file(fn)
        self.session = AuthorizedSession(credentials)

    def _connect_anon(self):
        self.session = requests.Session()

    def _connect_browser(self):
        flow = InstalledAppFlow.from_client_config(client_config, [self.scope])
        credentials = flow.run_console()
        self.tokens[(self.project, self.access)] = credentials
        self._save_tokens()
        self.session = AuthorizedSession(credentials)

    def connect(self, method=None):
        """
        Establish session token. A new token will be requested if the current
        one is within 100s of expiry.

        Parameters
        ----------
        method: str (google_default|cache|cloud|token|anon|browser) or None
            Type of authorisation to implement - calls `_connect_*` methods.
            If None, will try sequence of methods.
        """
        if method not in ['google_default', 'cache', 'cloud', 'token', 'anon',
                          'browser', None]:
            self._connect_token(method)
        elif method is None:
            for meth in ['google_default', 'cache', 'cloud', 'anon']:
                try:
                    self.connect(method=meth)
                except:
                    logger.debug('Connection with method "%s" failed' % meth)
                if self.session:
                    break
        else:
            self.__getattribute__('_connect_' + method)()
            self.method = method

    @staticmethod
    def _save_tokens():
        try:
            with open(tfile, 'wb') as f:
                pickle.dump(GCSFileSystem.tokens, f, 2)
        except Exception as e:
            warnings.warn('Saving token cache failed: ' + str(e))

    def _call(self, method, path, *args, **kwargs):
        logger.debug("_call(%s, %s, args=%s, kwargs=%s)", method, path, args, kwargs)

        for k, v in list(kwargs.items()):
            # only pass parameters that have values
            if v is None:
                del kwargs[k]
        json = kwargs.pop('json', None)
        meth = getattr(self.session, method)
        if args:
            path = path.format(*[quote_plus(p) for p in args])
        for retry in range(self.retries):
            try:
                time.sleep(2**retry - 1)
                r = meth(self.base + path, params=kwargs, json=json)
                validate_response(r, path)
                break
            except (HtmlError, RequestException) as e:
                logger.exception("_call exception: %s", e)
                if retry == self.retries - 1:
                    raise e
                if is_retriable(e):
                    # retry
                    continue
                raise e
        try:
            out = r.json()
        except ValueError:
            out = r.content
        return out

    @property
    def buckets(self):
        """Return list of available project buckets."""
        return [b["name"] for b in self._list_buckets()["items"]]


    @classmethod
    def _process_object(self, bucket, object_metadata):
        object_metadata["size"] = int(object_metadata.get("size", 0))
        object_metadata["path"] = posixpath.join(bucket, object_metadata["name"])

        return object_metadata

    def _get_object(self, path):
        """Return object information at the given path."""
        logger.debug("_get_object(%s)", path)
        bucket, key = split_path(path)

        # Check if parent dir is in listing cache
        parent = "/".join([bucket, posixpath.dirname(key.rstrip("/"))]) + "/"
        parent_cache = self._maybe_get_cached_listing(parent)
        if parent_cache:
            cached_obj = [o for o in parent_cache["items"] if o["name"] == key]
            if cached_obj:
                logger.debug("found cached object: %s", cached_obj)
                return cached_obj[0]
            else:
                # Should error on missing cache or reprobe?
                pass

        if not key:
            # Attempt to "get" the bucket root, return error instead of
            # listing.
            raise FileNotFoundError(path)

        result = self._process_object(bucket, self._call('get', 'b/{}/o/{}', bucket, key))

        logger.debug("_get_object result: %s", result)
        return result


    @_tracemethod
    def _maybe_get_cached_listing(self, path):
        logger.debug("_maybe_get_cached_listing: %s", path)
        if path in self._listing_cache:
            retrieved_time, listing = self._listing_cache[path]
            cache_age = time.time() - retrieved_time
            if self.cache_timeout is not None and cache_age > self.cache_timeout:
                logger.debug(
                    "expired cache path: %s retrieved_time: %.3f cache_age: %.3f cache_timeout: %.3f",
                    path, retrieved_time, cache_age, self.cache_timeout
                )
                del self._listing_cache[path]
                return None

            return listing

        return None

    @_tracemethod
    def _list_objects(self, path):
        path = norm_path(path)

        clisting = self._maybe_get_cached_listing(path)
        if clisting:
            return clisting

        listing = self._do_list_objects(path)
        retrieved_time = time.time()

        self._listing_cache[path] = (retrieved_time, listing)
        return listing

    @_tracemethod
    def _do_list_objects(self, path, max_results = None):
        """Return depaginated object listing for the given {bucket}/{prefix}/ path."""
        bucket, prefix = split_path(path)
        if not prefix:
            prefix = None

        prefixes = []
        items = []
        page = self._call(
            'get', 'b/{}/o/', bucket, delimiter="/", prefix=prefix, maxResults=max_results)

        assert page["kind"] == "storage#objects"
        prefixes.extend(page.get("prefixes", []))
        items.extend(page.get("items", []))
        next_page_token = page.get('nextPageToken', None)

        while next_page_token is not None:
            page = self._call(
                'get', 'b/{}/o/', bucket, delimiter="/", prefix=prefix, maxResults=max_results,
                pageToken=next_page_token)

            assert page["kind"] == "storage#objects"
            prefixes.extend(page.get("prefixes", []))
            items.extend(page.get("items", []))
            next_page_token = page.get('nextPageToken', None)

        result = {
            "kind" : "storage#objects",
            "prefixes" : prefixes,
            "items" : items,
        }

        logger.debug("_list_objects result: %s", {k : len(result[k]) for k in ("prefixes", "items")})
        items = [self._process_object(bucket, i) for i in items]

        return result

    def _list_buckets(self):
        """Return list of all buckets under the current project."""

        logger.debug("_list_buckets")

        items = []
        page = self._call(
            'get', 'b/', project=self.project
        )

        assert page["kind"] == "storage#buckets"
        items.extend(page.get("items", []))
        next_page_token = page.get('nextPageToken', None)

        while next_page_token is not None:
            page = self._call(
                'get', 'b/', project=self.roject, pageToken=next_page_token)

            assert page["kind"] == "storage#buckets"
            items.extend(page.get("items", []))
            next_page_token = page.get('nextPageToken', None)

        result = {
            "kind" : "storage#buckets",
            "items" : items,
        }

        logger.debug("_list_buckets result: %s", {k : len(result[k]) for k in ("items",)})

        return result

    @_tracemethod
    def invalidate_cache(self, path=None):
        """
        Invalidate listing cache for given path, so that it is reloaded on next use.

        Parameters
        ----------
        path: string or None
            If None, clear all listings cached else listings at or under given path.
        """

        if not path:
            logger.debug("invalidate_cache clearing cache")
            self._listing_cache.clear()
        else:
            path = norm_path(path)
            logger.debug("invalidate_cache prefix: %s", path)

            invalid_keys = [k for k in self._listing_cache if k.startswith(path)]
            logger.debug("invalidate_cache keys: %s", invalid_keys)

            for k in invalid_keys:
                self._listing_cache.pop(k, None)

    @_tracemethod
    def mkdir(self, bucket, acl='projectPrivate',
              default_acl='bucketOwnerFullControl'):
        """
        New bucket

        Parameters
        ----------
        bucket: str
            bucket name
        acl: string, one of bACLs
            access for the bucket itself
        default_acl: str, one of ACLs
            default ACL for objects created in this bucket
        """
        self._call('post', 'b/', predefinedAcl=acl, project=self.project,
                   predefinedDefaultObjectAcl=default_acl,
                   json={"name": bucket})
        self.invalidate_cache(bucket)

    @_tracemethod
    def rmdir(self, bucket):
        """Delete an empty bucket"""
        self._call('delete', 'b/' + bucket)
        self.invalidate_cache(bucket)

    @_tracemethod
    def ls(self, path, detail=False):
        """List objects under the given '/{bucket}/{prefix} path."""
        path = norm_path(path)

        if path in ['/', '']:
            return self.buckets
        elif path.endswith("/"):
            return self._ls(path, detail)
        else:
            combined_listing = self._ls(path, detail) + self._ls(path + "/", detail)
            if detail:
                combined_entries = dict((l["path"],l) for l in combined_listing )
                combined_entries.pop(path+"/", None)
                return list(combined_entries.values())
            else:
                return list(set(combined_listing) - {path + "/"})

    def _ls(self, path, detail=False):
        listing = self._list_objects(path)
        bucket, key = split_path(path)

        if not detail:
            result = []

            # Convert item listing into list of 'item' and 'subdir/'
            # entries. Items may be of form "key/", in which case there
            # will be duplicate entries in prefix and item_names.
            item_names = [
                f["name"] for f in listing["items"] if f["name"]
            ]
            prefixes = [p for p in listing["prefixes"]]

            logger.debug("path: %s item_names: %s prefixes: %s", path, item_names, prefixes)

            return [
                posixpath.join(bucket, n) for n in set(item_names + prefixes)
            ]

        else:
            item_details = listing["items"]

            pseudodirs = [{
                    'bucket': bucket,
                    'name': prefix,
                    'path': bucket + "/" + prefix,
                    'kind': 'storage#object',
                    'size': 0,
                    'storageClass': 'DIRECTORY',
                }
                for prefix in listing["prefixes"]
            ]

            return item_details + pseudodirs

    @_tracemethod
    def walk(self, path, detail=False):
        """ Return all real keys belows path. """
        bucket, prefix = split_path(path)

        if not bucket:
            raise ValueError(
                "walk path must include target bucket: %s" % path)

        path = '/'.join([bucket, prefix])

        if path.endswith('/'):
            results = []
            listing = self.ls(path, detail=True)

            files = [l for l in listing if l["storageClass"] != "DIRECTORY"]
            dirs = [l for l in listing if l["storageClass"] == "DIRECTORY"]
            for d in dirs:
                files.extend(
                        self.walk(posixpath.join(bucket, d["name"]), detail=True))
        else:
            files = self.walk(path + "/", detail=True)
            files.extend([
                f for f in self.ls(posixpath.dirname(path), detail=True)
                if f["name"] == prefix
            ])

        if detail:
            return files
        else:
            return [posixpath.join(f["bucket"], f['name']) for f in files]

    @_tracemethod
    def du(self, path, total=False, deep=False):
        if deep:
            files = self.walk(path, True)
        else:
            files = [f for f in self.ls(path, True)]
        if total:
            return sum(f['size'] for f in files)
        return {f['path']: f['size'] for f in files}

    @_tracemethod
    def glob(self, path):
        """
        Find files by glob-matching.

        Note that the bucket part of the path must not contain a "*"
        """
        path = path.rstrip('/')
        bucket, key = split_path(path)
        path = '/'.join([bucket, key])
        if "*" in bucket:
            raise ValueError('Bucket cannot contain a "*"')
        if '*' not in path:
            path = path.rstrip('/') + '/*'
        if '/' in path[:path.index('*')]:
            ind = path[:path.index('*')].rindex('/')
            root = path[:ind + 1]
        else:
            root = ''
        allfiles = self.walk(root)
        pattern = re.compile("^" + path.replace('//', '/')
                             .rstrip('/').replace('**', '.+')
                             .replace('*', '[^/]+')
                             .replace('?', '.') + "$")
        out = [f for f in allfiles if re.match(pattern,
               f.replace('//', '/').rstrip('/'))]
        return out

    @_tracemethod
    def exists(self, path):
        bucket, key = split_path(path)
        try:
            if key:
                return bool(self.info(path))
            else:
                if bucket in self.buckets:
                    return True
                else:
                    try:
                        # Bucket may be present & viewable, but not owned by
                        # the current project. Attempt to list.
                        self._list_objects(path)
                        return True
                    except (FileNotFoundError, IOError, ValueError):
                        # bucket listing failed as it doesn't exist or we can't
                        # see it
                        return False
        except FileNotFoundError:
            return False

    @_tracemethod
    def info(self, path):
        bucket, key = split_path(path)
        if not key:
            # Return a pseudo dir for the bucket root
            return {
                'bucket': bucket,
                'name': bucket + "/",
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
            }

        try:
            return self._get_object(path)
        except FileNotFoundError:
            logger.debug("info FileNotFound at path: %s", path)
            # ls containing directory of path to determine
            # if a pseudodirectory is needed for this entry.
            ikey = key.rstrip("/")
            dkey = ikey + "/"
            assert ikey, "Stripped path resulted in root object."

            parent_listing = self.ls(
                posixpath.join(bucket, posixpath.dirname(ikey)), detail=True)
            pseudo_listing = [
                i for i in parent_listing
                if i["storageClass"] == "DIRECTORY" and i["name"] == dkey ]

            if pseudo_listing:
                return pseudo_listing[0]
            else:
                raise

    @_tracemethod
    def url(self, path):
        return self.info(path)['mediaLink']

    @_tracemethod
    def cat(self, path):
        """ Simple one-shot get of file data """
        details = self.info(path)
        return _fetch_range(details, self.session)

    @_tracemethod
    def get(self, rpath, lpath, blocksize=5 * 2 ** 20):
        with self.open(rpath, 'rb', block_size=blocksize) as f1:
            with open(lpath, 'wb') as f2:
                while True:
                    d = f1.read(blocksize)
                    if not d:
                        break
                    f2.write(d)

    @_tracemethod
    def put(self, lpath, rpath, blocksize=5 * 2 ** 20, acl=None):
        with self.open(rpath, 'wb', block_size=blocksize, acl=acl) as f1:
            with open(lpath, 'rb') as f2:
                while True:
                    d = f2.read(blocksize)
                    if not d:
                        break
                    f1.write(d)

    @_tracemethod
    def head(self, path, size=1024):
        with self.open(path, 'rb') as f:
            return f.read(size)

    @_tracemethod
    def tail(self, path, size=1024):
        if size > self.info(path)['size']:
            return self.cat(path)
        with self.open(path, 'rb') as f:
            f.seek(-size, 2)
            return f.read()

    @_tracemethod
    def merge(self, path, paths, acl=None):
        """Concatenate objects within a single bucket"""
        bucket, key = split_path(path)
        source = [{'name': split_path(p)[1]} for p in paths]
        self._call('post', 'b/{}/o/{}/compose', bucket, key,
                   destinationPredefinedAcl=acl,
                   json={'sourceObjects': source,
                         "kind": "storage#composeRequest",
                         'destination': {'name': key, 'bucket': bucket}})

    @_tracemethod
    def copy(self, path1, path2, acl=None):
        b1, k1 = split_path(path1)
        b2, k2 = split_path(path2)
        self._call('post', 'b/{}/o/{}/copyTo/b/{}/o/{}', b1, k1, b2, k2,
                   destinationPredefinedAcl=acl)

    @_tracemethod
    def mv(self, path1, path2, acl=None):
        self.copy(path1, path2, acl)
        self.rm(path1)

    @_tracemethod
    def rm(self, path, recursive=False):
        """Delete keys. If recursive, also delete all keys
        given by walk(path)"""
        if recursive:
            for p in self.walk(path):
                self.rm(p)
        else:
            bucket, key = split_path(path)
            self._call('delete', "b/{}/o/{}", bucket, key)
            self.invalidate_cache(posixpath.dirname(norm_path(path)))

    @_tracemethod
    def open(self, path, mode='rb', block_size=None, acl=None,
             consistency=None, metadata=None):
        """
        See ``GCSFile``.

        consistency: None or str
            If None, use default for this instance
        """
        if block_size is None:
            block_size = self.default_block_size
        const = consistency or self.consistency
        if 'b' in mode:
            return GCSFile(self, path, mode, block_size, consistency=const,
                           metadata=metadata)
        else:
            mode = mode.replace('t', '') + 'b'
            return io.TextIOWrapper(
                GCSFile(self, path, mode, block_size, consistency=const,
                        metadata=metadata))

    @_tracemethod
    def touch(self, path):
        with self.open(path, 'wb'):
            pass

    def read_block(self, fn, offset, length, delimiter=None):
        """ Read a block of bytes from a GCS file

        Starting at ``offset`` of the file, read ``length`` bytes.  If
        ``delimiter`` is set then we ensure that the read starts and stops at
        delimiter boundaries that follow the locations ``offset`` and ``offset
        + length``.  If ``offset`` is zero then we start at zero.  The
        bytestring returned WILL include the end delimiter string.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to filename on GCS
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> gcs.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> gcs.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\n'

        Use ``length=None`` to read to the end of the file.
        >>> gcs.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\nCharlie, 300'

        See Also
        --------
        distributed.utils.read_block
        """
        with self.open(fn, 'rb') as f:
            size = f.size
            if length is None:
                length = size
            if offset + length > size:
                length = size - offset
            bytes = read_block(f, offset, length, delimiter)
        return bytes

    def __getstate__(self):
        d = self.__dict__.copy()
        d["_listing_cache"] = {}
        logger.debug("Serialize with state: %s", d)
        return d

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.connect(self.token)


GCSFileSystem.load_tokens()


class GCSFile:

    @_tracemethod
    def __init__(self, gcsfs, path, mode='rb', block_size=DEFAULT_BLOCK_SIZE,
                 acl=None, consistency='md5', metadata=None):
        """
        Open a file.

        Parameters
        ----------
        gcsfs: instance of GCSFileSystem
        path: str
            location in GCS, like 'bucket/path/to/file'
        mode: str
            Normal file modes. Currently only 'wb' amd 'rb'.
        block_size: int
            Buffer size for reading or writing
        acl: str
            ACL to apply, if any, one of ``ACLs``. New files are normally
            "bucketownerfullcontrol", but a default can be configured per
            bucket.
        consistency: str, 'none', 'size', 'md5'
            Check for success in writing, applied at file close.
            'size' ensures that the number of bytes reported by GCS matches
            the number we wrote; 'md5' does a full checksum. Any value other
            than 'size' or 'md5' is assumed to mean no checking.
        metadata: dict
            Custom metadata, in key/value pairs, added at file creation
        """
        bucket, key = split_path(path)
        if not key:
            raise OSError('Attempt to open a bucket')
        self.gcsfs = gcsfs
        self.bucket = bucket
        self.key = key
        self.metadata = metadata
        self.mode = mode
        self.blocksize = block_size
        self.cache = b""
        self.loc = 0
        self.acl = acl
        self.end = None
        self.start = None
        self.closed = False
        self.trim = True
        self.consistency = consistency
        if self.consistency == 'md5':
            self.md5 = md5()
        if mode not in {'rb', 'wb'}:
            raise NotImplementedError('File mode not supported')
        if mode == 'rb':
            self.details = gcsfs.info(path)
            self.size = self.details['size']
        else:
            if block_size < GCS_MIN_BLOCK_SIZE:
                warnings.warn('Setting block size to minimum value, 2**18')
                self.blocksize = GCS_MIN_BLOCK_SIZE
            self.buffer = io.BytesIO()
            self.offset = 0
            self.forced = False
            self.location = None

    def info(self):
        """ File information about this path """
        return self.details

    def url(self):
        return self.details['mediaLink']

    def tell(self):
        """ Current file location """
        return self.loc

    @_tracemethod
    def seek(self, loc, whence=0):
        """ Set current file location

        Parameters
        ----------
        loc : int
            byte location
        whence : {0, 1, 2}
            from start of file, current location or end of file, resp.
        """
        if not self.mode == 'rb':
            raise ValueError('Seek only available in read mode')
        if whence == 0:
            nloc = loc
        elif whence == 1:
            nloc = self.loc + loc
        elif whence == 2:
            nloc = self.size + loc
        else:
            raise ValueError(
                "invalid whence (%s, should be 0, 1 or 2)" % whence)
        if nloc < 0:
            raise ValueError('Seek before start of file')
        self.loc = nloc
        return self.loc

    def readline(self, length=-1):
        """
        Read and return a line from the stream.

        If length is specified, at most size bytes will be read.
        """
        self._fetch(self.loc, self.loc + 1)
        while True:
            found = self.cache[self.loc - self.start:].find(b'\n') + 1
            if 0 < length < found:
                return self.read(length)
            if found:
                return self.read(found)
            if self.end > self.size:
                return self.read(length)
            self._fetch(self.start, self.end + self.blocksize)

    def __next__(self):
        data = self.readline()
        if data:
            return data
        else:
            raise StopIteration

    next = __next__

    def __iter__(self):
        return self

    def readlines(self):
        """ Return all lines in a file as a list """
        return list(self)

    def write(self, data):
        """
        Write data to buffer.

        Buffer only sent to GCS on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data : bytes
            Set of bytes to be written.
        """
        if self.mode not in {'wb', 'ab'}:
            raise ValueError('File not in write mode')
        if self.closed:
            raise ValueError('I/O operation on closed file.')
        if self.forced:
            raise ValueError('This file has been force-flushed, can only close')
        out = self.buffer.write(ensure_writable(data))
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            self.flush()
        return out

    @_tracemethod
    def flush(self, force=False):
        """
        Write buffered data to GCS.

        Uploads the current buffer, if it is larger than the block-size, or if
        the file is being closed.

        Parameters
        ----------
        force : bool
            When closing, write the last block even if it is smaller than
            blocks are allowed to be. Disallows further writing to this file.
        """
        if self.mode not in {'wb', 'ab'}:
            if self.mode == "rb" and not force:
                return

            raise ValueError('Flush on a file not in write mode')
        if self.closed:
            raise ValueError('Flush on closed file')

        if self.buffer.tell() == 0 and not force:
            # no data in the buffer to write
            return
        if self.buffer.tell() < GCS_MIN_BLOCK_SIZE and not force:
            warnings.warn(
                "GCSFile.flush(force=False) with buffer size (%s) below minimum GCS chunk size (2 ** 18), "
                "skipping block upload." % self.buffer.tell()
            )
            return

        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")


        if not self.offset:
            if force and self.buffer.tell() <= self.blocksize:
                # Force-write a buffer below blocksizev with a single write
                self._simple_upload()
            else:
                # At initialize a multipart upload, setting self.location
                self._initiate_upload()

        if self.location is not None:
            # Continue with multipart upload has been initalized
            self._upload_chunk(final=force)

        if force:
            self.forced = True

    @_tracemethod
    def _upload_chunk(self, final=False):
        self.buffer.seek(0)
        data = self.buffer.read()
        head = {}
        l = self.buffer.tell()
        if final:
            if l:
                head['Content-Range'] = 'bytes %i-%i/%i' % (
                    self.offset, self.offset + l - 1, self.offset + l)
            else:
                # closing when buffer is empty
                head['Content-Range'] = 'bytes */%i' % self.offset
                data = None
        else:

            head['Content-Range'] = 'bytes %i-%i/*' % (
                self.offset, self.offset + l - 1)
        head.update({'Content-Type': 'application/octet-stream',
                     'Content-Length': str(l)})
        r = self.gcsfs.session.post(
            self.location, params={'uploadType': 'resumable'},
            headers=head, data=data)
        validate_response(r, self.location)
        if 'Range' in r.headers:
            assert not final, "Response looks like upload is partial"
            shortfall = (self.offset + l - 1) - int(
                    r.headers['Range'].split('-')[1])
            if shortfall:
                if self.consistency == 'md5':
                    self.md5.update(data[:-shortfall])
                self.buffer = io.BytesIO(data[-shortfall:])
                self.buffer.seek(shortfall)
            else:
                if self.consistency == 'md5':
                    self.md5.update(data)
                self.buffer = io.BytesIO()
            self.offset += l - shortfall
        else:
            assert final, "Response looks like upload is over"
            size, md5 = int(r.json()['size']), r.json()['md5Hash']
            if self.consistency == 'size':
                assert size == self.buffer.tell() + self.offset, "Size mismatch"
            if self.consistency == 'md5':
                assert b64encode(
                    self.md5.digest()) == md5.encode(), "MD5 checksum failed"
            self.buffer = io.BytesIO()
            self.offset += l

    @_tracemethod
    def _initiate_upload(self):
        r = self.gcsfs.session.post(
            'https://www.googleapis.com/upload/storage/v1/b/%s/o'
            % quote_plus(self.bucket),
            params={'uploadType': 'resumable'},
            json={'name': self.key, 'metadata': self.metadata})
        self.location = r.headers['Location']

    @_tracemethod
    def _simple_upload(self):
        """One-shot upload, less than 5MB"""
        self.buffer.seek(0)
        data = self.buffer.read()
        path = ('https://www.googleapis.com/upload/storage/v1/b/%s/o'
                % quote_plus(self.bucket))
        r = self.gcsfs.session.post(
            path, params={'uploadType': 'media', 'name': self.key}, data=data)
        validate_response(r, path)
        size, md5 = int(r.json()['size']), r.json()['md5Hash']
        if self.consistency == 'size':
            assert size == self.buffer.tell(), "Size mismatch"
        if self.consistency == 'md5':
            self.md5.update(data)
            assert b64encode(self.md5.digest()) == md5.encode(), "MD5 checksum failed"

    @_tracemethod
    def _fetch(self, start, end):
        # force read to 5MB boundaries
        start = start // (READ_BLOCK_SIZE) * READ_BLOCK_SIZE
        end = (end // (READ_BLOCK_SIZE) + 1) * READ_BLOCK_SIZE
        if self.start is None and self.end is None:
            # First read
            self.start = start
            self.end = end + self.blocksize
            self.cache = _fetch_range(self.details, self.gcsfs.session, start,
                                      self.end)
        if start < self.start:
            if self.end - end > self.blocksize:
                self.start = start
                self.end = end + self.blocksize
                self.cache = _fetch_range(self.details, self.gcsfs.session,
                                          self.start, self.end)
            else:
                new = _fetch_range(self.details, self.gcsfs.session, start,
                                   self.start)
                self.start = start
                self.cache = new + self.cache
        if end > self.end:
            if self.end > self.size:
                return
            if end - self.end > self.blocksize:
                self.start = start
                self.end = end + self.blocksize
                self.cache = _fetch_range(self.details, self.gcsfs.session,
                                          self.start, self.end)
            else:
                new = _fetch_range(self.details, self.gcsfs.session, self.end,
                                   end + self.blocksize)
                self.end = end + self.blocksize
                self.cache = self.cache + new

    def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length : int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        if self.mode != 'rb':
            raise ValueError('File not in read mode')
        if length < 0:
            length = self.size
        if self.closed:
            raise ValueError('I/O operation on closed file.')
        self._fetch(self.loc, self.loc + length)
        out = self.cache[self.loc - self.start:
                         self.loc - self.start + length]
        self.loc += len(out)
        if self.trim:
            num = (self.loc - self.start) // self.blocksize - 1
            if num > 0:
                self.start += self.blocksize * num
                self.cache = self.cache[self.blocksize * num:]
        return out

    @_tracemethod
    def close(self):
        """ Close file """
        if self.closed:
            return
        if self.mode == 'rb':
            self.cache = None
        else:
            self.flush(force=True)
            self.gcsfs.invalidate_cache(
                posixpath.dirname("/".join([self.bucket, self.key])))
        self.closed = True

    def readable(self):
        """Return whether the GCSFile was opened for reading"""
        return self.mode == 'rb'

    def seekable(self):
        """Return whether the GCSFile is seekable (only in read mode)"""
        return self.readable()

    def writable(self):
        """Return whether the GCSFile was opened for writing"""
        return self.mode in {'wb', 'ab'}

    @_tracemethod
    def __del__(self):
        self.close()

    def __str__(self):
        return "<GCSFile %s/%s>" % (self.bucket, self.key)

    __repr__ = __str__

    @_tracemethod
    def __enter__(self):
        return self

    @_tracemethod
    def __exit__(self, *args):
        self.close()


def _fetch_range(obj_dict, session, start=None, end=None):
    """ Get data from GCS

    obj_dict : an entry from ls() or info()
    session: requests.Session instance
    start, end : None or integers
        if not both None, fetch only given range
    """
    if DEBUG:
        print('Fetch: ', start, end)
    logger.debug("Fetch: {}, {}-{}", obj_dict['name'], start, end)
    if start is not None or end is not None:
        start = start or 0
        end = end or 0
        head = {'Range': 'bytes=%i-%i' % (start, end - 1)}
    else:
        head = None
    back = session.get(obj_dict['mediaLink'], headers=head)
    data = back.content
    if data == b'Request range not satisfiable':
        return b''
    return data


def put_object(credentials, bucket, name, data, session):
    """ Simple put, up to 5MB of data

    credentials : from auth()
    bucket : string
    name : object name
    data : binary
    session: requests.Session instance
    """
    out = session.post('https://www.googleapis.com/upload/storage/'
                       'v1/b/%s/o?uploadType=media&name=%s' % (
                           quote_plus(bucket), quote_plus(name)),
                       headers={'Authorization': 'Bearer ' +
                                                 credentials.access_token,
                                'Content-Type': 'application/octet-stream',
                                'Content-Length': len(data)}, data=data)
    assert out.status_code == 200


def ensure_writable(b):
    if PY2 and isinstance(b, array.array):
        return b.tostring()
    return b