Release v0.11.3.beta1 (#195)

aliyun · Jan 17, 2023 · e612ec8 · e612ec8
1 parent 1acf0d2
commit e612ec8
Show file tree

Hide file tree

Showing 202 changed files with 9,839 additions and 3,630 deletions.
diff --git a/README.md b/README.md
@@ -61,7 +61,7 @@ $ python setup.py install
 >>> dual = o.get_table('dual')
 >>> dual.name
 'dual'
->>> dual.schema
+>>> dual.table_schema
 odps.Schema {
   c_int_a                 bigint
   c_int_b                 bigint
@@ -80,7 +80,7 @@ datetime.datetime(2014, 6, 6, 13, 28, 24)
 False
 >>> dual.size
 448
->>> dual.schema.columns
+>>> dual.table_schema.columns
 [<column c_int_a, type bigint>,
  <column c_int_b, type bigint>,
  <column c_double_a, type double>,

diff --git a/README.rst b/README.rst
@@ -64,7 +64,7 @@ Usage
    >>> dual = o.get_table('dual')
    >>> dual.name
    'dual'
-   >>> dual.schema
+   >>> dual.table_schema
    odps.Schema {
      c_int_a                 bigint
      c_int_b                 bigint
@@ -83,7 +83,7 @@ Usage
    False
    >>> dual.size
    448
-   >>> dual.schema.columns
+   >>> dual.table_schema.columns
    [<column c_int_a, type bigint>,
     <column c_int_b, type bigint>,
     <column c_double_a, type double>,
@@ -139,7 +139,7 @@ Command-line and IPython enhancement
 
    In [3]: %sql select * from pyodps_iris limit 5
    |==========================================|   1 /  1  (100.00%)         2s
-   Out[3]:
+   Out[3]: 
       sepallength  sepalwidth  petallength  petalwidth         name
    0          5.1         3.5          1.4         0.2  Iris-setosa
    1          4.9         3.0          1.4         0.2  Iris-setosa

diff --git a/benchmarks/perf_tabletunnel.py b/benchmarks/perf_tabletunnel.py
@@ -25,7 +25,7 @@
 
 from odps.compat import unittest, Decimal
 from odps.tests.core import TestBase
-from odps.models import Schema
+from odps.models import TableSchema
 from datetime import datetime
 
 # remember to reset False before committing
@@ -46,7 +46,7 @@ def setUp(self):
             self.pr.enable()
         fields = ['a', 'b', 'c', 'd', 'e', 'f']
         types = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal']
-        self.SCHEMA = Schema.from_lists(fields, types)
+        self.SCHEMA = TableSchema.from_lists(fields, types)
 
     def tearDown(self):
         if ENABLE_PROFILE:
@@ -63,7 +63,7 @@ def tearDown(self):
 
     def testWrite(self):
         table_name = 'pyodps_test_tunnel_write_performance'
-        self.odps.create_table(table_name, schema=self.SCHEMA, if_not_exists=True)
+        self.odps.create_table(table_name, self.SCHEMA, if_not_exists=True)
         ss = self.tunnel.create_upload_session(table_name)
         r = ss.new_record()
 
@@ -85,7 +85,7 @@ def testWrite(self):
     def testRead(self):
         table_name = 'pyodps_test_tunnel_read_performance'
         self.odps.delete_table(table_name, if_exists=True)
-        t = self.odps.create_table(table_name, schema=self.SCHEMA)
+        t = self.odps.create_table(table_name, self.SCHEMA)
 
         def gen_data():
             for i in range(self.DATA_AMOUNT):
@@ -118,7 +118,7 @@ def gen_data():
 
     def testBufferedWrite(self):
         table_name = 'test_tunnel_bufferred_write'
-        self.odps.create_table(table_name, schema=self.SCHEMA, if_not_exists=True)
+        self.odps.create_table(table_name, self.SCHEMA, if_not_exists=True)
         ss = self.tunnel.create_upload_session(table_name)
         r = ss.new_record()
 

diff --git a/benchmarks/perf_types.py b/benchmarks/perf_types.py
@@ -21,7 +21,7 @@
 
 from odps.compat import unittest
 from odps.tests.core import TestBase
-from odps.models import Schema, Record
+from odps.models import TableSchema, Record
 
 
 class Test(TestBase):
@@ -36,7 +36,7 @@ def setUp(self):
         self.pr.enable()
         fields = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal']
         types = ['bigint', 'double', 'datetime', 'boolean', 'string', 'decimal']
-        self.SCHEMA = Schema.from_lists(fields, types)
+        self.SCHEMA = TableSchema.from_lists(fields, types)
 
     def tearDown(self):
         p = Stats(self.pr)

diff --git a/cupid/io/table/core.py b/cupid/io/table/core.py
@@ -20,7 +20,7 @@
 from types import GeneratorType
 
 from odps import options
-from odps.models import Table, Schema
+from odps.models import Table, TableSchema
 from odps.models.partition import Partition as TablePartition
 
 from cupid.rpc import CupidRpcController, CupidTaskServiceRpcChannel, SandboxRpcChannel
@@ -134,7 +134,7 @@ def _register_reader(self):
         schema_types = [d['type'] for d in schema_json]
         pt_schema_names = [d['name'] for d in partition_schema_json]
         pt_schema_types = [d['type'] for d in partition_schema_json]
-        schema = Schema.from_lists(schema_names, schema_types, pt_schema_names, pt_schema_types)
+        schema = TableSchema.from_lists(schema_names, schema_types, pt_schema_names, pt_schema_types)
 
         return resp.readIterator, schema
 
@@ -389,15 +389,15 @@ def create_download_session(session, table_or_parts, split_size=None, split_coun
     for t in table_or_parts:
         if isinstance(t, Table):
             if not columns:
-                columns = t.schema.names
+                columns = t.table_schema.names
             table_kw = dict(
                 projectName=t.project.name,
                 tableName=t.name,
                 columns=','.join(columns),
             )
         elif isinstance(t, TablePartition):
             if not columns:
-                columns = t.table.schema.names
+                columns = t.table.table_schema.names
             table_kw = dict(
                 projectName=t.table.project.name,
                 tableName=t.table.name,

diff --git a/cupid/runtime/runtime_test.py b/cupid/runtime/runtime_test.py
@@ -17,7 +17,7 @@
 import time
 import unittest
 
-from odps.models import Schema, Record
+from odps.models import TableSchema, Record
 
 CLIENT_READ_PIPE_NUM = 2
 CLIENT_WRITE_PIPE_NUM = 2
@@ -60,7 +60,7 @@ def testInterfaces(self):
         del ro
 
     def testTableIO(self):
-        schema = Schema.from_lists(['key', 'value', 'double', 'datetime', 'boolean'],
+        schema = TableSchema.from_lists(['key', 'value', 'double', 'datetime', 'boolean'],
                                    ['bigint', 'string', 'double', 'datetime', 'boolean'])
         label = self.client.sync_call('test', 'write_label')
         print('Write label: ' + label)

diff --git a/docs/source/base-functions.rst b/docs/source/base-functions.rst
@@ -3,7 +3,7 @@
 函数
 ========
 
-ODPS用户可以编写自定义 `函数 <https://docs.aliyun.com/#/pub/odps/basic/definition&function>`_ 用在ODPS SQL中。
+ODPS用户可以编写自定义 `函数 <https://help.aliyun.com/document_detail/27823.html>`_ 用在ODPS SQL中。
 
 基本操作
 ---------

diff --git a/docs/source/base-projects.rst b/docs/source/base-projects.rst
@@ -3,7 +3,7 @@
 项目空间
 =========
 
-`项目空间 <https://docs.aliyun.com/#/pub/odps/basic/definition&project>`_ 是ODPS的基本组织单元，
+`项目空间 <https://help.aliyun.com/document_detail/27818.html>`_ 是ODPS的基本组织单元，
 有点类似于Database的概念。
 
 我们通过 ODPS 入口对象的 ``get_project`` 来取到某个项目空间。

diff --git a/docs/source/base-resources.rst b/docs/source/base-resources.rst
@@ -3,7 +3,7 @@
 资源
 =======
 
-`资源 <https://docs.aliyun.com/#/pub/odps/basic/definition&resource>`_ 在ODPS上常用在UDF和MapReduce中。
+`资源 <https://help.aliyun.com/document_detail/27822.html>`_ 在ODPS上常用在UDF和MapReduce中。
 
 列出所有资源还是可以使用 ``list_resources``，判断资源是否存在使用 ``exist_resource``。
 删除资源时，可以调用 ``delete_resource``，或者直接对于Resource对象调用 ``drop`` 方法。
@@ -22,22 +22,27 @@
 
 .. code-block:: python
 
-   resource = o.create_resource('test_file_resource', 'file', file_obj=open('/to/path/file'))  # 使用file-like的对象
-   resource = o.create_resource('test_py_resource', 'py', file_obj='import this')  # 使用字符串
+   resource = o.create_resource('test_file_resource', 'file', fileobj=open('/to/path/file'))  # 使用file-like的对象
+   resource = o.create_resource('test_py_resource', 'py', fileobj='import this')  # 使用字符串
 
 
 可以通过 ``temp=True`` 创建一个临时资源。
 
 .. code-block:: python
 
-   resource = o.create_resource('test_file_resource', 'file', file_obj=open('/to/path/file'), temp=True)
+   resource = o.create_resource('test_file_resource', 'file', fileobj=open('/to/path/file'), temp=True)
 
+.. note::
 
-读取和修改文件资源
-~~~~~~~~~~~~~~~~~~
+    在 fileobj 参数中传入字符串，创建的资源内容为 **字符串本身** 而非字符串代表的路径指向的文件。
+
+    如果文件过大（例如大小超过 64MB），PyODPS 可能会使用分块上传模式，而这不被旧版 MaxCompute 部署所支持。
+    如需在旧版 MaxCompute 中上传大文件，请配置 ``options.upload_resource_in_chunks = False`` 。
 
-对文件资源调用 ``open`` 方法，或者在odps入口调用 ``open_resource`` 都能打开一个资源，
-打开后的对象会是file-like的对象。
+读取和修改文件资源
+~~~~~~~~~~~~~~
+对文件资源调用 ``open`` 方法，或者在 MaxCompute 入口调用 ``open_resource`` 都能打开一个资源，
+打开后的对象会是 file-like 的对象。
 类似于Python内置的 ``open`` 方法，文件资源也支持打开的模式。我们看例子：
 
 .. code-block:: python
@@ -69,6 +74,23 @@
 同时，PyODPS中，文件资源支持以二进制模式打开，打开如说一些压缩文件等等就需要以这种模式，
 因此 ``rb`` 就是指以二进制读模式打开文件，``r+b`` 是指以二进制读写模式打开。
 
+对于较大的文件资源，可以使用流式方式读写文件，使用方法为在调用 ``open_resource`` 时增加一个
+``stream=True`` 选项：
+
+.. code-block:: python
+
+   >>> with o.open_resource('test_file_resource', mode='w') as fp:  # 写模式打开
+   >>>     fp.writelines(['Hello\n', 'World\n'])  # 写入多行
+   >>>     fp.write('Hello World')
+   >>>     fp.flush()  # 手动调用会将更新提交到 MaxCompute
+   >>>
+   >>> with resource.open('r', stream=True) as fp:  # 以读模式打开
+   >>>     content = fp.read()  # 读取全部的内容
+   >>>     line = fp.readline()  # 回到资源开头
+   >>>     lines = fp.readlines()  # 读成多行
+
+当 ``stream=True`` 时，只支持 ``r`` ， ``rb`` ， ``w`` ， ``wb`` 四种模式。
+
 表资源
 -------
 

diff --git a/docs/source/base-schemas.rst b/docs/source/base-schemas.rst
@@ -0,0 +1,53 @@
+.. _schema:
+
+Schema
+=======
+
+.. note::
+
+    Schema 属于 MaxCompute 的公测功能，需要通过 `新功能测试申请 <https://help.aliyun.com/document_detail/128366.htm>`_ 开通。
+
+`Schema <https://help.aliyun.com/document_detail/437084.html>`_ 是 MaxCompute
+介于项目和表 / 资源 / 函数之间的概念，对表 / 资源 / 函数进行进一步归类。
+
+你可以使用 ``create_schema`` 创建一个 Schema 对象：
+
+.. code-block:: python
+
+    schema = o.create_schema("test_schema")
+    print(schema)
+
+使用 ``delete_schema`` 删除一个 Schema 对象：
+
+.. code-block:: python
+
+    schema = o.delete_schema("test_schema")
+
+使用 ``list_schema`` 列举所有 Schema 对象：
+
+.. code-block:: python
+
+    for schema in o.list_schema():
+        print(schema)
+
+在开启 Schema 后，MaxCompute 入口对象默认操作的 MaxCompute 对象都位于名为 ``DEFAULT``
+的 Schema 下。为操作其他 Schema 下的对象，需要在创建入口对象时指定 Schema，例如：
+
+.. code-block:: python
+
+    o = ODPS('**your-access-id**', '**your-secret-access-key**', '**your-default-project**',
+             endpoint='**your-end-point**', schema='**your-schema-name**')
+
+也可以为不同对象的操作方法指定 ``schema`` 参数。例如，下面的方法列举了 ``test_schema``
+下所有的表：
+
+.. code-block:: python
+
+    for table in o.list_tables(schema='test_schema'):
+        print(table)
+
+在执行 SQL 时，可以指定默认 Schema：
+
+.. code-block:: python
+
+    o.execute_sql("SELECT * FROM dual", default_schema="test_schema")
diff --git a/docs/source/base-sql.rst b/docs/source/base-sql.rst
@@ -125,7 +125,7 @@ PyODPS 默认不限制能够从 Instance 读取的数据规模。对于受保护
 
 .. code-block:: python
 
-    from odps.models import Schema
+    from odps.models import TableSchema
 
     myfunc = '''\
     from odps.udf import annotate
@@ -147,7 +147,7 @@ PyODPS 默认不限制能够从 Instance 读取的数据规模。对于受保护
 
     table = o.create_table(
         'test_table',
-        schema=Schema.from_lists(['size'], ['bigint']),
+        TableSchema.from_lists(['size'], ['bigint']),
         if_not_exists=True
     )
 

diff --git a/docs/source/base-sqlalchemy.rst b/docs/source/base-sqlalchemy.rst
@@ -40,6 +40,24 @@ PyODPS 支持集成 SQLAlchemy，可以使用 SQLAlchemy 查询 MaxCompute 数
 
    conn = engine.connect()
 
+如果需要为 SQL 作业配置执行选项，可以使用 PyODPS 提供的 ``options`` 对象：
+
+.. code-block:: python
+
+    from odps import options
+    from sqlalchemy import create_engine
+
+    options.sql.settings = {'odps.sql.hive.compatible': 'true'}
+    engine = create_engine('odps://<access_id>:<access_key>@<project>/?endpoint=<endpoint>')
+
+也可以直接配置在连接字符串中：
+
+.. code-block:: python
+
+    from sqlalchemy import create_engine
+    engine = create_engine('odps://<access_id>:<access_key>@<project>/?endpoint=<endpoint>&odps.sql.hive.compatible=true')
+
+使用上述方式时，每个 engine 对象都会拥有不同的选项。
 
 调用 SQLAlchemy 接口
 ----------------------