ARROW-7468: [Python] fix typos #6096

Closed
wants to merge 2 commits into from
Changes from 1 commit
2 changes: 1 addition & 1 deletion python/manylinux1/README.md
@@ -24,7 +24,7 @@ This folder provides base Docker images and an infrastructure to build
Linux distributions published in last four years.

The process is split up in two parts: There are base Docker images that build
the native, Python-indenpendent dependencies. For these you can select if you
the native, Python-independent dependencies. For these you can select if you
want to also build the dependencies used for the Parquet support. Depending on
these images, there is also a bash script that will build the pyarrow wheels
for all supported Python versions and place them in the `dist` folder.
2 changes: 1 addition & 1 deletion python/manylinux1/build_arrow.sh
@@ -25,7 +25,7 @@
# $ docker-compose build python-manylinux1
# or pull:
# $ docker-compose pull python-manylinux1
# an then run:
# and then run:
# $ docker-compose run -e PYTHON_VERSION=3.7 python-manylinux1

source /multibuild/manylinux_utils.sh
4 changes: 2 additions & 2 deletions python/pyarrow/_cuda.pyx
@@ -248,7 +248,7 @@ cdef class Context:
return pyarrow_wrap_cudabuffer(cudabuf)

def buffer_from_data(self, object data, int64_t offset=0, int64_t size=-1):
"""Create device buffer and initalize with data.
"""Create device buffer and initialize with data.

Parameters
----------
@@ -293,7 +293,7 @@ cdef class Context:
device accessible memory.

When the object contains a non-contiguous view of device
accessbile memory then the returned device buffer will contain
accessible memory then the returned device buffer will contain
contiguous view of the memory, that is, including the
intermediate data that is otherwise invisible to the input
object.
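For context on the `buffer_from_data` docstring fixed above, a minimal usage sketch, assuming a CUDA-enabled pyarrow build and an available GPU; the array contents are illustrative:

```python
# Hedged sketch: copy host data into a freshly allocated device buffer.
import numpy as np
import pyarrow.cuda as cuda

ctx = cuda.Context(0)                      # CUDA device 0 (assumed present)
host_data = np.arange(10, dtype=np.int64)
dbuf = ctx.buffer_from_data(host_data)     # device buffer initialized with the data
print(dbuf.size)                           # size of the device buffer in bytes
```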
4 changes: 2 additions & 2 deletions python/pyarrow/_dataset.pyx
@@ -519,7 +519,7 @@ cdef class Dataset:

A schema must be passed because most of the data sources' schema is
unknown before executing possibly expensive scanning operation, but
projecting, filtering, predicate pushduwn requires a well defined
projecting, filtering, predicate pushdown requires a well defined
schema to work on.

Parameters
@@ -991,7 +991,7 @@ cdef class CastExpression(UnaryExpression):

def __init__(self, Expression operand not None, DataType to not None,
bint safe=True):
# TODO(kszucs): safe is consitently used across pyarrow, but on long
# TODO(kszucs): safe is consistently used across pyarrow, but on long
# term we should expose the CastOptions object
cdef:
CastOptions options
2 changes: 1 addition & 1 deletion python/pyarrow/_flight.pyx
@@ -988,7 +988,7 @@ cdef class FlightClient:
@classmethod
def connect(cls, location, tls_root_certs=None, override_hostname=None):
warnings.warn("The 'FlightClient.connect' method is deprecated, use "
"FlightClient contructor or pyarrow.flight.connect "
"FlightClient constructor or pyarrow.flight.connect "
"function instead")
return FlightClient(location, tls_root_certs=tls_root_certs,
override_hostname=override_hostname)
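The deprecation message above points at two alternatives; a small sketch of both, assuming a Flight server at a hypothetical `grpc://localhost:8815` endpoint:

```python
# Hedged sketch of the non-deprecated ways to obtain a FlightClient.
import pyarrow.flight as flight

location = "grpc://localhost:8815"       # hypothetical server location
client = flight.FlightClient(location)   # constructor form
client = flight.connect(location)        # module-level helper, equivalent here
```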
4 changes: 2 additions & 2 deletions python/pyarrow/includes/libarrow_dataset.pxd
@@ -334,14 +334,14 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
CDataSourceDiscovery):
@staticmethod
CResult[shared_ptr[CDataSourceDiscovery]] MakeFromPaths "Make"(
shared_ptr[CFileSystem] filesytem,
shared_ptr[CFileSystem] filesystem,
vector[c_string] paths,
shared_ptr[CFileFormat] format,
CFileSystemDiscoveryOptions options
)
@staticmethod
CResult[shared_ptr[CDataSourceDiscovery]] MakeFromSelector "Make"(
shared_ptr[CFileSystem] filesytem,
shared_ptr[CFileSystem] filesystem,
CFileSelector,
shared_ptr[CFileFormat] format,
CFileSystemDiscoveryOptions options
2 changes: 1 addition & 1 deletion python/pyarrow/io.pxi
@@ -1353,7 +1353,7 @@ def foreign_buffer(address, size, base=None):
optionally backed by the Python *base* object.

The *base* object, if given, will be kept alive as long as this buffer
is alive, including accross language boundaries (for example if the
is alive, including across language boundaries (for example if the
buffer is referenced by C++ code).
"""
cdef:
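To illustrate the lifetime guarantee described in that docstring, a minimal sketch that wraps memory owned by a NumPy array (the array and its size are hypothetical):

```python
# Hedged sketch: the *base* object is kept alive by the resulting buffer.
import numpy as np
import pyarrow as pa

arr = np.arange(8, dtype=np.uint8)
buf = pa.foreign_buffer(arr.ctypes.data, arr.nbytes, base=arr)
# The buffer holds a reference to `arr`, so the underlying memory stays
# valid for as long as `buf` is alive:
print(buf.to_pybytes())
```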
4 changes: 2 additions & 2 deletions python/pyarrow/parquet.py
@@ -356,7 +356,7 @@ def _sanitize_table(table, new_schema, flavor):
Cast timestamps a particular resolution.
Valid values: {None, 'ms', 'us'}
data_page_size : int, default None
Set a target threshhold for the approximate encoded size of data
Set a target threshold for the approximate encoded size of data
pages within a column chunk. If None, use the default data page
size of 1MByte.
allow_truncated_timestamps : boolean, default False
@@ -994,7 +994,7 @@ class ParquetDataset(object):
kinds of filters that are possible using boolean logic.

This function also supports passing in as List[Tuple]. These predicates
are evaluated as a conjunction. To express OR in predictates, one must
are evaluated as a conjunction. To express OR in predicates, one must
use the (preferred) List[List[Tuple]] notation.
metadata_nthreads: int, default 1
How many threads to allow the thread pool which is used to read the
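The two filter notations mentioned in that docstring look roughly like this; a sketch assuming a hypothetical partitioned dataset under `data/` with `year` and `country` partition columns:

```python
# Hedged sketch of AND (List[Tuple]) versus OR (List[List[Tuple]]) filters.
import pyarrow.parquet as pq

# List[Tuple]: all predicates are combined with AND.
ds_and = pq.ParquetDataset(
    "data/",
    filters=[("year", "=", 2019), ("country", "=", "DE")],
)

# List[List[Tuple]]: inner lists are AND-ed, the outer list is OR-ed.
ds_or = pq.ParquetDataset(
    "data/",
    filters=[[("year", "=", 2019)], [("year", "=", 2020)]],
)
```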
2 changes: 1 addition & 1 deletion python/pyarrow/table.pxi
@@ -1407,7 +1407,7 @@ cdef class Table(_PandasConvertible):

def field(self, i):
"""
Select a schema field by its colunm name or numeric index.
Select a schema field by its column name or numeric index.

Parameters
----------
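A minimal sketch of the behaviour that docstring describes, using hypothetical column data:

```python
# Hedged sketch: Table.field accepts a column name or a numeric index.
import pyarrow as pa

table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
assert table.field("a") == table.field(0)   # both resolve to the Field named "a"
```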
4 changes: 2 additions & 2 deletions python/pyarrow/tests/pandas_examples.py
@@ -26,7 +26,7 @@

def dataframe_with_arrays(include_index=False):
"""
Dataframe with numpy arrays columns of every possible primtive type.
Dataframe with numpy arrays columns of every possible primitive type.

Returns
-------
@@ -83,7 +83,7 @@ def dataframe_with_arrays(include_index=False):

def dataframe_with_lists(include_index=False, parquet_compatible=False):
"""
Dataframe with list columns of every possible primtive type.
Dataframe with list columns of every possible primitive type.

Returns
-------
2 changes: 1 addition & 1 deletion python/pyarrow/tests/strategies.py
@@ -243,7 +243,7 @@ def record_batches(draw, type, rows=None, max_fields=None):

schema = draw(schemas(type, max_fields=max_fields))
children = [draw(arrays(field.type, size=rows)) for field in schema]
# TODO(kszucs): the names and schame arguments are not consistent with
# TODO(kszucs): the names and schema arguments are not consistent with
# Table.from_array's arguments
return pa.RecordBatch.from_arrays(children, names=schema)

2 changes: 1 addition & 1 deletion python/pyarrow/tests/test_cuda.py
@@ -278,7 +278,7 @@ def test_foreign_buffer():
del fbuf
assert sys.getrefcount(hbuf) == rc

# test postponed dealloction of host buffer memory
# test postponed deallocation of host buffer memory
fbuf = ctx.foreign_buffer(hbuf.address, hbuf.size, hbuf)
del hbuf
fbuf.copy_to_host()
6 changes: 3 additions & 3 deletions python/pyarrow/tests/test_pandas.py
@@ -87,7 +87,7 @@ def _check_pandas_roundtrip(df, expected=None, use_threads=True,
result = table.to_pandas(use_threads=use_threads)

if expected_schema:
# all occurences of _check_pandas_roundtrip passes expected_schema
# all occurrences of _check_pandas_roundtrip passes expected_schema
# without the pandas generated key-value metadata
assert table.schema.equals(expected_schema, check_metadata=False)

@@ -425,7 +425,7 @@ def test_binary_column_name(self):
data = {key: column_data}
df = pd.DataFrame(data)

# we can't use _check_pandas_roundtrip here because our metdata
# we can't use _check_pandas_roundtrip here because our metadata
# is always decoded as utf8: even if binary goes in, utf8 comes out
t = pa.Table.from_pandas(df, preserve_index=True)
df2 = t.to_pandas()
@@ -3337,7 +3337,7 @@ def __reduce__(self):


def PandasArray__arrow_array__(self, type=None):
# harcode dummy return regardless of self - we only want to check that
# hardcode dummy return regardless of self - we only want to check that
# this method is correctly called
storage = pa.array([1, 2, 3], type=pa.int64())
return pa.ExtensionArray.from_storage(DummyExtensionType(), storage)
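The dummy `__arrow_array__` above exercises the conversion protocol; a short, self-contained sketch of how that protocol is normally used (the container class and data are hypothetical):

```python
# Hedged sketch: an object defining __arrow_array__ converts itself when
# handed to pa.array().
import pyarrow as pa

class MyColumn:
    def __init__(self, values):
        self.values = values

    def __arrow_array__(self, type=None):
        # pyarrow calls this during conversion; honor a requested type if given.
        return pa.array(self.values, type=type)

arr = pa.array(MyColumn([1, 2, 3]))
print(arr.type)   # int64
```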
16 changes: 8 additions & 8 deletions python/pyarrow/tests/test_parquet.py
@@ -1596,7 +1596,7 @@ def test_equivalency(tempdir):

# Check that all rows in the DF fulfill the filter
# Pandas 0.23.x has problems with indexing constant memoryviews in
# categoricals. Thus we need to make an explicity copy here with np.array.
# categoricals. Thus we need to make an explicit copy here with np.array.
df_filter_1 = (np.array(result_df['integer']) == 1) \
& (np.array(result_df['string']) != 'b') \
& (np.array(result_df['boolean']) == 'True')
@@ -2803,24 +2803,24 @@ def _make_dataset_for_pickling(tempdir, N=100):
pytest.param(pytest.importorskip('cloudpickle'), id='cloudpickle')
])
def test_pickle_dataset(tempdir, datadir, pickler):
def is_pickleable(obj):
def is_picklable(obj):
Member: I think that "pickle"-able is valid.

Member (Author): I see. I am neutral on this. I just followed this API. I have just reverted these changes.

Member: OK. I'll merge this pull request without these changes for now. If some Python experts say "picklable" is better, we will use "picklable".

Member: The Python documentation uses "picklable": https://docs.python.org/3/library/pickle.html

Member: Oh... Sorry for my wrong comment...

return obj == pickler.loads(pickler.dumps(obj))

dataset = _make_dataset_for_pickling(tempdir)

assert is_pickleable(dataset)
assert is_pickleable(dataset.metadata)
assert is_pickleable(dataset.metadata.schema)
assert is_picklable(dataset)
assert is_picklable(dataset.metadata)
assert is_picklable(dataset.metadata.schema)
assert len(dataset.metadata.schema)
for column in dataset.metadata.schema:
assert is_pickleable(column)
assert is_picklable(column)

for piece in dataset.pieces:
assert is_pickleable(piece)
assert is_picklable(piece)
metadata = piece.get_metadata()
assert metadata.num_row_groups
for i in range(metadata.num_row_groups):
assert is_pickleable(metadata.row_group(i))
assert is_picklable(metadata.row_group(i))


@pytest.mark.pandas
6 changes: 3 additions & 3 deletions python/pyarrow/types.py
@@ -140,21 +140,21 @@ def is_floating(t):

def is_float16(t):
"""
Return True if value is an instance of an float16 (half-precision) type
Return True if value is an instance of a float16 (half-precision) type
"""
return t.id == lib.Type_HALF_FLOAT


def is_float32(t):
"""
Return True if value is an instance of an float32 (single precision) type
Return True if value is an instance of a float32 (single precision) type
"""
return t.id == lib.Type_FLOAT


def is_float64(t):
"""
Return True if value is an instance of an float64 (double precision) type
Return True if value is an instance of a float64 (double precision) type
"""
return t.id == lib.Type_DOUBLE

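A tiny sketch of the `pyarrow.types` predicates whose docstrings are fixed above:

```python
# Hedged sketch: each predicate checks the type id, as the functions above show.
import pyarrow as pa
import pyarrow.types as types

assert types.is_float16(pa.float16())
assert types.is_float32(pa.float32())
assert types.is_float64(pa.float64())
assert not types.is_float64(pa.int64())
```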