Skip to content

Commit

Permalink
[SPARK-5722] [SQL] [PySpark] infer int as LongType in Python (for 1.2…
Browse files Browse the repository at this point in the history
… branch)

This PR change to use LongType for int in Python, when inferSchema(), because IntegerType in SQL is not enough for int in Python (which is 64-bit on 64-bit machines).

Closes #4521

cc dondrake marmbrus

Author: Davies Liu <davies@databricks.com>

Closes #4681 from davies/long2 and squashes the following commits:

05ef1c8 [Davies Liu] infer LongType for int in Python
  • Loading branch information
Davies Liu authored and marmbrus committed Feb 24, 2015
1 parent 5cea859 commit 71173de
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 5 deletions.
8 changes: 4 additions & 4 deletions python/pyspark/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def _parse_datatype_json_value(json_value):
_type_mappings = {
type(None): NullType,
bool: BooleanType,
int: IntegerType,
int: LongType,
long: LongType,
float: DoubleType,
str: StringType,
Expand Down Expand Up @@ -926,11 +926,11 @@ def _infer_schema_type(obj, dataType):
>>> schema = _parse_schema_abstract("a b c d")
>>> row = (1, 1.0, "str", datetime.date(2014, 10, 10))
>>> _infer_schema_type(row, schema)
StructType...IntegerType...DoubleType...StringType...DateType...
StructType...LongType...DoubleType...StringType...DateType...
>>> row = [[1], {"key": (1, 2.0)}]
>>> schema = _parse_schema_abstract("a[] b{c d}")
>>> _infer_schema_type(row, schema)
StructType...a,ArrayType...b,MapType(StringType,...c,IntegerType...
StructType...a,ArrayType...b,MapType(StringType,...c,LongType...
"""
if dataType is None:
return _infer_type(obj)
Expand Down Expand Up @@ -985,7 +985,7 @@ def _verify_type(obj, dataType):
>>> _verify_type(None, StructType([]))
>>> _verify_type("", StringType())
>>> _verify_type(0, IntegerType())
>>> _verify_type(0, LongType())
>>> _verify_type(range(3), ArrayType(ShortType()))
>>> _verify_type(set(), ArrayType(StringType())) # doctest: +IGNORE_EXCEPTION_DETAIL
Traceback (most recent call last):
Expand Down
23 changes: 22 additions & 1 deletion python/pyspark/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
CloudPickleSerializer, CompressedSerializer
from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, ExternalSorter
from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \
UserDefinedType, DoubleType
UserDefinedType, DoubleType, LongType, _infer_type
from pyspark import shuffle

_have_scipy = False
Expand Down Expand Up @@ -985,6 +985,27 @@ def test_parquet_with_udt(self):
point = srdd1.first().point
self.assertEquals(point, ExamplePoint(1.0, 2.0))

def test_infer_long_type(self):
longrow = [Row(f1='a', f2=100000000000000)]
rdd = self.sc.parallelize(longrow)
srdd = self.sqlCtx.inferSchema(rdd)
self.assertEqual(srdd.schema().fields[1].dataType, LongType())

# this saving as Parquet caused issues as well.
output_dir = os.path.join(self.tempdir.name, "infer_long_type")
srdd.saveAsParquetFile(output_dir)
df1 = self.sqlCtx.parquetFile(output_dir)
self.assertEquals('a', df1.first().f1)
self.assertEquals(100000000000000, df1.first().f2)

self.assertEqual(_infer_type(1), LongType())
self.assertEqual(_infer_type(2**10), LongType())
self.assertEqual(_infer_type(2**20), LongType())
self.assertEqual(_infer_type(2**31 - 1), LongType())
self.assertEqual(_infer_type(2**31), LongType())
self.assertEqual(_infer_type(2**61), LongType())
self.assertEqual(_infer_type(2**71), LongType())


class InputFormatTests(ReusedPySparkTestCase):

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
case ByteType => true
case ShortType => true
case FloatType => true
case LongType => true
case DateType => true
case TimestampType => true
case ArrayType(_, _) => true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ object EvaluatePython {
case (c: Int, ShortType) => c.toShort
case (c: Long, ShortType) => c.toShort
case (c: Long, IntegerType) => c.toInt
case (c: Int, LongType) => c.toLong
case (c: Double, FloatType) => c.toFloat
case (c, StringType) if !c.isInstanceOf[String] => c.toString

Expand Down

0 comments on commit 71173de

Please sign in to comment.