From d9500968f438c82111c3741f10c49b60ae160c12 Mon Sep 17 00:00:00 2001 From: Troels Nielsen Date: Sat, 15 Sep 2018 14:12:55 +0200 Subject: [PATCH] BUG: Make sure that sas7bdat parsers memory is initialized to 0 (#21616) (#22651) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/io/sas/sas7bdat.py | 2 +- pandas/tests/io/sas/data/cars.sas7bdat | Bin 0 -> 13312 bytes pandas/tests/io/sas/test_sas7bdat.py | 16 ++++++++++++++++ 4 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/io/sas/data/cars.sas7bdat diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index f2ec08c61a6d8..e2e7f9e1e2324 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -739,7 +739,7 @@ I/O - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- +- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. 
(:issue:`21616`) Plotting ^^^^^^^^ diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index b2d930c1be5e7..efeb306b618d1 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -614,7 +614,7 @@ def read(self, nrows=None): ns = (self.column_types == b's').sum() self._string_chunk = np.empty((ns, nrows), dtype=np.object) - self._byte_chunk = np.empty((nd, 8 * nrows), dtype=np.uint8) + self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) self._current_row_in_chunk_index = 0 p = Parser(self) diff --git a/pandas/tests/io/sas/data/cars.sas7bdat b/pandas/tests/io/sas/data/cars.sas7bdat new file mode 100644 index 0000000000000000000000000000000000000000..ca5d3474c36ad4532d1b87d1f872491c9bee8f1f GIT binary patch literal 13312 zcmeHN4Q!R=6@K{{GHjN*HCrXbT^V77AQlP?k+HYsPh2G^2q{c|Mi1o-REDNvA6!KwU>bU zC^l+D2A??sBDiWtjZl%5O)GOtmbENi*mTp%+^4t}HLYsm*PSc>>8|HbHrX93<(~6l zJ>F?tt#hLSko>r2dIPQ-8rCdZogr{MWH;`sb7zBEgo2l2c}o_8va&)21% z>r&5^aq?Zr@R^q^k$iA7z03LV+|cV?{ujx!n;zlHHt${=5ZI(Ekbd=uzztE_0se!+ zGa_8s;@vw6o9&hPYY8MO=XzJ0Jo@$C?JfyC$GiIEvE1R^#6tojf&y0n z4?H38>77*B1 zN&;t{Kd4@s^Nh)^!y^3+Qv&*|)ihYr+-=@Hq(oj2rOPY5n^g_F)uMMC-vsoln^JPL5XT4G*$!YVy0T9Sz%YDWW$ zm~2pK%MN*ROjB%xU@oszo>)^NA--NMP5NA^Fo>KJ&mcWGAkS8SY`vFP^MsZ>q)Qsv z2zMxKcGJKm3WFzOlVu7wA`K;3((Gn&cN`Vil(0jLRJ#r4$oP%I785%E+Ebzg9IQ39`PPs?K~qGbD5Q%qA^K@3K86=u%&?!yiaGErI>_RdtAIU-I-oxFv)hQ zWs#r26%Y}BoX#lVILv-|9?Zc6OPBQ?dQ zgwj7%7)cY|9MOYHA0q@)yuDwRwAx`zeyDbvJA|hyoN^xSOs{sHCG``HpczQJU)EiL z^%Gk1J0+VDnQi7oNenc`x^<{xg`snis z#NV0azVwN|ljHF=Tpye}cW$mc^EQ@0Mvn;hQSy9g3g=p*1?P89YRqpQgj?T8O_S`+~op*Kwk)F-lx;OOld@xCDx#$f+Yk&~raLGl5 zO`t}E8FQ{T2LJTVc<`Az^$sXtpA(Gt4}sy9xj@M(>|f`>{&^lOqv$jG@E)$f8R-89 zuHWKbBWqpOFw%Vm&%5cgTq?_t)A>Wlx*3mfAzXz2*RJ&0_hPou-I4gm`#8BXCak$m z`TR+^g|7te)P!Kw@uOTdp_V5kRjc3y@1tQ+DitC(ctLsmg0zfRN9z0^`rnB3cN^no zgxianl25*e)-EF{-{#$l#|5^L#C^oE0|KXpI;P1fo%I-wIg!9fdRFK0H{+5HgJ9aD zA~M(D+q`>S=Y!3j0yf0c^x$N6d)VD^sM?ZV943!c>8A8cssy^l%L2T7>j{yb9PQ?C zEDi{4(j3$C4+{)SLmdC?pGc=7xXQbh#{^Doh*x7LrevFI 
zfd#x>VOx@?#=W|vNprKkMbnUy#{TDMNtQGN>5f-R8Z&7qRPP{3pE3HT2$fc!=p+1! zvMLJ?C^apY+!M~%G-S(lttQQ+xix=KVM{SL+cQ+jjA%&(^x1V(yo8dJfm;+d1H(~Y zsW~%BlY(51-4fV>Ni#d+&%?o+@is?lz5pe)i3@Z-9>>q0`3t4N#u4q_H6SrD zR?+UW(l`oP_+F7_V8tEQm@J5PmoWr1Ca&4wl@{1Ssz~6a&a|?R(v0_pesL#T z4J*_@gFSNjOLgaJa~^h28x$|68w@XB6gWi)(x;&GF|x5EpEx2)EDhw~dSz9XYA(-J`YUv+xAzY|a{ohk+mJlCz`T?8qy$BLf~QpGua zdlyG(j*Mi<@=#5)!LTG3syM%Kqp_+04kpsR@%X|hO|nf3?qrm32&j_N$b2rbgk-xu z5CXalW=EbLio`cIf<7Y6P35UT2*8l8Pn3{V{ncb+k3AGRM508b2~Q2Jx7c@is&ts&mXyit>xUmr3LyHT#S@adUdAG|2xLuw<9`L1o<)TJnN2q+9$uW zp3FrD0^P7Eq9Th9_Mij<@nNFb!guD$?1t`aBI^378y%&|Vt6x7uK_Oh{W%3AKW87M zG&+iLmbrgDP#G|)UILX|Z^7N2o793e8ao z8_nnt6ErdNl7EayCOr&X8l~yw&fq)w^+B_Qe@^ zL5c35c3_||mPD=zF~M&ZL}}lOJ)ZQR7Sm1ooRQgm+T3BNPJ|^hp<>JXV~{TP-2&-) zk2P?$d-q((4h%$*$S#6^i8)j96mSJgLjeZv4iSXiwj_^xr$TlBTc|2|{QnGzcb^fW z(uzO+D3Mb`8iCy&U@Qqs#6Z&i`VeoJVlkoaQpp3uHfLcB72lwAJwg~t3}!A{LgRq6 zx#L%PR|wJ|o4LaX>fiJW%~3Z~9Gg-KMqo!5Jr>NgFlHy&LLuI%B@wFS8Y+WXX~_`* zWHWvfez-*4a3^ITlq4HPv`2%PhDEB|VDfu?$wP&2>@}Fx&>iB9;?CAik{uL@6*t@C z{J=r*1Zj}$f zQiBv@_lz_Sjz+ZvO1`Mw-t2bkF}sCT?%J43!bOFfGVrq{&()nv#RSq6E>r4_Fnl(BF=wJ&H5F@bb*ggHsx?)Siu;^qj4+3=Vrnzn@SOCs%z-&7LcRW?T@ zq|bR^w#Q1*Q*+u9(r4ZQA5fQy3Fxy&VOx@xJfoKQxMT*-N>5XFj3>p7sh3TVmR#ns zhxoDfK@dVC*oOBlng*i}U#v{lz?vnl75~#urJJJ?Chh)`cj3w=Ew)~RIlP86mrJ4& xc8FwhLyav--EV2AQXE9Q?^8>RRV+X~l_cg2{~V6Ma0G@UFdTv52z>St_%D}_T}J=_ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 101ee3e619f5b..efde152a918bd 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -183,6 +183,22 @@ def test_date_time(datapath): tm.assert_frame_equal(df, df0) +def test_compact_numerical_values(datapath): + # Regression test for #21616 + fname = datapath("io", "sas", "data", "cars.sas7bdat") + df = pd.read_sas(fname, encoding='latin-1') + # The two columns CYL and WGT in cars.sas7bdat have column + # width < 8 and only contain integral values. 
+ # Test that pandas doesn't corrupt the numbers by adding + # decimals. + result = df['WGT'] + expected = df['WGT'].round() + tm.assert_series_equal(result, expected, check_exact=True) + result = df['CYL'] + expected = df['CYL'].round() + tm.assert_series_equal(result, expected, check_exact=True) + + def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")