From 1e3a5088011c0cca4844404d5edc87d18f34b517 Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Tue, 6 Sep 2016 07:44:16 -0700 Subject: [PATCH 01/11] Show function modification pyspark --- .../main/resources/python/zeppelin_pyspark.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 9a405566036..1211b1fb86a 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -29,6 +29,12 @@ from pyspark.serializers import MarshalSerializer, PickleSerializer import ast import traceback +import base64 +from io import BytesIO +try: + from StringIO import StringIO +except ImportError: + from io import StringIO # for back compatibility from pyspark.sql import SQLContext, HiveContext, Row @@ -50,6 +56,7 @@ def flush(self): class PyZeppelinContext(dict): def __init__(self, zc): self.z = zc + self.max_result = 1000 def show(self, obj): from pyspark.sql import DataFrame @@ -58,6 +65,16 @@ def show(self, obj): else: print(str(obj)) + def show_plot(self, p, **kwargs): + if hasattr(p, '__name__') and p.__name__ == "matplotlib.pyplot": + self.show_matplotlib(p, **kwargs) + elif type(p).__name__ == "DataFrame": # does not play well with sub-classes + # `isinstance(p, DataFrame)` would req `import pandas.core.frame.DataFrame` + # and so a dependency on pandas + self.show_dataframe(p, **kwargs) + elif hasattr(p, '__call__'): + p() #error reporting + # By implementing special methods it makes operating on it more Pythonic def __setitem__(self, key, item): self.z.put(key, item) @@ -71,6 +88,57 @@ def __delitem__(self, key): def __contains__(self, item): return self.z.containsKey(item) + def show_dataframe(self, df, **kwargs): + """Pretty prints DF using Table Display System + """ + limit = len(df) > self.max_result + header_buf = StringIO("") + header_buf.write(str(df.columns[0])) + for col in df.columns[1:]: + header_buf.write("\t") + header_buf.write(str(col)) + header_buf.write("\n") + + body_buf = StringIO("") + rows = df.head(self.max_result).values if limit else df.values + for row in rows: + body_buf.write(str(row[0])) + for cell in row[1:]: + body_buf.write("\t") + body_buf.write(str(cell)) + body_buf.write("\n") + body_buf.seek(0); header_buf.seek(0) + #TODO(bzz): fix it, so it shows red notice, as in Spark + print("%table " + header_buf.read() + body_buf.read()) # + + # ("\nResults are limited by {}." \ + # .format(self.max_result) if limit else "") + #) + body_buf.close(); header_buf.close() + + def show_matplotlib(self, p, fmt="png", width="auto", height="auto", + **kwargs): + """Matplotlib show function + """ + if fmt == "png": + img = BytesIO() + p.savefig(img, format=fmt) + img_str = b"data:image/png;base64," + img_str += base64.b64encode(img.getvalue().strip()) + img_tag = "" + # Decoding is necessary for Python 3 compability + img_str = img_str.decode("ascii") + img_str = img_tag.format(img=img_str, width=width, height=height) + elif fmt == "svg": + img = StringIO() + p.savefig(img, format=fmt) + img_str = img.getvalue() + else: + raise ValueError("fmt must be 'png' or 'svg'") + + html = "%html
{img}
" + print(html.format(width=width, height=height, img=img_str)) + img.close() + def add(self, key, value): self.__setitem__(key, value) From 561860a2321409c596397e2eb3366ff7c1820e51 Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Wed, 7 Sep 2016 08:52:35 -0700 Subject: [PATCH 02/11] show function modification pyspark --- .../main/resources/python/zeppelin_pyspark.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 1211b1fb86a..3570aacce01 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -115,29 +115,29 @@ def show_dataframe(self, df, **kwargs): #) body_buf.close(); header_buf.close() - def show_matplotlib(self, p, fmt="png", width="auto", height="auto", - **kwargs): - """Matplotlib show function - """ - if fmt == "png": - img = BytesIO() - p.savefig(img, format=fmt) - img_str = b"data:image/png;base64," - img_str += base64.b64encode(img.getvalue().strip()) - img_tag = "" - # Decoding is necessary for Python 3 compability - img_str = img_str.decode("ascii") - img_str = img_tag.format(img=img_str, width=width, height=height) - elif fmt == "svg": - img = StringIO() - p.savefig(img, format=fmt) - img_str = img.getvalue() - else: - raise ValueError("fmt must be 'png' or 'svg'") - - html = "%html
{img}
" - print(html.format(width=width, height=height, img=img_str)) - img.close() + def show_matplotlib(self, p, fmt="png", width="auto", height="auto", + **kwargs): + """Matplotlib show function + """ + if fmt == "png": + img = BytesIO() + p.savefig(img, format=fmt) + img_str = b"data:image/png;base64," + img_str += base64.b64encode(img.getvalue().strip()) + img_tag = "" + # Decoding is necessary for Python 3 compability + img_str = img_str.decode("ascii") + img_str = img_tag.format(img=img_str, width=width, height=height) + elif fmt == "svg": + img = StringIO() + p.savefig(img, format=fmt) + img_str = img.getvalue() + else: + raise ValueError("fmt must be 'png' or 'svg'") + + html = "%html
{img}
" + print(html.format(width=width, height=height, img=img_str)) + img.close() def add(self, key, value): self.__setitem__(key, value) From 6491ff2839e2e1a94fbbbd30727bf821cdb69805 Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Wed, 7 Sep 2016 12:52:44 -0700 Subject: [PATCH 03/11] show function modification --- spark/src/main/resources/python/zeppelin_pyspark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 3570aacce01..684a97c33ba 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -67,11 +67,11 @@ def show(self, obj): def show_plot(self, p, **kwargs): if hasattr(p, '__name__') and p.__name__ == "matplotlib.pyplot": - self.show_matplotlib(p, **kwargs) + show_matplotlib(p, **kwargs) elif type(p).__name__ == "DataFrame": # does not play well with sub-classes # `isinstance(p, DataFrame)` would req `import pandas.core.frame.DataFrame` # and so a dependency on pandas - self.show_dataframe(p, **kwargs) + show_dataframe(p, **kwargs) elif hasattr(p, '__call__'): p() #error reporting From 207d6dd0dbe79f66b34088ff96db1fa8835ddb61 Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Thu, 8 Sep 2016 07:31:59 -0700 Subject: [PATCH 04/11] Revert "show function modification pyspark" This reverts commit 561860a2321409c596397e2eb3366ff7c1820e51. --- .../main/resources/python/zeppelin_pyspark.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 684a97c33ba..43c07be34dc 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -115,29 +115,29 @@ def show_dataframe(self, df, **kwargs): #) body_buf.close(); header_buf.close() - def show_matplotlib(self, p, fmt="png", width="auto", height="auto", - **kwargs): - """Matplotlib show function - """ - if fmt == "png": - img = BytesIO() - p.savefig(img, format=fmt) - img_str = b"data:image/png;base64," - img_str += base64.b64encode(img.getvalue().strip()) - img_tag = "" - # Decoding is necessary for Python 3 compability - img_str = img_str.decode("ascii") - img_str = img_tag.format(img=img_str, width=width, height=height) - elif fmt == "svg": - img = StringIO() - p.savefig(img, format=fmt) - img_str = img.getvalue() - else: - raise ValueError("fmt must be 'png' or 'svg'") - - html = "%html
{img}
" - print(html.format(width=width, height=height, img=img_str)) - img.close() + def show_matplotlib(self, p, fmt="png", width="auto", height="auto", + **kwargs): + """Matplotlib show function + """ + if fmt == "png": + img = BytesIO() + p.savefig(img, format=fmt) + img_str = b"data:image/png;base64," + img_str += base64.b64encode(img.getvalue().strip()) + img_tag = "" + # Decoding is necessary for Python 3 compability + img_str = img_str.decode("ascii") + img_str = img_tag.format(img=img_str, width=width, height=height) + elif fmt == "svg": + img = StringIO() + p.savefig(img, format=fmt) + img_str = img.getvalue() + else: + raise ValueError("fmt must be 'png' or 'svg'") + + html = "%html
{img}
" + print(html.format(width=width, height=height, img=img_str)) + img.close() def add(self, key, value): self.__setitem__(key, value) From d42d49b0b820aa24024677b7910b40e7025efebd Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Thu, 8 Sep 2016 07:42:58 -0700 Subject: [PATCH 05/11] show function modification --- .../main/resources/python/zeppelin_pyspark.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 43c07be34dc..3570aacce01 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -67,11 +67,11 @@ def show(self, obj): def show_plot(self, p, **kwargs): if hasattr(p, '__name__') and p.__name__ == "matplotlib.pyplot": - show_matplotlib(p, **kwargs) + self.show_matplotlib(p, **kwargs) elif type(p).__name__ == "DataFrame": # does not play well with sub-classes # `isinstance(p, DataFrame)` would req `import pandas.core.frame.DataFrame` # and so a dependency on pandas - show_dataframe(p, **kwargs) + self.show_dataframe(p, **kwargs) elif hasattr(p, '__call__'): p() #error reporting @@ -115,29 +115,29 @@ def show_dataframe(self, df, **kwargs): #) body_buf.close(); header_buf.close() - def show_matplotlib(self, p, fmt="png", width="auto", height="auto", - **kwargs): - """Matplotlib show function - """ - if fmt == "png": - img = BytesIO() - p.savefig(img, format=fmt) - img_str = b"data:image/png;base64," - img_str += base64.b64encode(img.getvalue().strip()) - img_tag = "" - # Decoding is necessary for Python 3 compability - img_str = img_str.decode("ascii") - img_str = img_tag.format(img=img_str, width=width, height=height) - elif fmt == "svg": - img = StringIO() - p.savefig(img, format=fmt) - img_str = img.getvalue() - else: - raise ValueError("fmt must be 'png' or 'svg'") - - html = "%html
{img}
" - print(html.format(width=width, height=height, img=img_str)) - img.close() + def show_matplotlib(self, p, fmt="png", width="auto", height="auto", + **kwargs): + """Matplotlib show function + """ + if fmt == "png": + img = BytesIO() + p.savefig(img, format=fmt) + img_str = b"data:image/png;base64," + img_str += base64.b64encode(img.getvalue().strip()) + img_tag = "" + # Decoding is necessary for Python 3 compability + img_str = img_str.decode("ascii") + img_str = img_tag.format(img=img_str, width=width, height=height) + elif fmt == "svg": + img = StringIO() + p.savefig(img, format=fmt) + img_str = img.getvalue() + else: + raise ValueError("fmt must be 'png' or 'svg'") + + html = "%html
{img}
" + print(html.format(width=width, height=height, img=img_str)) + img.close() def add(self, key, value): self.__setitem__(key, value) From 3224184c51886ea286bd41dc9c9c34b09d9bffe9 Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Thu, 8 Sep 2016 08:45:59 -0700 Subject: [PATCH 06/11] combining show and show_plot --- .../main/resources/python/zeppelin_pyspark.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 3570aacce01..0f9f6b4b844 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -58,22 +58,20 @@ def __init__(self, zc): self.z = zc self.max_result = 1000 - def show(self, obj): + def show(self, obj,**kwargs): from pyspark.sql import DataFrame if isinstance(obj, DataFrame): - print(gateway.jvm.org.apache.zeppelin.spark.ZeppelinContext.showDF(self.z, obj._jdf)) + print(gateway.jvm.org.apache.zeppelin.spark.ZeppelinContext.showDF(self.z, obj._jdf)) + elif hasattr(obj, '__name__') and obj.__name__ == "matplotlib.pyplot": + self.show_matplotlib(obj, **kwargs) + elif type(obj).__name__ == "DataFrame": # does not play well with sub-classes + # `isinstance(p, DataFrame)` would req `import pandas.core.frame.DataFrame` + # and so a dependency on pandas + self.show_dataframe(obj, **kwargs) + elif hasattr(obj, '__call__'): + obj() #error reporting else: - print(str(obj)) - - def show_plot(self, p, **kwargs): - if hasattr(p, '__name__') and p.__name__ == "matplotlib.pyplot": - self.show_matplotlib(p, **kwargs) - elif type(p).__name__ == "DataFrame": # does not play well with sub-classes - # `isinstance(p, DataFrame)` would req `import pandas.core.frame.DataFrame` - # and so a dependency on pandas - self.show_dataframe(p, **kwargs) - elif hasattr(p, '__call__'): - p() #error reporting + print(str(obj)) # By implementing special methods it makes operating on it more Pythonic def __setitem__(self, key, item): From 0faf2aa2b3b979f352a7a1f8f449d47a24651a89 Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Fri, 9 Sep 2016 01:06:55 -0700 Subject: [PATCH 07/11] dataframe pandas dependency --- spark/src/main/resources/python/zeppelin_pyspark.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 0f9f6b4b844..218f6995430 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -30,6 +30,7 @@ import ast import traceback import base64 +import pandas.core.frame.DataFrame from io import BytesIO try: from StringIO import StringIO @@ -61,13 +62,11 @@ def __init__(self, zc): def show(self, obj,**kwargs): from pyspark.sql import DataFrame if isinstance(obj, DataFrame): + #pandas object module dependency if dataframe is a pandas module + print "pandas object module" print(gateway.jvm.org.apache.zeppelin.spark.ZeppelinContext.showDF(self.z, obj._jdf)) elif hasattr(obj, '__name__') and obj.__name__ == "matplotlib.pyplot": - self.show_matplotlib(obj, **kwargs) - elif type(obj).__name__ == "DataFrame": # does not play well with sub-classes - # `isinstance(p, DataFrame)` would req `import pandas.core.frame.DataFrame` - # and so a dependency on pandas - self.show_dataframe(obj, **kwargs) + self.show_matplotlib(obj, **kwargs) elif hasattr(obj, '__call__'): obj() #error reporting else: From 1d3583a72918c905a783d32145b4bea3826a637b Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Sat, 10 Sep 2016 02:22:10 -0700 Subject: [PATCH 08/11] pandas dataframe dependency resolution --- spark/src/main/resources/python/zeppelin_pyspark.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 218f6995430..8a7edd71d4e 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -30,7 +30,7 @@ import ast import traceback import base64 -import pandas.core.frame.DataFrame +import pandas as pd from io import BytesIO try: from StringIO import StringIO @@ -61,9 +61,9 @@ def __init__(self, zc): def show(self, obj,**kwargs): from pyspark.sql import DataFrame - if isinstance(obj, DataFrame): - #pandas object module dependency if dataframe is a pandas module - print "pandas object module" + if isinstance(obj, DataFrame) and isinstance(obj, pd.DataFrame): + self.show_dataframe(p, **kwargs) + elif isinstance(obj, DataFrame) and not isinstance(obj, pd.DataFrame): print(gateway.jvm.org.apache.zeppelin.spark.ZeppelinContext.showDF(self.z, obj._jdf)) elif hasattr(obj, '__name__') and obj.__name__ == "matplotlib.pyplot": self.show_matplotlib(obj, **kwargs) From cd3e111383b261b1def068e8ec9fbacf73132fe9 Mon Sep 17 00:00:00 2001 From: Ishmeet Kaur Date: Sat, 10 Sep 2016 08:50:13 -0700 Subject: [PATCH 09/11] Dataframe pandas module resolution --- spark/src/main/resources/python/zeppelin_pyspark.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 8a7edd71d4e..7eda664eaa1 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -30,7 +30,6 @@ import ast import traceback import base64 -import pandas as pd from io import BytesIO try: from StringIO import StringIO @@ -61,9 +60,7 @@ def __init__(self, zc): def show(self, obj,**kwargs): from pyspark.sql import DataFrame - if isinstance(obj, DataFrame) and isinstance(obj, pd.DataFrame): - self.show_dataframe(p, **kwargs) - elif isinstance(obj, DataFrame) and not isinstance(obj, pd.DataFrame): + if isinstance(obj, DataFrame) and type(p).__name__ == "DataFrame": print(gateway.jvm.org.apache.zeppelin.spark.ZeppelinContext.showDF(self.z, obj._jdf)) elif hasattr(obj, '__name__') and obj.__name__ == "matplotlib.pyplot": self.show_matplotlib(obj, **kwargs) From 3ee2555885ed5ea872cd6ef52d43ab44d0107418 Mon Sep 17 00:00:00 2001 From: ishmeetkaur Date: Mon, 12 Sep 2016 00:04:39 +0200 Subject: [PATCH 10/11] dataframe function removed --- .../main/resources/python/zeppelin_pyspark.py | 29 +------------------ 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 7eda664eaa1..78855d68610 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -80,34 +80,7 @@ def __delitem__(self, key): self.z.remove(key) def __contains__(self, item): - return self.z.containsKey(item) - - def show_dataframe(self, df, **kwargs): - """Pretty prints DF using Table Display System - """ - limit = len(df) > self.max_result - header_buf = StringIO("") - header_buf.write(str(df.columns[0])) - for col in df.columns[1:]: - header_buf.write("\t") - header_buf.write(str(col)) - header_buf.write("\n") - - body_buf = StringIO("") - rows = df.head(self.max_result).values if limit else df.values - for row in rows: - body_buf.write(str(row[0])) - for cell in row[1:]: - body_buf.write("\t") - body_buf.write(str(cell)) - body_buf.write("\n") - body_buf.seek(0); header_buf.seek(0) - #TODO(bzz): fix it, so it shows red notice, as in Spark - print("%table " + header_buf.read() + body_buf.read()) # + - # ("\nResults are limited by {}." \ - # .format(self.max_result) if limit else "") - #) - body_buf.close(); header_buf.close() + return self.z.containsKey(item) def show_matplotlib(self, p, fmt="png", width="auto", height="auto", **kwargs): From 9dc5dc01662e5c89b02a93cb5b4483c236e374d9 Mon Sep 17 00:00:00 2001 From: ishmeetkaur Date: Mon, 12 Sep 2016 15:24:35 +0200 Subject: [PATCH 11/11] removing whitespace --- spark/src/main/resources/python/zeppelin_pyspark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spark/src/main/resources/python/zeppelin_pyspark.py b/spark/src/main/resources/python/zeppelin_pyspark.py index 65d241bc2ce..3a8425bed18 100644 --- a/spark/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/src/main/resources/python/zeppelin_pyspark.py @@ -60,14 +60,14 @@ def __init__(self, zc): def show(self, obj,**kwargs): from pyspark.sql import DataFrame - if isinstance(obj, DataFrame) and type(p).__name__ == "DataFrame": + if isinstance(obj, DataFrame) and type(obj).__name__ == "DataFrame": print(gateway.jvm.org.apache.zeppelin.spark.ZeppelinContext.showDF(self.z, obj._jdf)) elif hasattr(obj, '__name__') and obj.__name__ == "matplotlib.pyplot": self.show_matplotlib(obj, **kwargs) elif hasattr(obj, '__call__'): obj() #error reporting else: - print(str(obj)) + print(str(obj)) # By implementing special methods it makes operating on it more Pythonic def __setitem__(self, key, item):