# Tutorial for NGWB Ingest API with Brunel visualizations
Brunel provides a highly succinct and novel language for defining interactive data visualizations based on tabular data. The language is well suited for both data scientists and more aggressive business users. The system interprets the language and produces visualizations using the user's choice of existing lower-level visualization technologies typically used by application engineers, such as RAVE or D3.

In [1]:
# Optional setup step, left disabled: uncomment the line below to install the
# `brunel` package if it is not already available in this environment.
#!pip install brunel

In [2]:
import pandas as pd
import brunel

In [3]:
# Bring in the Spark SQL entry point and the ingest helper class.
from pyspark.sql import SQLContext
from extension_utils import ExtensionUtils

# `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by the kernel — TODO confirm.
sqlContext = SQLContext(sc)

# `eu` is the handle used by every inferTypes / profile / render call below.
eu = ExtensionUtils(sqlContext)

In [4]:
# Download the sample archive into /resources/data (quietly).
!wget --quiet  --output-document /resources/data/sparklingdataset.zip https://ibm.box.com/shared/static/9nxnsf6xwmuczjea911xjxp8l21yyd2x.zip
# Unpack it under /resources/data/sparklingdata/; -o overwrites existing files without prompting.
!unzip -o /resources/data/sparklingdataset.zip -d /resources/data/sparklingdata/
# The archive is no longer needed once extracted.
!rm /resources/data/sparklingdataset.zip

Archive:  /resources/data/sparklingdataset.zip
  inflating: /resources/data/sparklingdata/data/sampleDataDir/customers.csv  
  inflating: /resources/data/sparklingdata/data/sampleDataDir/drugInfo2014.json  
  inflating: /resources/data/sparklingdata/data/sampleDataDir/drugInfo2015.json  
  inflating: /resources/data/sparklingdata/data/sampleDocsDir/Events.doc  
  inflating: /resources/data/sparklingdata/data/sampleDocsDir/News.pdf  


In [5]:
# Point the "com.ibm.spark.discover" data source at the sample data directory.
df = (
    sqlContext.read
    .format("com.ibm.spark.discover")
    .load("/resources/data/sparklingdata/data/sampleDataDir/")
)

In [6]:
# Run type inference on the raw frame with `extractFields` switched on,
# then print the schema of the frame that comes back.
options = dict(extractFields=True)
dfInfered = eu.inferTypes(df, options)
dfInfered.printSchema()

root
 |-- Age: long (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholestrol: string (nullable = true)
 |-- Cost: string (nullable = true)
 |-- DateTested: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Ent#: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Potassium: double (nullable = true)
 |-- Sodium: double (nullable = true)



In [7]:
# Run inference with `revealNA` in "any" mode, using ">[" and "]<" as the
# opening/closing brackets, and display the resulting rows.
# NOTE(review): presumably the bracketed values are the ones that did not
# match the inferred type — confirm against the ExtensionUtils docs.
options = dict(revealNA=dict(mode="any", brackets=(">[", "]<")))
dfBaddata = eu.inferTypes(df, options)
dfBaddata.show()

+---+------+----------+------------+-------------------+-----+----+------+---------+--------+
|Age|    BP|Cholestrol|        Cost|         DateTested| Drug|Ent#|Gender|Potassium|  Sodium|
+---+------+----------+------------+-------------------+-----+----+------+---------+--------+
| 28|NORMAL|      HIGH|   >[05,14]<|1990-05-14T05:12:05|drugX|   4|     F| 0.072289|0.563682|
| 61|   LOW|      HIGH|   >[10,21]<|1980-10-11T10:10:20|drugY|   5|     F| 0.030998|0.559294|
| 41|   LOW|      HIGH|   >[11,14]<|1980-11-14T09:10:20|drugC|   8|     M| 0.069461|0.766635|
| 47|   LOW|      HIGH|    >[1,14]<|1980-11-14T09:10:20|drugC|  11|     F| 0.076147|0.896056|
| 34|  HIGH|    NORMAL|>[1.981,09]<|1980-10-11T10:10:20|drugY|  12|     F| 0.034782|0.667775|
| 43|   LOW|      HIGH|>[8.010,11]<|1980-10-11T10:10:20|drugY|  13|     M| 0.040746|0.626527|
| 74|   LOW|      HIGH|   >[10,11]<|1980-10-11T10:10:20|drugY|  14|     F| 0.037851|0.792674|
| 50|NORMAL|      HIGH|>[5.000,14]<|1990-05-14T05:12:05|drug

In [8]:
# Repeat the inference with the 'es' locale and grouping enabled, then print
# the per-column type report.
options = dict(locale='es', groupingUsed=True)
dfFixedBaddata = eu.inferTypes(df, options)
eu.printTypes(dfFixedBaddata)

Age:{"inferred_type":"Long","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[Long],"DiscoveredDataTypePercentages":[1.0]}
BP:{"inferred_type":"String","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[String],"DiscoveredDataTypePercentages":[1.0]}
Cholestrol:{"inferred_type":"String","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[String],"DiscoveredDataTypePercentages":[1.0]}
Cost:{"inferred_type":"Float","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[Short, Float, Byte],"DiscoveredDataTypePercentages":[0.16666666666666666, 0.43333333333333335, 0.4]}
DateTested:{"inferred_type":"Timestamp","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[Timestamp],"DiscoveredDataTypePercentages":[1.0]}
Drug:{"inferred_type":"String","inferred_occurrence":"100","threshold%":"50","DiscoveredDataTypes":[String],"DiscoveredDataTypePercentages":[1.0]}
Ent#:{"inferred_type":"Long","inferred_occurrence":"10

In [9]:
# Same locale settings as the previous step, now with `extractFields` on;
# inspect the schema and the first five rows of the result.
options = dict(extractFields=True, locale='es', groupingUsed=True)
dfConverted = eu.inferTypes(dfFixedBaddata, options)
dfConverted.printSchema()
dfConverted.show(5)

root
 |-- Age: long (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholestrol: string (nullable = true)
 |-- Cost: string (nullable = true)
 |-- DateTested: string (nullable = true)
 |-- Drug: string (nullable = true)
 |-- Ent#: long (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Potassium: double (nullable = true)
 |-- Sodium: double (nullable = true)

+---+------+----------+-----+-------------------+-----+----+------+---------+--------+
|Age|    BP|Cholestrol| Cost|         DateTested| Drug|Ent#|Gender|Potassium|  Sodium|
+---+------+----------+-----+-------------------+-----+----+------+---------+--------+
| 23|  HIGH|      HIGH|   11|1980-10-11T10:10:20|drugY|   1|     F| 0.031258|0.792535|
| 47|   LOW|      HIGH|   14|1980-11-14T09:10:20|drugC|   2|     M| 0.056468|0.739309|
| 47|   LOW|      HIGH|   14|1980-11-14T09:10:20|drugC|   3|     M| 0.068944|0.697269|
| 28|NORMAL|      HIGH|05,14|1990-05-14T05:12:05|drugX|   4|     F| 0.072289|0.563682|
| 61|   LOW

In [10]:
# Profile the converted frame; `dfProfiled` is the input to every
# eu.render(...) call in the cells that follow.
dfProfiled = eu.profile(dfConverted)
# Print the per-column profile (stats, bins, labels, discovered types).
eu.printProfile(dfProfiled)

Age:{"StatsNames":["count","mean","min","max","range","stddev","variance"],"inferred_type":"Long","Bins":[16.0,18.9,21.8,24.7,27.6,30.5,33.4,36.3,39.2,42.1,45.0,47.9,50.8,53.7,56.6,59.5,62.4,65.3,68.2,71.1,74.0],"inferred_occurrence":100,"DiscoveredDataTypePercentages":[1.0],"Values":[1,0,3,0,2,3,1,1,1,3,5,4,0,0,1,2,1,0,1,1],"threshold%":50,"Stats":["30","43.06666666666667","16","74","58.0","14.318864656720212","205.02988505747123"],"DiscoveredDataTypes":["Long"]}
BP:{"StatsNames":["count","numberOfCategories","mode"],"inferred_type":"String","columnSpec":{"type":"String","locale":"es","groupingUsed":true},"inferred_occurrence":100,"Percentages":[0.5333333333333333,0.23333333333333334,0.23333333333333334],"DiscoveredDataTypePercentages":[1.0],"Values":[16,7,7],"Labels":["LOW","NORMAL","HIGH"],"threshold%":50,"Stats":["30","3","LOW"],"DiscoveredDataTypes":["String"]}
Cholestrol:{"StatsNames":["count","numberOfCategories","mode"],"inferred_type":"String","columnSpec":{"type":"String","lo

In [11]:
def _profile_view(kind, column):
    """Return the `kind` rendering of dfProfiled, restricted to one column.

    The result is a Spark frame with the columns "name", "value" and "column".
    """
    return (
        eu.render(dfProfiled, kind)
        .select("name", "value", "column")
        .filter("column = '{}'".format(column))
    )


# Each view is converted to pandas so the %brunel magics can reference it by name.
discovered_labels_BP = _profile_view("labels", "BP")
pd_discovered_labels_BP = discovered_labels_BP.toPandas()

discovered_types_Cost = _profile_view("types", "Cost")
pd_discovered_types_Cost = discovered_types_Cost.toPandas()

# NOTE: despite "types" in the variable name, this is the "labels" view for Drug.
discovered_types_Drug = _profile_view("labels", "Drug")
pd_discovered_types_Drug = discovered_types_Drug.toPandas()

In [12]:
# Two stacked polar bar charts composed with `|`: BP labels and Cost discovered types,
# each coloured and labelled by `name` and showing `value` as a percentage.
%brunel data('pd_discovered_labels_BP') stack polar bar y(value) polar color(name) label(name) percent(value) tooltip(#all) | data('pd_discovered_types_Cost') stack polar bar y(value) polar color(name) label(name) percent(value) tooltip(#all)

<IPython.core.display.Javascript object>

In [13]:
%%brunel data('pd_discovered_types_Drug') stack polar bar y(value) polar color(name) label(name) percent(value) tooltip(#all)
 :: width=400, height=300

<IPython.core.display.Javascript object>

In [14]:
# Histogram view of the profile, restricted to the Cost column.
df_histogram = (
    eu.render(dfProfiled, "histogram")
    .select("name", "value", "column")
    .filter("column = 'Cost'")
)
df_histogram.show()

# pandas copy for the Brunel chart.
pd_histogram = df_histogram.toPandas()

+----+-----+------+
|name|value|column|
+----+-----+------+
+----+-----+------+



In [15]:
# Bar chart of the Cost histogram view (x = name, y = value), rendered at 900x600.
%brunel data('pd_histogram') bar x(name) y(value) filter(column) color(name) tooltip(#all) :: width=900, height=600

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


<IPython.core.display.Javascript object>

In [16]:
# Histogram view of the profile, restricted to the Age column.
df_age = (
    eu.render(dfProfiled, "histogram")
    .select("name", "value", "column")
    .filter("column = 'Age'")
)

# pandas copy for the Brunel chart.
pd_age = df_age.toPandas()

In [17]:
# Bar chart of the Age histogram view (x = name, y = value), coloured by name.
%brunel data('pd_age') bar x(name) y(value) filter(column) color(name) tooltip(#all)

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


<IPython.core.display.Javascript object>

In [18]:
# Stats view of the profile, restricted to the Cost column.
df_stats = (
    eu.render(dfProfiled, "stats")
    .select("name", "value", "column")
    .filter("column = 'Cost'")
)

# pandas copy for the Brunel chart.
pd_stats = df_stats.toPandas()

In [19]:
# Area chart of the Cost stats view: x = name, y range from 0 to value, sorted by value.
%brunel data('pd_stats') area x(name) yrange(0,value) sort(value)

<IPython.core.display.Javascript object>

In [20]:
# Datetime view of the profile, restricted to the DateTested column.
df_datetime = (
    eu.render(dfProfiled, "datetime")
    .select("name", "value", "column")
    .filter("column = 'DateTested'")
)

# pandas copy for the Brunel chart.
pd_datetime = df_datetime.toPandas()

In [21]:
# Area chart of the DateTested datetime view: x = name, y range from 0 to value.
%brunel data('pd_datetime') area x(name) yrange(0,value)

<IPython.core.display.Javascript object>

In [22]:
# Load only customers.csv through the "com.ibm.spark.discover" data source,
# then show its schema and rows.
dfCustomers = (
    sqlContext.read
    .format("com.ibm.spark.discover")
    .load("/resources/data/sparklingdata/data/sampleDataDir/customers.csv")
)
dfCustomers.printSchema()
dfCustomers.show()

root
 |-- C0: string (nullable = true)
 |-- C1: string (nullable = true)
 |-- C2: string (nullable = true)
 |-- C3: string (nullable = true)
 |-- C4: string (nullable = true)

+-----+--------------------+--------------------+--------------------+-------+
|   C0|                  C1|                  C2|                  C3|     C4|
+-----+--------------------+--------------------+--------------------+-------+
|t1234|           Tracy Doe|     Bank of America|69221 Newman Rd, ...|    250|
|t5566|       Lisa McDonald|         Wells Fargo|555 Bailey Ave, S...|   1000|
|t7666|    Lonnie Leo Gomez|       Bank of Texas|1234 Airline Dr, ...|   2000|
|t5567|    Stephen Brewster|First Bank of Ame...|425 Market Street...|3500.25|
|t1238|         Smith, Mary|         J.P. Morgan|3821 Twin Oaks Dr...|   5000|
|t1239|          Jen Norman|       Bank of Texas|4589 Holly Street...|    230|
|t5823|     Mary Burchfield|     Bank of America|69221 Newman Rd, ...|   1500|
|t9954|         John Miller|First 

In [23]:
# Infer types for the customers frame with both field extraction and
# semantic types enabled; print the schema and rows of the result.
options = dict(extractFields=True, semanticTypes=True)
dfCustomersInferred = eu.inferTypes(dfCustomers, options)
dfCustomersInferred.printSchema()
dfCustomersInferred.show()

root
 |-- C0: string (nullable = true)
 |-- C1: string (nullable = true)
 |-- C2: string (nullable = true)
 |-- C3: string (nullable = true)
 |-- C4: string (nullable = true)

+-----+--------------------+--------------------+--------------------+-------+
|   C0|                  C1|                  C2|                  C3|     C4|
+-----+--------------------+--------------------+--------------------+-------+
|t1234|           Tracy Doe|     Bank of America|69221 Newman Rd, ...|    250|
|t5566|       Lisa McDonald|         Wells Fargo|555 Bailey Ave, S...|   1000|
|t7666|    Lonnie Leo Gomez|       Bank of Texas|1234 Airline Dr, ...|   2000|
|t5567|    Stephen Brewster|First Bank of Ame...|425 Market Street...|3500.25|
|t1238|         Smith, Mary|         J.P. Morgan|3821 Twin Oaks Dr...|   5000|
|t1239|          Jen Norman|       Bank of Texas|4589 Holly Street...|    230|
|t5823|     Mary Burchfield|     Bank of America|69221 Newman Rd, ...|   1500|
|t9954|         John Miller|First 

In [24]:
# Semantic-type inference limited to column C2, with `revealNA` in "any" mode
# and ">[" / "]<" as the brackets; display the resulting rows.
options = dict(
    semanticTypes=True,
    columns=["C2"],
    revealNA=dict(mode="any", brackets=(">[", "]<")),
)
dfCustomersForAnalysis = eu.inferTypes(dfCustomers, options)
dfCustomersForAnalysis.show()

+-----+----------------+--------------------+--------------------+-----+
|   C0|              C1|                  C2|                  C3|   C4|
+-----+----------------+--------------------+--------------------+-----+
|t9954|     John Miller|>[First Farmers &...|1555 Kingston Ave...|  200|
|t8887|   Helen Taranto|       >[BankFirst]<|1800 Century Park...|  300|
|t8763|  Michael Walker|                null|1463 Braxton Stre...|890.1|
|t8667|     Shana Wiley|                null|4589 Holly Street...| 2000|
|t2225|Stephen Brewster|>[First of America]<|4075 Harley Brook...|  600|
|t2229|   Hillary Frost|>[First Farmers &...|1234 Airline Dr, ...|  599|
+-----+----------------+--------------------+--------------------+-----+



In [25]:
# Re-run semantic-type inference without field extraction (this rebinds
# `dfCustomersInferred`), then profile the result and print the profile.
options = dict(extractFields=False, semanticTypes=True)
dfCustomersInferred = eu.inferTypes(dfCustomers, options)
dfCustProfiled = eu.profile(dfCustomersInferred)
eu.printProfile(dfCustProfiled)

C0:{"StatsNames":["count","numberOfCategories","mode"],"inferred_type":"String","columnSpec":{"type":"String"},"inferred_occurrence":100,"Percentages":[0.06451612903225806,0.06451612903225806,0.03225806451612903,0.03225806451612903,0.03225806451612903,0.03225806451612903],"DiscoveredDataTypePercentages":[1.0],"Values":[2,2,1,1,1,1],"Labels":["t4563","t1239","t2224","t5823","t8763","t1234"],"threshold%":50,"Stats":["31","29","t4563"],"DiscoveredDataTypes":["String"]}
C1:{"StatsNames":["count","numberOfCategories","mode"],"inferred_type":"Person","columnSpec":{"type":"Person"},"inferred_occurrence":96,"Percentages":[0.0967741935483871,0.06451612903225806,0.06451612903225806,0.06451612903225806,0.06451612903225806,0.03225806451612903],"DiscoveredDataTypePercentages":[0.967741935483871,0.03225806451612903],"Values":[3,2,2,2,2,1],"Labels":["Lisa McDonald","Jen Norman","Stephen Brewster","Mary Burchfield","Lonnie Leo Gomez","Peter Frost"],"threshold%":50,"Stats":["31","25","Lisa McDonald"],"

In [26]:
# Labels view of the customer profile for column C1.
dfC1Labels = eu.render(dfCustProfiled, "labels").select("name", "value", "column").filter("column = 'C1'")
# `show` has to be called: the bare attribute reference `dfC1Labels.show`
# only names the method and displays nothing.
dfC1Labels.show()
pd_dfC1Labels = dfC1Labels.toPandas()

# Discovered-types view of the customer profile for column C1.
dfC1Types = eu.render(dfCustProfiled, "types").select("name", "value", "column").filter("column = 'C1'")
dfC1Types.show()
pd_dfC1Types = dfC1Types.toPandas()

In [27]:
# Two charts composed with `|`: a bar chart of the C1 labels sorted by value, and a
# stacked polar bar chart of the C1 discovered types shown as percentages.
%brunel data('pd_dfC1Labels') bar x(name) y(value) sort(value) label(name:3, ": ", value) | data('pd_dfC1Types')stack polar bar y(value) polar color(name) label(name) percent(value) tooltip(#all)

<IPython.core.display.Javascript object>

## Want to learn more?

<a href="http://bigdatauniversity.com/courses/introduction-to-python/?utm_source=tutorial-sparkling-python3&utm_medium=dswb&utm_campaign=bdu"><img src = "https://ibm.box.com/shared/static/l8yxiek0fg4e15lwz0ikgunj338nrrtd.png"> </a>

Created by: <a href="https://bigdatauniversity.com/?utm_source=bducreatedbylink&utm_medium=dswb&utm_campaign=bdu">The Big Data University Team</a>