In [7]:
'''

- Do data transformation
- Subset columns and filter rows by using .select() or .where()
- use a groupby() operation and an aggregate function .agg() ( sum by group, average by group etc)
- use distinct(), count() operation
- For those of you who know sql, do a sql operation on a table
    - Register the table view
    - run sql query using spark.sql("SELECT....")
- Create a new column using .withColumnNamed()
- Apply a user defined function (UDF)
- Implement an ML model
- (Optional) Implement Cross validations
- (Optional) identify a problem and fill out a diagnostics form

'''
import findspark  # this needs to be the first import                                                                        
findspark.init()

# jupyter notebook &

from pyspark.sql import DataFrame
from pyspark.sql import *
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.tuning import *
from pyspark.ml.feature import Imputer


import numpy as np
import pandas as pd
import scipy as sc

In [1]:

from pyspark import SparkContext
from pyspark.sql.session import SparkSession
try:
    sc = SparkContext()     
except ValueError:
    pass
# spark = SparkSession(sc)


spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
    
    
# from DaeWon
# establish a spark session with 50 executors
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .master("yarn") \
        .appName("testing") \
        .config("spark.executor.instances", "70") \
        .config("spark.executor.memory","4g") \
        .config("spark.driver.memory","30g") \
        .config("spark.executor.cores",'1') \
        .config("spark.scheduler.mode","FIFO") \
        .getOrCreate()

In [13]:
 spark.read.json?

In [37]:
dat = spark.read.json('./dataset/checkin.json').repartition(150)
dat.printSchema()

bus = spark.read.json('./dataset/business.json').repartition(150)
bus.printSchema()

phot = spark.read.json('./dataset/photos.json').repartition(150)
phot.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- time: struct (nullable = true)
 |    |-- Friday: struct (nullable = true)
 |    |    |-- 0:00: long (nullable = true)
 |    |    |-- 10:00: long (nullable = true)
 |    |    |-- 11:00: long (nullable = true)
 |    |    |-- 12:00: long (nullable = true)
 |    |    |-- 13:00: long (nullable = true)
 |    |    |-- 14:00: long (nullable = true)
 |    |    |-- 15:00: long (nullable = true)
 |    |    |-- 16:00: long (nullable = true)
 |    |    |-- 17:00: long (nullable = true)
 |    |    |-- 18:00: long (nullable = true)
 |    |    |-- 19:00: long (nullable = true)
 |    |    |-- 1:00: long (nullable = true)
 |    |    |-- 20:00: long (nullable = true)
 |    |    |-- 21:00: long (nullable = true)
 |    |    |-- 22:00: long (nullable = true)
 |    |    |-- 23:00: long (nullable = true)
 |    |    |-- 2:00: long (nullable = true)
 |    |    |-- 3:00: long (nullable = true)
 |    |    |-- 4:00: long (nullable = true)
 |    |    |-- 5:00: lo

In [74]:
print dat.count()
print bus.count()
print phot.count()

135148
156639
196278


In [22]:
dat.rdd.getNumPartitions()

150

In [52]:
dat.show()

+--------------------+--------------------+
|         business_id|                time|
+--------------------+--------------------+
|b2596IQibXyco49ct...|[[1,null,null,nul...|
|1njD5p7-FvV8uoHzw...|[[null,null,null,...|
|gE80aKaybUL4SZZE4...|[[null,null,null,...|
|2tMuTCM-UxbCjyxTL...|[null,null,[null,...|
|mzGrMnuHJpPJEF7E4...|[null,null,[1,nul...|
|w9nB3j5ElUThCkgXQ...|[null,null,null,n...|
|Nwd3NkLVZD85zAV_n...|[[1,null,null,nul...|
|ZjwBMsW0tmS7sAYnM...|[null,[null,null,...|
|TzRK4L5ZpZv71cArM...|[[1,null,null,nul...|
|uLdJrH2rJwULKLopi...|[[null,null,null,...|
|d0_yLkH7u16tlcvvU...|[[null,null,null,...|
|DqNpyLa9CWN3YKaIG...|[[null,1,null,nul...|
|k0pZyGb_oBdFpcekM...|[null,[null,null,...|
|oH3V-79Cldz_6sWXi...|[[null,null,1,nul...|
|1y64UsTx6ikxoaCAY...|[[null,null,null,...|
|X209KCF2ex6VvNJcb...|[[1,1,1,null,3,nu...|
|0mj6WOkZnwrdcWYlG...|[[null,null,2,nul...|
|XG0twdTiMzUS3v3p9...|[[1,null,null,nul...|
|85ot7LGtfcr8pj5yK...|[null,[null,null,...|
|YhLrPdDWKd26l-fIG...|[[1,null,n

In [70]:
dat.columns


['business_id', 'time']

In [71]:
phot.columns

['business_id', 'caption', 'label', 'photo_id']

In [72]:
bus.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'neighborhood',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [53]:
bus.show()

+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-------+-------------+--------------+--------------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|      city|               hours|is_open|     latitude|     longitude|                name|        neighborhood|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-------+-------------+--------------+--------------------+--------------------+-----------+------------+-----+-----+
|  Silberburgstr. 171|[null,null,null,n...|IVtp8mznb2vRO0Ocw...|  [Shopping, Bridal]| Stuttgart|[10:00-18:00,10:0...|      1|   48.7714398|      9.168795|   Brautatelier Tara|                    |      70178|           3|  4.5|   BW|
|    7000 E Shea Blvd|[null,null,full_b...|XFQbescZxNRAb2U-N...|

In [54]:
bus.dtypes

[('address', 'string'),
 ('attributes',
  'struct<AcceptsInsurance:boolean,AgesAllowed:string,Alcohol:string,Ambience:struct<casual:boolean,classy:boolean,divey:boolean,hipster:boolean,intimate:boolean,romantic:boolean,touristy:boolean,trendy:boolean,upscale:boolean>,BYOB:boolean,BYOBCorkage:string,BestNights:struct<friday:boolean,monday:boolean,saturday:boolean,sunday:boolean,thursday:boolean,tuesday:boolean,wednesday:boolean>,BikeParking:boolean,BusinessAcceptsBitcoin:boolean,BusinessAcceptsCreditCards:boolean,BusinessParking:struct<garage:boolean,lot:boolean,street:boolean,valet:boolean,validated:boolean>,ByAppointmentOnly:boolean,Caters:boolean,CoatCheck:boolean,Corkage:boolean,DietaryRestrictions:struct<dairy-free:boolean,gluten-free:boolean,halal:boolean,kosher:boolean,soy-free:boolean,vegan:boolean,vegetarian:boolean>,DogsAllowed:boolean,DriveThru:boolean,GoodForDancing:boolean,GoodForKids:boolean,GoodForMeal:struct<breakfast:boolean,brunch:boolean,dessert:boolean,dinner:boolean

In [46]:
dat = dat.select('business_id','time')
print type(dat)

<class 'pyspark.sql.dataframe.DataFrame'>


In [47]:
print type(bus)

<class 'pyspark.sql.dataframe.DataFrame'>


In [56]:
# Comput summary stats
bus.describe().show() 

+-------+--------------------+--------------------+--------------+------------------+-----------------+------------------+------------------+------------+-----------------+-----------------+------------------+------------------+
|summary|             address|         business_id|          city|           is_open|         latitude|         longitude|              name|neighborhood|      postal_code|     review_count|             stars|             state|
+-------+--------------------+--------------------+--------------+------------------+-----------------+------------------+------------------+------------+-----------------+-----------------+------------------+------------------+
|  count|              156639|              156639|        156639|            156639|           156638|            156638|            156639|      156639|           156639|           156639|            156639|            156639|
|   mean|               100.0|                null|          null|0.8443746448840965

In [20]:
dat

DataFrame[business_id: string, time: struct<Friday:struct<0:00:bigint,10:00:bigint,11:00:bigint,12:00:bigint,13:00:bigint,14:00:bigint,15:00:bigint,16:00:bigint,17:00:bigint,18:00:bigint,19:00:bigint,1:00:bigint,20:00:bigint,21:00:bigint,22:00:bigint,23:00:bigint,2:00:bigint,3:00:bigint,4:00:bigint,5:00:bigint,6:00:bigint,7:00:bigint,8:00:bigint,9:00:bigint>,Monday:struct<0:00:bigint,10:00:bigint,11:00:bigint,12:00:bigint,13:00:bigint,14:00:bigint,15:00:bigint,16:00:bigint,17:00:bigint,18:00:bigint,19:00:bigint,1:00:bigint,20:00:bigint,21:00:bigint,22:00:bigint,23:00:bigint,2:00:bigint,3:00:bigint,4:00:bigint,5:00:bigint,6:00:bigint,7:00:bigint,8:00:bigint,9:00:bigint>,Saturday:struct<0:00:bigint,10:00:bigint,11:00:bigint,12:00:bigint,13:00:bigint,14:00:bigint,15:00:bigint,16:00:bigint,17:00:bigint,18:00:bigint,19:00:bigint,1:00:bigint,20:00:bigint,21:00:bigint,22:00:bigint,23:00:bigint,2:00:bigint,3:00:bigint,4:00:bigint,5:00:bigint,6:00:bigint,7:00:bigint,8:00:bigint,9:00:bigint>,Sun

In [24]:
# count num of unique classes (rows) in a column
dat.select(col("time")).distinct().count()

120782

In [48]:
# we can do an aggregate count to see the distribution of the classes
dat.groupBy(col("business_id")).count().collect()

[Row(business_id=u'y8e07i7HPWuYVQzg_2RbxA', count=1),
 Row(business_id=u'68htH_xVthYKRxvpTpu2ew', count=1),
 Row(business_id=u'yuBzzarZ13pt8UpspSxOBg', count=1),
 Row(business_id=u'CqTPLUHBM9AM3TEqPn0krg', count=1),
 Row(business_id=u'0859wfd1BQHG46Zpwhc0ZQ', count=1),
 Row(business_id=u'G-AW1PNolA2XnADKqqJO_A', count=1),
 Row(business_id=u'VmSrPPO2WXmOKjUW7pDGsQ', count=1),
 Row(business_id=u'FVgHb9w4pnkRBV5bZYJaZA', count=1),
 Row(business_id=u'xZpZvAqZSla5c72CuNKnJA', count=1),
 Row(business_id=u'zhBkNLn2KPnh5-NIueXVHA', count=1),
 Row(business_id=u'ARkNkb3EJzA42PqMC5VMbA', count=1),
 Row(business_id=u'OZJniEO9I-PqavZYuyNjng', count=1),
 Row(business_id=u'oF054vcRTDNx88qxBGzhFw', count=1),
 Row(business_id=u'4XMhO5LyfFHq5rdFHkXfXQ', count=1),
 Row(business_id=u'Cl-xl1vTUwHeaGgBxzdTRA', count=1),
 Row(business_id=u'XIAbir-WvErgW6xjB6YRlA', count=1),
 Row(business_id=u'rWxAgLSI0Sk1MCpQGd5HDQ', count=1),
 Row(business_id=u'SubDWJhpXWLs-TMNQxE67Q', count=1),
 Row(business_id=u'KkyOeKAro

In [57]:
bus.filter(bus["longitude"]>24).show()

+--------------------+--------------------+--------------------+--------------------+---------+--------------------+-------+---------+----------+-----------+------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|     city|               hours|is_open| latitude| longitude|       name|neighborhood|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+---------+--------------------+-------+---------+----------+-----------+------------+-----------+------------+-----+-----+
|Green Valley Ranc...|[null,null,full_b...|Zmp2_b2gpSloz4Dv2...|[Italian, Restaur...|Henderson|[17:00-23:00,17:0...|      0|36.019323|115.086769|Terra Verde|            |      89052|          70|  3.5|   NV|
+--------------------+--------------------+--------------------+--------------------+---------+--------------------+-------+---------+----------+-----------+-----------

In [60]:
import pyspark.sql.functions as func
xx = bus.groupBy("postal_code").agg(func.max("stars"), func.sum("review_count"))

In [61]:
xx

DataFrame[postal_code: string, max(stars): double, sum(review_count): bigint]

In [62]:
#we can do the same thing in sql code
# we must first create a "view" of the data in order to execute SQL functions
bus.createOrReplaceTempView("bus")
spark.sql("SELECT COUNT(*) FROM bus GROUP BY neighborhood").collect()

[Row(count(1)=70),
 Row(count(1)=5),
 Row(count(1)=51),
 Row(count(1)=239),
 Row(count(1)=122),
 Row(count(1)=5),
 Row(count(1)=81),
 Row(count(1)=4),
 Row(count(1)=2),
 Row(count(1)=3),
 Row(count(1)=3237),
 Row(count(1)=12),
 Row(count(1)=8),
 Row(count(1)=4),
 Row(count(1)=3),
 Row(count(1)=100),
 Row(count(1)=58),
 Row(count(1)=7),
 Row(count(1)=234),
 Row(count(1)=6),
 Row(count(1)=13),
 Row(count(1)=99),
 Row(count(1)=531),
 Row(count(1)=13),
 Row(count(1)=1),
 Row(count(1)=110),
 Row(count(1)=254),
 Row(count(1)=63),
 Row(count(1)=367),
 Row(count(1)=1),
 Row(count(1)=2),
 Row(count(1)=2),
 Row(count(1)=10),
 Row(count(1)=4),
 Row(count(1)=133),
 Row(count(1)=5),
 Row(count(1)=14),
 Row(count(1)=314),
 Row(count(1)=42),
 Row(count(1)=18),
 Row(count(1)=1751),
 Row(count(1)=269),
 Row(count(1)=149),
 Row(count(1)=18),
 Row(count(1)=23),
 Row(count(1)=8),
 Row(count(1)=20),
 Row(count(1)=276),
 Row(count(1)=24),
 Row(count(1)=34),
 Row(count(1)=118),
 Row(count(1)=5),
 Row(count(1

In [64]:
spark.sql("SELECT * FROM bus").show()

+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-------+-------------+--------------+--------------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|      city|               hours|is_open|     latitude|     longitude|                name|        neighborhood|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-------+-------------+--------------+--------------------+--------------------+-----------+------------+-----+-----+
|  Silberburgstr. 171|[null,null,null,n...|IVtp8mznb2vRO0Ocw...|  [Shopping, Bridal]| Stuttgart|[10:00-18:00,10:0...|      1|   48.7714398|      9.168795|   Brautatelier Tara|                    |      70178|           3|  4.5|   BW|
|    7000 E Shea Blvd|[null,null,full_b...|XFQbescZxNRAb2U-N...|

In [28]:
#Convert string types(categorical) to integerTypes
types = [str(f.dataType) for f in dat.schema.fields]
ind_str = [i for i in range(len(types)) if types[i] is 'StringType']
ind_int = [i for i in range(len(types)) if types[i] is 'IntegerType']
stringcol= [dat.columns[i] for i in ind_str]
intcol = [dat.columns[i] for i in ind_int]
print(stringcol)
print(intcol)
del ind_str, ind_int, types

['business_id']
[]


In [33]:
dat = dat.drop('ID')
dat.drop?

In [34]:
# remove all string columns
for colname in stringcol:
    dat = dat.drop(colname)
    
# remove 'business_id' from inttype list
intcol = intcol[1:]

In [35]:
intcol

[]

In [69]:
print type('s')


<type 'str'>


In [80]:
# apply a user defined function (UDF)
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

taste_udf = udf(lambda stars: "haeven" if stars == 5 else "mediocore", StringType())

In [85]:
print bus['stars']

Column<stars>


In [86]:
print bus.select('stars').show()

+-----+
|stars|
+-----+
|  4.5|
|  3.5|
|  2.0|
|  3.5|
|  3.5|
|  5.0|
|  2.5|
|  3.0|
|  4.0|
|  4.5|
|  5.0|
|  4.5|
|  3.0|
|  3.5|
|  4.5|
|  2.5|
|  4.0|
|  3.5|
|  4.0|
|  4.0|
+-----+
only showing top 20 rows

None


In [90]:
# Add new column
print taste_udf(bus.stars)
# Add new column
buste = bus.withColumn('tastebud', taste_udf(bus.stars))

Column<<lambda>(stars)>


In [91]:
print buste.show()

+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-------+-------------+--------------+--------------------+--------------------+-----------+------------+-----+-----+---------+
|             address|          attributes|         business_id|          categories|      city|               hours|is_open|     latitude|     longitude|                name|        neighborhood|postal_code|review_count|stars|state| tastebud|
+--------------------+--------------------+--------------------+--------------------+----------+--------------------+-------+-------------+--------------+--------------------+--------------------+-----------+------------+-----+-----+---------+
|  Silberburgstr. 171|[null,null,null,n...|IVtp8mznb2vRO0Ocw...|  [Shopping, Bridal]| Stuttgart|[10:00-18:00,10:0...|      1|   48.7714398|      9.168795|   Brautatelier Tara|                    |      70178|           3|  4.5|   BW|mediocore|
|    7000 E Shea Blvd|[n

In [93]:
#  Implement an ML model
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

categoricalColumns = bus['categories']

In [94]:
print categoricalColumns

Column<categories>


In [None]:
# https://docs.databricks.com/spark/latest/mllib/binary-classification-mllib-pipelines.html
# (Optional) Implement Cross validations
# (Optional) identify a problem and fill out a diagnostics form

In [95]:
spark.stop()