In [1]:
import pandas as pd
import numpy as np

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [2]:
sc = pyspark.SparkContext()
ss = SparkSession(sc)

In [3]:
# get the sales text files
fil = './Exercise Files/sales_log/sales-1.csv'
sales = ss.read.csv(fil).cache()
print(type(sales))
print('%d records'%sales.count())

<class 'pyspark.sql.dataframe.DataFrame'>
99 records


In [4]:
display(sales.take(10))

[Row(_c0='13729', _c1='1/1/09', _c2='9', _c3='0.08', _c4='200', _c5='1640.96', _c6='Matt Bertelsons', _c7='Maryland', _c8='East', _c9='Development - Big Data', _c10='1', _c11='0.71'),
 Row(_c0='28774', _c1='1/1/09', _c2='32', _c3='0.1', _c4='200', _c5='5707.67', _c6='Jessica Thornton', _c7='Pennsylvania', _c8='East', _c9='Development - Big Data', _c10='1', _c11='0.78'),
 Row(_c0='9285', _c1='1/2/09', _c2='3', _c3='0.06', _c4='160', _c5='447.11', _c6="David O'Rourke", _c7='Minnesota', _c8='Central', _c9='Development - Java', _c10='1', _c11='0.56'),
 Row(_c0='37537', _c1='1/2/09', _c2='4', _c3='0', _c4='125', _c5='495.47', _c6='Alan Brumley', _c7='California', _c8='West', _c9='Training - Development', _c10='1', _c11='0.5'),
 Row(_c0='37537', _c1='1/2/09', _c2='43', _c3='0.07', _c4='125', _c5='4953.46', _c6='Alan Brumley', _c7='California', _c8='West', _c9='Training - Development', _c10='1', _c11='0.58'),
 Row(_c0='37537', _c1='1/2/09', _c2='32', _c3='0.05', _c4='200', _c5='6024.92', _c6=

In [5]:
# load into a dataframe & name columns
salesDF = ss.read.load(fil, format='com.databricks.spark.csv', header='false', inferSchema='true')
cols = ['Invoice', 'Date', 'C2', 'C3', 'C4', 'Amount', 'Seller', 'State', 'Region', 'Topic', 'C10', 'C11']
salesDF = salesDF.toDF(*cols).cache()
salesDF.printSchema()
display(salesDF.show(10))

root
 |-- Invoice: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- C2: integer (nullable = true)
 |-- C3: double (nullable = true)
 |-- C4: integer (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Seller: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Topic: string (nullable = true)
 |-- C10: integer (nullable = true)
 |-- C11: double (nullable = true)

+-------+------+---+----+---+-------+----------------+------------+-------+--------------------+---+----+
|Invoice|  Date| C2|  C3| C4| Amount|          Seller|       State| Region|               Topic|C10| C11|
+-------+------+---+----+---+-------+----------------+------------+-------+--------------------+---+----+
|  13729|1/1/09|  9|0.08|200|1640.96| Matt Bertelsons|    Maryland|   East|Development - Big...|  1|0.71|
|  28774|1/1/09| 32| 0.1|200|5707.67|Jessica Thornton|Pennsylvania|   East|Development - Big...|  1|0.78|
|   9285|1/2/09|  3|0.06|

None

In [6]:
# show distinct states
salesDF.select('State').distinct().orderBy('State').show()

+-----------+
|      State|
+-----------+
|    Alabama|
| California|
|   Colorado|
|Connecticut|
|    Florida|
|   Illinois|
|       Iowa|
|   Kentucky|
|  Louisiana|
|         MA|
|         MO|
|      Maine|
|   Maryland|
|   Michigan|
|  Minnesota|
|   Nebraska|
|     Nevada|
| New Jersey|
|   New York|
|       Ohio|
+-----------+
only showing top 20 rows



In [7]:
# get total amount by state & topic
salesDF.select(salesDF['State'], salesDF['Topic'], salesDF['Amount']).groupBy(['State', 'Topic']).sum().orderBy('sum(Amount)', ascending=False).show()

+--------------+--------------------+-----------------+
|         State|               Topic|      sum(Amount)|
+--------------+--------------------+-----------------+
|    California|Development - Bus...|         13430.93|
|South Carolina|Development - Big...|          9026.39|
|         Texas|Training - Develo...|          8626.77|
|         Texas|      Training - SQL|8473.539999999999|
|      Virginia|Development - Python|          8232.77|
|       Florida|Development - Big...|          7755.69|
|      Oklahoma|      Training - SQL|          7552.77|
|    California|Development - Python|7288.360000000001|
|      Kentucky|Development - Big...|          7215.86|
|      Kentucky|Consulting - Stra...|          7066.33|
|         Texas|  Development - Java|           6943.6|
|     Louisiana|Consulting - Stra...|          6919.42|
|        Nevada|  Development - Java|           6710.7|
|            MO|Consulting - Busi...|          6073.21|
|    California|Development - Big...|          6

In [8]:
# get just texas rows
salesDF.filter(salesDF['State']=='Texas').show()

+-------+-------+---+----+---+-------+----------------+-----+-------+--------------------+---+----+
|Invoice|   Date| C2|  C3| C4| Amount|          Seller|State| Region|               Topic|C10| C11|
+-------+-------+---+----+---+-------+----------------+-----+-------+--------------------+---+----+
|   8710| 1/4/09| 17|0.03|160|2614.63|  Tamara O'Brill|Texas|Central|  Development - Java|  1|0.72|
|   8710| 1/4/09| 42|0.07|110|4257.89|  Tamara O'Brill|Texas|Central|Development - Python|  1|0.38|
|  49730| 1/4/09|  3|0.06|125| 349.32|       Dave Hart|Texas|Central|Training - Develo...|  1|0.46|
|  57253| 1/5/09| 43|0.08|125|4900.57|  Darren Perrino|Texas|Central|Training - Develo...|  1|0.43|
|  57253| 1/5/09| 29|0.06|125|3376.88|  Darren Perrino|Texas|Central|Training - Develo...|  1|0.58|
|  57253| 1/5/09| 48|0.07|125|5529.86|  Darren Perrino|Texas|Central|      Training - SQL|  1|0.44|
|  57253| 1/5/09| 49|0.08|125|5584.37|  Darren Perrino|Texas|Central|Training - Javasc...|  1| 0.5|


In [9]:
# most valuable topics in california
caTop5 = salesDF.filter(salesDF['State']=='California').select(salesDF['State'], salesDF['Topic'], salesDF['Amount']).groupBy(['State', 'Topic']).sum().withColumnRenamed('sum(Amount)', 'Total').orderBy('Total', ascending=False).limit(5)
caTop5.show()

+----------+--------------------+-----------------+
|     State|               Topic|            Total|
+----------+--------------------+-----------------+
|California|Development - Bus...|         13430.93|
|California|Development - Python|7288.360000000001|
|California|Development - Big...|          6024.92|
|California|Training - Develo...|          5448.93|
|California|Consulting - Mark...|          4801.71|
+----------+--------------------+-----------------+



In [10]:
caTop5.write.saveAsTable('california_top5') # creates folder in spark-warehouse, data in parquet files

In [11]:
caTop5.write.csv('california_top5') # creates folder in the cwd

In [None]:
sc.stop()