# Ex2 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise start one named "Chipotle".
session_builder = SparkSession.builder.appName("Chipotle")
spark = session_builder.getOrCreate()
# Keep a handle on the underlying SparkContext as well.
sc = spark.sparkContext

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [2]:
from pyspark import SparkFiles

import os
from urllib.request import urlretrieve

# The exercise asks for the data to come straight from the web. Fetch the file
# once and keep it next to the notebook; later runs reuse the local copy, so a
# fresh "Restart & Run All" works without a manually downloaded file.
# NOTE(review): assumes Spark resolves the relative path against the notebook's
# working directory (local mode) — confirm if this runs against a cluster.
DATA_URL = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"
DATA_PATH = "chipotle.tsv"

if not os.path.exists(DATA_PATH):
    urlretrieve(DATA_URL, DATA_PATH)

# Tab-separated file with a header row; let Spark infer the column types.
chipo = spark.read.option("delimiter", "\t").csv(DATA_PATH, header=True, inferSchema=True)

### Step 4. See the first 10 entries

In [3]:
# Bring the first ten rows back to the driver as a list of Row objects.
chipo.head(10)

[Row(order_id=1, quantity=1, item_name='Chips and Fresh Tomato Salsa', choice_description='NULL', item_price='$2.39 '),
 Row(order_id=1, quantity=1, item_name='Izze', choice_description='[Clementine]', item_price='$3.39 '),
 Row(order_id=1, quantity=1, item_name='Nantucket Nectar', choice_description='[Apple]', item_price='$3.39 '),
 Row(order_id=1, quantity=1, item_name='Chips and Tomatillo-Green Chili Salsa', choice_description='NULL', item_price='$2.39 '),
 Row(order_id=2, quantity=2, item_name='Chicken Bowl', choice_description='[Tomatillo-Red Chili Salsa (Hot), [Black Beans, Rice, Cheese, Sour Cream]]', item_price='$16.98 '),
 Row(order_id=3, quantity=1, item_name='Chicken Bowl', choice_description='[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sour Cream, Guacamole, Lettuce]]', item_price='$10.98 '),
 Row(order_id=3, quantity=1, item_name='Side of Chips', choice_description='NULL', item_price='$1.69 '),
 Row(order_id=4, quantity=1, item_name='Steak Burrito', choice_description='[Tom

### Step 5. What is the number of observations in the dataset?

In [4]:
# Solution 1: count() returns the number of rows in the DataFrame.
chipo.count()


4622

In [5]:
# Solution 2: expose the DataFrame to Spark SQL as the view "chipo"
# and count its rows with a query.
chipo.createOrReplaceTempView('chipo')
row_count = spark.sql("Select count(*) from chipo")
row_count.show()

+--------+
|count(1)|
+--------+
|    4622|
+--------+



### Step 6. What is the number of columns in the dataset?

In [6]:
# chipo.columns is the list of column names, so its length is the column count.
len(chipo.columns)

5

### Step 7. Print the name of all the columns.

In [7]:
# List the column names of the DataFrame.
chipo.columns

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']

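### Step 8. How is the dataset indexed?

Unlike a pandas DataFrame, a Spark DataFrame has no row index, so there is nothing to inspect for this step.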

### Step 9. Which was the most-ordered item? 

In [8]:
from pyspark.sql.functions import col

# Total the quantity per item, sort the totals from largest to smallest
# and keep only the top row.
quantity_per_item = (
    chipo.select('quantity', 'item_name')
    .groupBy('item_name')
    .sum('quantity')
)
quantity_per_item.orderBy('sum(quantity)', ascending=False).take(1)

[Row(item_name='Chicken Bowl', sum(quantity)=761)]

### Step 10. For the most-ordered item, how many items were ordered?

In [9]:
# Same ranking as in Step 9, but pull just the summed quantity out of the top Row.
item_totals = chipo.select('quantity', 'item_name').groupBy('item_name').sum('quantity')
top_item = item_totals.orderBy(col('sum(quantity)').desc()).take(1)[0]
top_item['sum(quantity)']

761

### Step 11. What was the most ordered item in the choice_description column?

In [10]:
from pyspark.sql.functions import explode, regexp_replace, split, trim, sum

In [11]:
# Keep only rows that carry choices; rows without any hold the literal string 'NULL'.
choice_desc = chipo.select(chipo.choice_description, chipo.quantity).filter(chipo.choice_description != 'NULL')

# Drop the square brackets, split on commas, and emit one trimmed row per choice.
# The pattern is a raw string: "\[" and "\]" are not valid Python escape
# sequences, so the non-raw literal triggers an invalid-escape warning on
# current Python versions. The regex seen by Spark is unchanged.
choice = (
    choice_desc
    .withColumn('choice_description', regexp_replace('choice_description', r'[\[\]]', ''))
    .select('quantity', split(col('choice_description'), ',').alias('choice_array'))
    .select('quantity', explode(col('choice_array')).alias('choice'))
    .select('quantity', trim(col('choice')).alias('choice'))
)

# Sum the quantities per individual choice and keep the most ordered one.
choice.groupBy('choice').sum('quantity').orderBy(col('sum(quantity)').desc()).take(1)

[Row(choice='Rice', sum(quantity)=2497)]

In [12]:
# Rank the raw (unsplit) choice_description values by total quantity.
# Rows whose description is 'NULL' are not filtered out here.
quantity_per_description = (
    chipo.select(chipo.choice_description, chipo.quantity)
    .groupBy('choice_description')
    .sum('quantity')
)
quantity_per_description.orderBy(col('sum(quantity)').desc()).show(10)

+--------------------+-------------+
|  choice_description|sum(quantity)|
+--------------------+-------------+
|                NULL|         1382|
|         [Diet Coke]|          159|
|              [Coke]|          143|
|            [Sprite]|           89|
|[Fresh Tomato Sal...|           49|
|[Fresh Tomato Sal...|           42|
|[Fresh Tomato Sal...|           40|
|          [Lemonade]|           36|
|[Fresh Tomato Sal...|           36|
|         [Coca Cola]|           32|
+--------------------+-------------+
only showing top 10 rows



### Step 12. How many items were ordered in total?

In [13]:
# `sum` here is the aggregate imported from pyspark.sql.functions,
# which shadows Python's builtin sum in this notebook.
total_quantity = chipo.select(sum(chipo['quantity']))
total_quantity.show()

+-------------+
|sum(quantity)|
+-------------+
|         4972|
+-------------+



### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [14]:
# Look the column up by name rather than by position, so the check does not
# depend on item_price being the fifth column. Yields the same
# (name, type) tuple as indexing chipo.dtypes directly.
chipo.select('item_price').dtypes[0]

('item_price', 'string')

#### Step 13.b. Create a lambda function and change the type of item price

In [15]:
from pyspark.sql.functions import cast
from pyspark.sql.types import FloatType

In [16]:
# Raw prices look like '$2.39 ': remove the dollar sign, trim the surrounding
# whitespace, then cast the remaining text to a float column.
# The pattern is a raw string because "\$" is not a valid Python escape
# sequence; the regex seen by Spark is unchanged.
chipo = chipo.withColumn(
    'item_price',
    trim(regexp_replace('item_price', r'\$', '')).cast(FloatType()),
)

#### Step 13.c. Check the item price type

In [17]:
# Show every (column, type) pair to confirm item_price is no longer a string.
chipo.dtypes

[('order_id', 'int'),
 ('quantity', 'int'),
 ('item_name', 'string'),
 ('choice_description', 'string'),
 ('item_price', 'float')]

### Step 14. How much was the revenue for the period in the dataset?

In [18]:
from pyspark.sql.functions import sum

In [19]:
# Revenue is computed as the sum of quantity * item_price over all rows. The
# aggregate produces a single row, so take(10) returns a one-element list.
# NOTE(review): in the sample rows a quantity-2 Chicken Bowl is priced at
# $16.98, which suggests item_price may already be the line total. If so,
# multiplying by quantity overstates revenue and sum(item_price) would be the
# correct figure — confirm against the data before relying on this number.
chipo.select(sum(col('quantity')*col('item_price'))).take(10)

[Row(sum((quantity * item_price))=39237.0197327137)]

### Step 15. How many orders were made in the period?

In [20]:
# An order_id repeats once per line item, so de-duplicate before counting.
distinct_orders = chipo.select('order_id').distinct()
distinct_orders.count()

1834

### Step 16. What is the average revenue amount per order?

In [21]:
from pyspark.sql.functions import avg

In [22]:
# Solution 1
# Compute quantity * item_price per line, total it per order, then average the
# per-order totals. The quoted column names are the ones Spark generates for
# the unaliased expressions.
line_revenue = chipo.select(col('order_id'), col('quantity') * col('item_price'))
order_revenue = line_revenue.groupBy('order_id').sum('(quantity * item_price)')
order_revenue.select(avg(col('sum((quantity * item_price))'))).show()


+---------------------------------+
|avg(sum((quantity * item_price)))|
+---------------------------------+
|               21.394231042919138|
+---------------------------------+



In [23]:
# Solution 2
# Register the view again: the earlier view was created from chipo before
# item_price was converted, and chipo has since been rebound to a new DataFrame.
chipo.createOrReplaceTempView('chipo')
# The CTE totals quantity*item_price per order_id; the outer query averages
# those per-order totals.
spark.sql('with cte as \
          (select \
              sum(quantity*item_price) as sum_price, \
              order_id \
          from chipo group by order_id) \
          select avg(sum_price) from cte').show()


+------------------+
|    avg(sum_price)|
+------------------+
|21.394231042919138|
+------------------+



### Step 17. How many different items are sold?

In [24]:
# Count the unique item names.
unique_items = chipo.select('item_name').distinct()
unique_items.count()

50