## Understand Data

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this address:
https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv

### Step 3. Assign it to a variable called chipo.

In [37]:
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'
df = pd.read_csv(url, sep='\t')
chipo = df

### Step 4. See the first 10 entries

In [38]:
chipo[:10]

## or chipo.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


### Step 5. What is the number of observations in the dataset?

In [4]:
chipo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


In [5]:
len(chipo)

4622

In [6]:
chipo.shape[0]

4622

In [7]:
chipo.index

RangeIndex(start=0, stop=4622, step=1)

### Step 6. What is the number of columns in the dataset?

In [8]:
chipo.shape[1]

5

In [9]:
len(chipo.columns)

5

In [10]:
chipo.columns.size

5

### Step 7. Print the name of all the columns.

In [11]:
for i in chipo:
    print(i)

order_id
quantity
item_name
choice_description
item_price


In [12]:
print([i for i in chipo])

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']


In [13]:
chipo.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

### Step 8. How is the dataset indexed?

In [14]:
chipo.index

RangeIndex(start=0, stop=4622, step=1)

In [15]:
chipo

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",$11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",$11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",$11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",$8.75


### Step 9. Which was the most-ordered item?

In [16]:
chipo_2 = chipo.groupby('item_name')
chipo_2 = chipo_2.sum()
chipo_2 = chipo_2.sort_values(['quantity'], ascending=False)
chipo_2[:1]

Unnamed: 0_level_0,order_id,quantity
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicken Bowl,713926,761


### Step 11. What was the most ordered item in the choice_description column?

In [17]:
chipo_2 = chipo.groupby('choice_description')
chipo_2 = chipo_2.sum()
chipo_2 = chipo_2.sort_values(['quantity'], ascending=False)
chipo_2[:1]

Unnamed: 0_level_0,order_id,quantity
choice_description,Unnamed: 1_level_1,Unnamed: 2_level_1
[Diet Coke],123455,159


### Step 12. How many items were orderd in total?

In [41]:
total_items = chipo.quantity.sum()
total_items

4972

### Step 13. Turn the item price into a float
#### Step 13.a. Check the item price type

In [68]:
chipo.item_price.dtype 
# datatype = object

dtype('O')

#### Step 13.b. Create a lambda function and change the type of item price

In [69]:
chipo.item_price = chipo.item_price.apply(lambda x: float(x[1:-1]))
chipo

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
...,...,...,...,...,...
4617,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Sour ...",11.75
4618,1833,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Sour Cream, Cheese...",11.75
4619,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Pinto...",11.25
4620,1834,1,Chicken Salad Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Lettu...",8.75


#### Step 13.c. Check the item price type

In [70]:
chipo.item_price.dtype 
# datatype = object

dtype('float64')

### Step 14. How much was the revenue for the period in the dataset?

In [45]:
revenue = (chipo['quantity'] * chipo['item_price']).sum()
revenue

39237.02

### Step 15. How many orders were made in the period?

In [30]:
total_orders = chipo.order_id.max()
total_orders

1834

In [46]:
orders = chipo.order_id.value_counts().count()
orders

1834

### Step 16. What is the average revenue amount per order?

In [50]:
chipo['revenue'] = chipo['quantity'] * chipo['item_price']
grouped = chipo.groupby('order_id').sum()
grouped.mean()['revenue']

21.394231188658654

In [51]:
chipo.groupby('order_id').sum().mean()['revenue']

21.394231188658654

### Step 17. How many different items are sold?

In [63]:
chipo.groupby('item_name').sum().shape[0]

50

In [61]:
chipo.item_name.value_counts().count()

50

## Filter & Sort Data

### Step 1. How many products cost more than $10.00?

In [117]:
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'
chipo = pd.read_csv(url, sep='\t')

# convert prices to float
prices = [float(price[1:]) for price in chipo['item_price']]

# reassign the column with the cleaned prices
chipo.item_price = prices

# remove duplicate item names and descriptions
chipo_filtered = chipo.drop_duplicates(['item_name', 'quantity'])

# select only the products with quantity equals to 1
chipo_single = chipo_filtered[chipo_filtered.quantity == 1]

# amount of products over 10
chipo_single.loc[chipo_single['item_price'] > 10.00].item_price.count()

12

### Step 2. What is the price of each item?
print a data frame with only two columns item_name and item_price

In [140]:
chipo_item_price = chipo_single[['item_name', 'item_price']]
chipo_item_price.sort_values('item_price', ascending=False)

Unnamed: 0,item_name,item_price
606,Steak Salad Bowl,11.89
1229,Barbacoa Salad Bowl,11.89
1132,Carnitas Salad Bowl,11.89
7,Steak Burrito,11.75
168,Barbacoa Crispy Tacos,11.75
39,Barbacoa Bowl,11.75
738,Veggie Soft Tacos,11.25
186,Veggie Salad Bowl,11.25
62,Veggie Bowl,11.25
57,Veggie Burrito,11.25


### Step 3. Sort by the name of the item

In [153]:
chipo.sort_values('item_name')

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
3389,1360,2,6 Pack Soft Drink,[Diet Coke],12.98
341,148,1,6 Pack Soft Drink,[Diet Coke],6.49
1849,749,1,6 Pack Soft Drink,[Coke],6.49
1860,754,1,6 Pack Soft Drink,[Diet Coke],6.49
2713,1076,1,6 Pack Soft Drink,[Coke],6.49
...,...,...,...,...,...
2384,948,1,Veggie Soft Tacos,"[Roasted Chili Corn Salsa, [Fajita Vegetables,...",8.75
781,322,1,Veggie Soft Tacos,"[Fresh Tomato Salsa, [Black Beans, Cheese, Sou...",8.75
2851,1132,1,Veggie Soft Tacos,"[Roasted Chili Corn Salsa (Medium), [Black Bea...",8.49
1699,688,1,Veggie Soft Tacos,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",11.25


### Step 4. What was the quantity of the most expensive item ordered?

In [151]:
chipo.sort_values('item_price', ascending=False).head(1)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
3598,1443,15,Chips and Fresh Tomato Salsa,,44.25


### Step 8. How many times was a Veggie Salad Bowl ordered?

In [164]:
len(chipo.loc[chipo.item_name == 'Veggie Salad Bowl'])

18

### Step 9. How many times did someone order more than one Canned Soda?

In [163]:
len(chipo.loc[(chipo.item_name == 'Canned Soda') & (chipo.quantity >= 2)])

20