# Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv).

### Step 3. Assign it to a variable called chipo.

In [2]:
chipo = pd.read_csv("https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv", sep='\t')
chipo.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


### Step 4. See the first 10 entries

In [4]:
#chipo.head(10)
chipo.iloc[0:10] # iloc excludes the end index where as loc includes end label
#chipo.iloc[0:10,:] -> [start_row:end_row , start_column:end_column]
#chipo.loc[0:9] # -> includes end labels

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


### Step 5. What is the number of observations in the dataset?

In [8]:
print("Number of observations: ", chipo.shape[0])
#print(chipo.count())
#print(len(chipo))
#print(chipo.info())
#print(chipo.describe())


Number of observations:  4622


### Step 6. What is the number of columns in the dataset?

In [None]:
print("Number of colums: ", chipo.shape[1])
print("Number of colums: ", len(chipo.columns))

Number of colums:  5
Number of colums:  5


### Step 7. Print the name of all the columns.

In [None]:
print("Colums are: ", chipo.columns)

Colums are:  Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')


### Step 8. How is the dataset indexed?

In [None]:
print(chipo.index)

RangeIndex(start=0, stop=4622, step=1)


### Step 9. Which was the most ordered item?

In [None]:
#chipo.groupby("item_name").agg("sum")
chipo["item_name"].value_counts().index[0] # most ordered item
chipo["item_name"].value_counts(ascending=True).index[0] # least ordered item
chipo["item_name"].value_counts(normalize=True).index[0] # this gives propotianate

'Chicken Bowl'

In [None]:
chipo.groupby("item_name").sum().sort_values(["quantity"], ascending=False).index[0]

  chipo.groupby("item_name").sum().sort_values(["quantity"], ascending=False).index[0]


'Chicken Bowl'

In [None]:
#print(chipo.groupby("item_name").sum())
#print(chipo.groupby("item_name")["quantity"].sum().sort(ascending=False))
chipo.groupby("item_name")["quantity"].sum().sort_values(ascending=False)

item_name
Chicken Bowl                             761
Chicken Burrito                          591
Chips and Guacamole                      506
Steak Burrito                            386
Canned Soft Drink                        351
Chips                                    230
Steak Bowl                               221
Bottled Water                            211
Chips and Fresh Tomato Salsa             130
Canned Soda                              126
Chicken Salad Bowl                       123
Chicken Soft Tacos                       120
Side of Chips                            110
Veggie Burrito                            97
Barbacoa Burrito                          91
Veggie Bowl                               87
Carnitas Bowl                             71
Barbacoa Bowl                             66
Carnitas Burrito                          60
Steak Soft Tacos                          56
6 Pack Soft Drink                         55
Chips and Tomatillo Red Chili Salsa       50


### Step 10. How many items were ordered?

In [None]:
print(chipo["quantity"].sum())
print(chipo.quantity.sum())

4972
4972


In [None]:
print(chipo[["quantity", "order_id"]])

      quantity  order_id
0            1         1
1            1         1
2            1         1
3            1         1
4            2         2
...        ...       ...
4617         1      1833
4618         1      1833
4619         1      1834
4620         1      1834
4621         1      1834

[4622 rows x 2 columns]


### Step 11. What was the most ordered item in the choice_description column?

In [65]:
chipo = pd.read_csv("https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv", sep='\t')
#chipo.head()
chipo=chipo["choice_description"].map(lambda x : list(str(x).replace("[","").replace("]","").split(",")))
#chipo.head()
print(chipo.apply(pd.Series).unstack().reset_index(drop=True).dropna().value_counts().index[0])

 Rice


### Step 12. How many items were orderd in total?

In [10]:
print(chipo.quantity.sum())

4972


### Step 13. Turn the item price into a float

In [13]:
chipo = pd.read_csv("https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv", sep='\t')
print(chipo.dtypes)
chipo["item_price"] = chipo["item_price"].apply(lambda x: float(x.replace("$","")))
#chipo["item_price"] =
print(chipo.head())
print(chipo.dtypes)


order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object
   order_id  quantity                              item_name  \
0         1         1           Chips and Fresh Tomato Salsa   
1         1         1                                   Izze   
2         1         1                       Nantucket Nectar   
3         1         1  Chips and Tomatillo-Green Chili Salsa   
4         2         2                           Chicken Bowl   

                                  choice_description  item_price  
0                                                NaN        2.39  
1                                       [Clementine]        3.39  
2                                            [Apple]        3.39  
3                                                NaN        2.39  
4  [Tomatillo-Red Chili Salsa (Hot), [Black Beans...       16.98  
order_id                int64
quantity                

### Step 14. How much was the revenue for the period in the dataset?

In [19]:
chipo["amount"]=chipo["quantity"]*chipo["item_price"]
print(chipo["amount"].sum())

39237.02


### Step 15. How many orders were made in the period?

In [None]:
print(chipo.order_id.count())
print(len(chipo["order_id"].unique()))

4622
1834


### Step 16. What is the average amount per order?

In [23]:
chipo = pd.read_csv("https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv", sep='\t')
chipo["item_price"] = chipo["item_price"].apply(lambda x: float(x.replace("$","")))
chipo["amount"] = chipo["quantity"] * chipo["item_price"]
chipo[["order_id", "amount"]].groupby("order_id").mean("amount")

Unnamed: 0_level_0,amount
order_id,Unnamed: 1_level_1
1,2.890000
2,33.960000
3,6.335000
4,10.500000
5,6.850000
...,...
1830,11.500000
1831,4.300000
1832,6.600000
1833,11.750000


### Step 17. How many different items are sold?

In [29]:
#len(chipo["item_name"].unique())
chipo["item_name"].nunique()

50