# Pandas

Download the `dataset.tsv` file from Canvas.
### 1. Import the libraries

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

### 2. Import the dataset

In [None]:
# Load the tab-separated dataset into the working DataFrame.
df = pd.read_csv("dataset.tsv", delimiter="\t")

### 3. Print the first 50 entries

In [None]:
# Show the first 50 rows of the dataset.
df.head(50)

### 4. Print the number of datapoints (rows) in the dataset?

In [None]:
# The first entry of the shape tuple is the number of rows.
df.shape[0]

### 5. Print the number of columns in the dataset?

In [None]:
# The second entry of the shape tuple is the number of columns.
df.shape[1]

### 7. Print the names of the columns

In [None]:
# Collect the column labels into a plain list and print them one per line.
names = df.columns.tolist()
for name in names:
    print(name)

### 8. Print the most ordered product and its count

In [None]:
# Total quantity ordered per product. Only `quantity` is aggregated: summing
# every column also adds up order_id, and on pandas >= 2.0 (where numeric_only
# defaults to False) .sum() would try to add up the text columns as well.
orders_by_product = df.groupby('product_name')[['quantity']].sum()

# Largest total first, so the first row is the most ordered product.
popular_products_sorted = orders_by_product.sort_values('quantity', ascending=False)
popular_products_sorted.head(1)

### 9. Print the number of products ordered?

In [None]:
# Grand total of items ordered: add up the per-product quantity totals.
no_orders = orders_by_product['quantity'].sum()
no_orders

### 10. Print the most ordered product in the option_description column?

In [None]:
def dataframe_from_series_of_lists(series):
    """Flatten a series of bracketed, comma-separated strings into one DataFrame.

    Each entry is expected to be a string such as ``'[Rice, [Beans, Salsa]]'``.
    Every ``[`` and ``]`` is removed (so nesting is flattened), the remainder
    is split on ``', '``, and the resulting items are stacked into a single
    column.

    Args:
        series: Iterable of strings (e.g. a pandas Series whose missing
            values have already been dropped).

    Returns:
        pandas.DataFrame: One column labelled ``0`` with one row per item.
        The index holds each item's position within its source string, so
        index labels repeat across entries. An empty DataFrame is returned
        when ``series`` yields no entries.
    """
    items = []
    positions = []
    for row in series:
        parts = row.replace('[', '').replace(']', '').split(', ')
        items.extend(parts)
        positions.extend(range(len(parts)))

    if not items:
        return pd.DataFrame()

    # Build the frame once: DataFrame.append was removed in pandas 2.0, and
    # growing a frame entry by entry re-copied all earlier rows each time.
    return pd.DataFrame(items, index=positions)

# Flatten every non-missing option_description entry into individual options,
# count how often each one occurs, and show the most frequent.
options = df['option_description'].dropna()
df_options = dataframe_from_series_of_lists(options)
popular_options_sorted = df_options.value_counts()
popular_options_sorted.head(1)

### 11. Print the number of products ordered in total

In [None]:
# Sum the per-option occurrence counts, i.e. the total number of option entries
# parsed from option_description. NOTE(review): this counts options, not
# product quantities — confirm it is what the question intends.
popular_options_sorted.sum()

### 12. Cast the product price to float type

In [None]:
# Remove dollar signs and commas from the price text, then convert to float.
# The pattern is a raw string so that `\$` reaches the regex engine intact
# instead of being parsed as an (invalid) Python string escape.
df['product_price'] = df['product_price'].replace(r'[\$,]', '', regex=True).astype(float)

### 13. Print the quantity of the costliest product ordered?


In [None]:
# Order the rows from highest to lowest price and read the quantity of the
# first (most expensive) row.
price_sorted_df = df.sort_values(by='product_price', ascending=False)
price_sorted_df.head(1)['quantity']

### 14. What was the revenue for the whole period in the dataset?

In [None]:
# Multiply price by quantity for every row and add the results up.
# Series.sum() keeps the reduction vectorised (and skips missing values)
# instead of looping over the values with the builtin sum().
(df['quantity'] * df['product_price']).sum()

### 15. Print the total number of orders in the whole period

In [None]:
# Number of distinct order ids in the dataset.
df['order_id'].nunique()

### 16. Print the average price per order?

In [None]:
# Per-order mean of the numeric columns. numeric_only=True is needed on
# pandas >= 2.0, where mean() no longer silently skips text columns such as
# product_name and raises a TypeError instead.
mean_order_price = df.groupby('order_id').mean(numeric_only=True)

# Only the price average is of interest here, so hide the quantity average.
mean_order_price.drop(columns='quantity')

### 17. How many unique products were sold?

In [None]:
# Number of distinct product names in the dataset.
df['product_name'].nunique()

### 18. Print the number of times Veggie Burrito was ordered

In [None]:
# Keep only the rows whose product name is 'Veggie Burrito'.
veggie_df = df.loc[df['product_name'] == 'Veggie Burrito']
# Count the distinct order ids among those rows.
veggie_df['order_id'].nunique()

### 19. Print the products that cost more than $20.00


In [None]:
# Select the rows whose price is strictly above 20.00.
df.loc[df['product_price'] > 20.00]

### 20. Plot a histogram for the prices of top 7 products bought

In [None]:
# One price histogram for each of the 7 most ordered products.
# reset_index() turns the product_name index back into a regular column.
tmp = popular_products_sorted.reset_index()

fig = plt.figure(figsize=(15, 12))
# Subplot positions in the 3x3 grid are 1-based, hence start=1.
for i, product in enumerate(tmp.product_name[:7], start=1):
    ax = fig.add_subplot(3, 3, i)
    ax.set_title(product)
    # Draw on the axes object explicitly rather than relying on pyplot's
    # implicit "current axes" state.
    ax.hist(df[df.product_name == product].product_price)

### 21. Draw a scatterplot of the number of products ordered against the transaction price
#### Tips: Price should be in the X-axis and products ordered in the Y-axis

In [None]:
# Dark grid theme, then quantity (y) against price (x). lmplot draws the
# scatter points and, by default, overlays a fitted linear regression line.
sns.set_style(style="darkgrid")
sns.lmplot(data=df, x='product_price', y='quantity')