# Dataframe Manipulation Warmup

In [1]:
import numpy as np
import pandas as pd

np.random.seed(406)

n = 5000
df = pd.DataFrame({
    'favorite_animal': np.random.choice(['cat', 'dog', 'frog', 'lemur', 'panda'], n),
    'favorite_vegetable': np.random.choice(['brussel sprouts', 'potato', 'squash'], n),
    'favorite_fruit': np.random.choice(['banana', 'apple', 'blueberries'], n),
    'wears_glasses': np.random.choice(['yes', 'no'], n),
    'netflix_consumption': np.random.normal(10, 2, n),
    'open_browser_tabs': np.random.randint(2, 90, n),
})

- What is the highest amount of netflix consumption? `17.535`
- How many people wear glasses? What percentage of people is this? `2555`, `.511`
- How many people's favorite animal is a dog? `1002`
- What is the most common favorite animal? `lemur`
- What is the average netflix consumption for people that prefer brussel
  sprouts? `10.008`
- What is the most common favorite fruit for people who wear glasses and have
  more than 40 open browser tabs? `blueberries`
- What percentage of people have a netflix consumption lower than 7? `.0716`
- What is the average netflix consumption for people with less than 30 open
  browser tabs? `9.91935`
- How many people *don't* wear glasses, have a favorite animal of a panda, have
  a favorite fruit of blueberries, and have more than 60 open browser tabs? What
  is the median netflix consumption for this group? What is the most common
  favorite vegetable for this group? `46`, `10.455`, `potato`
- What is the least popular combination of favorite fruit and vegetable? `apple` and `potato`
- Which combination of favorite animal and wearing glasses has the highest average
  netflix consumption? people that wear glasses and prefer pandas
- **Bonus**: for each of the above questions, what kind of visualization would
  be the most effective in conveying your answer?

In [3]:
# What is the highest amount of netflix consumption? 17.535
df.netflix_consumption.max()

17.534818515438925

In [17]:
# How many people wear glasses? What percentage of people is this? 2555, .511
print((df.wears_glasses == "yes").sum())
print((df.wears_glasses == "yes").sum() / df.shape[0])

2555
0.511


In [18]:
# How many people's favorite animal is a dog? 1002
(df.favorite_animal == "dog").sum()

1002

In [27]:
# What is the most common favorite animal? lemur
df.favorite_animal.value_counts().head(1)

lemur    1028
Name: favorite_animal, dtype: int64

In [43]:
# What is the average netflix consumption for people that prefer brussel sprouts? 10.008
df.groupby("favorite_vegetable").netflix_consumption.mean().head(1)

favorite_vegetable
brussel sprouts    10.008472
Name: netflix_consumption, dtype: float64

In [105]:
# What is the most common favorite fruit for people who wear glasses and have more than 40 open browser tabs? blueberries
glasses_wearers = df[df.wears_glasses == "yes"]
internet_fiends = glasses_wearers[glasses_wearers.open_browser_tabs > 40]
internet_fiends.favorite_fruit.value_counts().nlargest(1)

blueberries    498
Name: favorite_fruit, dtype: int64

In [83]:
# What percentage of people have a netflix consumption lower than 7? .0716
df[df.netflix_consumption < 7].count() / df.shape[0]

favorite_animal        0.0716
favorite_vegetable     0.0716
favorite_fruit         0.0716
wears_glasses          0.0716
netflix_consumption    0.0716
open_browser_tabs      0.0716
dtype: float64

In [85]:
# What is the average netflix consumption for people with less than 30 open browser tabs? 9.91935
internet_noobs = df[df.open_browser_tabs < 30]
internet_noobs.netflix_consumption.mean()

9.91935736918227

In [111]:
# How many people don't wear glasses,
# have a favorite animal of a panda,
# have a favorite fruit of blueberries,
# and have more than 60 open browser tabs? 46
# What is the median netflix consumption for this group? 10.455
# What is the most common favorite vegetable for this group? potato
good_eyes = df[df.wears_glasses == "no"]
panda_lovers = good_eyes[good_eyes.favorite_animal == "panda"]
berry_people = panda_lovers[panda_lovers.favorite_fruit == "blueberries"]
esoteric_mofos = berry_people[berry_people.open_browser_tabs > 60]
print(esoteric_mofos.shape[0])
print(esoteric_mofos.netflix_consumption.median())
print(esoteric_mofos.favorite_vegetable.value_counts().nlargest(1))

46
10.45479760071613
potato    19
Name: favorite_vegetable, dtype: int64


In [140]:
# What is the least popular combination of favorite fruit and vegetable? apple and potato
pd.crosstab(index=df.favorite_vegetable, columns=df.favorite_fruit)

favorite_fruit,apple,banana,blueberries
favorite_vegetable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
brussel sprouts,565,576,555
potato,512,570,560
squash,555,524,583


In [147]:
# Which combination of favorite animal and wearing glasses has the
# highest average netflix consumption? people that wear glasses and
# prefer pandas
pd.crosstab(index=df.favorite_animal, columns=df.wears_glasses, values=df.netflix_consumption, aggfunc="mean")

wears_glasses,no,yes
favorite_animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,9.846183,9.884685
dog,9.933246,10.087352
frog,9.962311,9.83474
lemur,10.024557,10.010196
panda,9.946293,10.092273
