# Pandas


In [1]:
import pandas as pd
# Load the wine reviews; the first CSV column supplies the row labels.
reviews = pd.read_csv(
    "./input/winemag-data-130k-v2.csv",
    index_col=0,
)
# Keep displayed frames and Series short: at most five rows are shown.
pd.options.display.max_rows = 5


In [2]:
# Preview the first five records of the loaded frame.
reviews.head(n=5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
# The whole `description` column as a Series.
desc = reviews.description
# The description stored under row label 0.
first_description = reviews.loc[0, 'description']
# The first record of the frame, selected by position.
first_row = reviews.iloc[0]
# The descriptions of the first ten records, selected by position.
first_descriptions = reviews.description.iloc[:10]
# The records whose index labels are 1, 2, 3, 5 and 8.
indices = [1, 2, 3, 5, 8]
sample_reviews = reviews.loc[indices, :]


In [4]:
# Rows labelled 0, 1, 10 and 100, restricted to the four location columns.
indices = [0, 1, 10, 100]
cols = ['country', 'province', 'region_1', 'region_2']
df = reviews.loc[indices, cols]

In [5]:
# Column positions 0 and 11; in the `reviews.head()` output these are the
# `country` and `variety` columns.
cols_idx = [0, 11]
# First 100 records (by position), then only those two columns.
df = reviews.iloc[:100].iloc[:, cols_idx]

In [6]:
# Every review whose `country` is exactly 'Italy'.
italian_wines = reviews.loc[reviews['country'].eq('Italy')]


In [7]:
# Reviews scoring at least 95 points for wines from Australia or New Zealand.
top_oceania_wines = reviews[
    reviews['points'].ge(95)
    & reviews['country'].isin(['Australia', 'New Zealand'])
]


In [21]:
# Romanian wines that scored at least 91 points.
top_romania_wines = reviews[
    reviews['country'].eq('Romania') & reviews['points'].ge(91)
]
top_romania_wines

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
5761,Romania,This late harvest Sauvignon Blanc is honey gol...,Conocul Ambrozy,92,28.0,Recas,,,Jeff Jenssen,@worldwineguys,Cramele Recas 2013 Conocul Ambrozy Sauvignon B...,Sauvignon Blanc,Cramele Recas
63159,Romania,This late harvest Sauvignon Blanc is honey gol...,Conocul Ambrozy,92,28.0,Recas,,,Jeff Jenssen,@worldwineguys,Cramele Recas 2013 Conocul Ambrozy Sauvignon B...,Sauvignon Blanc,Cramele Recas


In [23]:
# Each price expressed as its distance from the mean price; rows with no
# price stay NaN.
centered_price = reviews['price'].sub(reviews['price'].mean())
centered_price

0               NaN
1        -20.363389
            ...    
129969    -3.363389
129970   -14.363389
Name: price, Length: 129971, dtype: float64

In [8]:
# Index label of the review with the highest points-to-price ratio.
bargain_idx = reviews['points'].div(reviews['price']).idxmax()
# Title of that best-value wine.
bargain_wine = reviews['title'].loc[bargain_idx]
bargain_wine

'Bandit NV Merlot (California)'

In [9]:
# Number of descriptions containing each word as a substring.
n_trop = reviews['description'].apply(lambda text: "tropical" in text).sum()
n_fruity = reviews['description'].apply(lambda text: "fruity" in text).sum()
# Both counts gathered into one Series keyed by the descriptor word.
descriptor_counts = pd.Series({'tropical': n_trop, 'fruity': n_fruity})

In [10]:
def stars(row):
    """Convert one review row into a star rating from 1 to 3.

    A row whose ``country`` is 'Canada' always receives 3 stars, whatever
    its score. Any other row is rated on ``points``: 95 or more gives 3
    stars, 85 up to 94 gives 2 stars, and anything else gives 1 star.

    Parameters
    ----------
    row : object exposing ``country`` and ``points`` attributes.

    Returns
    -------
    int
        The star rating: 1, 2 or 3.
    """
    # The country check comes first, so 'Canada' wins regardless of points.
    if row.country == 'Canada' or row.points >= 95:
        return 3
    return 2 if row.points >= 85 else 1

# Rate every review by passing each row of the frame to `stars`.
star_ratings = reviews.apply(stars, axis=1)

In [11]:
# Number of reviews written by each reviewer, keyed by Twitter handle.
reviews_written = reviews.groupby(by='taster_twitter_handle').size()
# Highest score on offer at each price, ordered by ascending price.
best_rating_per_price = (
    reviews.groupby(by='price')
    .points
    .max()
    .sort_index()
)


In [None]:
# Lowest and highest price recorded for each wine variety.
# The aggregations are named by string: passing the Python builtins `min`
# and `max` to `agg` is deprecated in recent pandas releases and emits a
# FutureWarning. The resulting columns are still called 'min' and 'max'.
price_extremes = reviews.groupby('variety').price.agg(['min', 'max'])
# Varieties ordered from most to least expensive: by minimum price first,
# with the maximum price breaking ties.
sorted_varieties = price_extremes.sort_values(by=['min', 'max'], ascending=False)


In [None]:
# Average score handed out by each reviewer: a Series indexed by
# `taster_name` whose values are the mean of that reviewer's `points`.
reviewer_mean_ratings = reviews.groupby(by='taster_name')['points'].mean()


In [None]:
# How often each (country, variety) pairing occurs, most common first.
# The result is a Series indexed by a (country, variety) MultiIndex, so a
# Pinot Noir produced in the US is counted under ("US", "Pinot Noir").
country_variety_counts = (
    reviews.groupby(by=['country', 'variety'])
    .size()
    .sort_values(ascending=False)
)

In [13]:
# The scores converted to strings.
point_strings = reviews['points'].astype(str)
# Reviews that carry no price, and how many of them there are.
missing_price_reviews = reviews.loc[reviews['price'].isna()]
n_missing_prices = missing_price_reviews.shape[0]

In [14]:
# Review counts per `region_1`, largest first; rows with no region are
# tallied under 'Unknown'.
reviews_per_region = (
    reviews['region_1']
    .fillna('Unknown')
    .value_counts()
    .sort_values(ascending=False)
)


In [None]:
# Copy of the frame with `region_1` renamed to `region` and `region_2`
# renamed to `locale`; `reviews` itself is left unchanged.
renamed = reviews.rename(
    columns={'region_1': 'region', 'region_2': 'locale'},
)

