In [1]:
import os
import pandas as pd
import matplotlib as mlt
from pathlib import Path

In [2]:
 #designate the csv file data path
data_load = Path ('../Resources/wine_data.csv')

# uniform language
wine_df = pd.read_csv(data_load,encoding="ISO-8859-1")

# removing duplicates
wine_df = wine_df.drop(["Unnamed: 7", "Unnamed: 8", "Unnamed: 9", "Unnamed: 10", "Unnamed: 11", "Unnamed: 12"], axis=1)
wine_df = wine_df.drop_duplicates()

# rennaming and capitalization of data
wine_df.rename(columns={'country':'Country'}, inplace=True)
wine_df.rename(columns={'description':'Description'}, inplace=True)
wine_df.rename(columns={'points':'Points'}, inplace=True)
wine_df.rename(columns={'price ($)':'Price ($)'}, inplace=True)
wine_df.rename(columns={'region':'Region'}, inplace=True)
wine_df.rename(columns={'title':'Title'}, inplace=True)
wine_df.rename(columns={'variety':'Variety'}, inplace=True)

#change types from objects to strings, anything that is a number to a float64/int
wine_df['Country']= wine_df['Country'].astype('string')
wine_df['Description']= wine_df['Description'].astype('string')
wine_df['Region']= wine_df['Region'].astype('string')
wine_df['Title']= wine_df['Title'].astype('string')
wine_df['Variety']= wine_df['Variety'].astype('string')
wine_df['Price ($)']=pd.to_numeric(wine_df['Price ($)'],errors='coerce')
wine_df['Points']=pd.to_numeric(wine_df['Points'],errors='coerce')

#drop unnecessary values
wine_df = wine_df.dropna(axis=0)

#print first five rows of data frame
wine_df.head()



  wine_df = pd.read_csv(data_load,encoding="ISO-8859-1")


Unnamed: 0,Country,Description,Points,Price ($),Region,Title,Variety
2,US,"Tart and snappy, the flavors of lime flesh and...",87.0,14.0,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris
3,US,"Pineapple rind, lemon pith and orange blossom ...",87.0,13.0,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling
4,US,"Much like the regular bottling from 2012, this...",87.0,65.0,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir
5,Spain,Blackberry and raspberry aromas show a typical...,87.0,15.0,Navarra,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot
6,Italy,"Here's a bright, informal red that opens with ...",87.0,16.0,Vittoria,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato


In [3]:
# df types
wine_df.dtypes

Country        string[python]
Description    string[python]
Points                float64
Price ($)             float64
Region         string[python]
Title          string[python]
Variety        string[python]
dtype: object

In [4]:
# df count
wine_df.count()

Country        93576
Description    93576
Points         93576
Price ($)      93576
Region         93576
Title          93576
Variety        93576
dtype: int64

In [5]:
#find the different columns that are in data frame
wine_df.columns 

Index(['Country', 'Description', 'Points', 'Price ($)', 'Region', 'Title',
       'Variety'],
      dtype='object')

In [7]:
#isolate the top 10 wines based on variety data
df= wine_df['Variety'].value_counts()
threshold = 2500
drop_df = df[df > threshold].index

clean_wine_df = wine_df[wine_df['Variety'].isin(drop_df)==False].reset_index(drop=True)

clean_wine_df

Unnamed: 0,Country,Description,Points,Price ($),Region,Title,Variety
0,US,"Tart and snappy, the flavors of lime flesh and...",87.0,14.0,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris
1,US,"Pineapple rind, lemon pith and orange blossom ...",87.0,13.0,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling
2,Spain,Blackberry and raspberry aromas show a typical...,87.0,15.0,Navarra,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot
3,Italy,"Here's a bright, informal red that opens with ...",87.0,16.0,Vittoria,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato
4,France,This dry and restrained wine offers spice in p...,87.0,24.0,Alsace,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer
...,...,...,...,...,...,...,...
42650,Italy,"Blackberry, cassis, grilled herb and toasted a...",90.0,40.0,Sicilia,Cusumano 2012 Sàgana Tenuta San Giacomo Nero d...,Nero d'Avola
42651,France,"While it's rich, this beautiful dry wine also ...",90.0,28.0,Alsace,Domaine Rieflé-Landmann 2013 Seppi Landmann Va...,Pinot Gris
42652,France,Well-drained gravel soil gives this wine its c...,90.0,30.0,Alsace,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer
42653,France,"A dry style of Pinot Gris, this is crisp with ...",90.0,32.0,Alsace,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris


In [8]:
#change data types to strings and integers
clean_wine_df['Country']= clean_wine_df['Country'].astype('string')
clean_wine_df['Description']= clean_wine_df['Description'].astype('string')
clean_wine_df['Region']= clean_wine_df['Region'].astype('string')
clean_wine_df['Title']= clean_wine_df['Title'].astype('string')
clean_wine_df['Variety']= clean_wine_df['Variety'].astype('string')
clean_wine_df['Price ($)']=pd.to_numeric(clean_wine_df['Price ($)'],errors='coerce')
clean_wine_df['Points']=pd.to_numeric(clean_wine_df['Points'],errors='coerce')

clean_wine_df.dtypes

Country        string[python]
Description    string[python]
Points                float64
Price ($)             float64
Region         string[python]
Title          string[python]
Variety        string[python]
dtype: object

In [9]:
#isolate the wines that have points of 90 or avbove
high_points = clean_wine_df.loc[(clean_wine_df['Points']>= 90)]

high_points['Points'].value_counts()

Points
90.0     4843
91.0     3348
92.0     2688
93.0     1734
94.0     1010
95.0      433
96.0      148
97.0       62
98.0       24
99.0       11
100.0       8
Name: count, dtype: int64

In [10]:
#find the countries that produced the 90 or above wines
high_points['Country'].value_counts()

Country
US           4823
Italy        4114
France       3074
Spain        1162
Australia     578
Argentina     487
Canada         71
Name: count, dtype: Int64

In [11]:
#create a groupby by country of the high rated wines

high_points_df = high_points.groupby('Country')