In [390]:
import pandas as pd
import numpy as np

In [463]:
df = pd.read_csv('wine_scraped_dataset.csv', index_col=0)

## Assess 

In [464]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23822 entries, 0 to 23821
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  23822 non-null  object 
 1   price         23822 non-null  object 
 2   varietal      23822 non-null  object 
 3   origin        23819 non-null  object 
 4   rating        23822 non-null  float64
 5   rating_count  23822 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 1.3+ MB


In [465]:
df.head()

Unnamed: 0,product_name,price,varietal,origin,rating,rating_count
0,Dom Perignon Vintage with Gift Box 2010,199,Vintage Sparkling Wine,"Champagne, France",4.5,42
1,Veuve Clicquot Yellow Label Brut,59,Non-Vintage Sparkling Wine,"Champagne, France",4.4,1138
2,Duckhorn Napa Valley Cabernet Sauvignon 2017,78,Cabernet Sauvignon,"Napa Valley, California",4.3,62
3,Caymus Special Selection Cabernet Sauvignon 2016,180,Cabernet Sauvignon,"Napa Valley, California",4.6,108
4,Quintessa 2016,199,Cabernet Sauvignon,"Rutherford, Napa Valley, California",4.7,73


In [466]:
df['varietal'].sort_values().unique()

array(['Agiorgitiko', 'Aglianico', 'Albarino', 'Alicante Bouschet',
       'Arneis', 'Assyrtiko', 'Baga', 'Barbera', 'Blaufrankisch', 'Bobal',
       'Bordeaux Red Blends', 'Bordeaux White Blends', 'Cabernet Franc',
       'Cabernet Sauvignon', 'Carignan', 'Carmenere', 'Chardonnay',
       'Chenin Blanc', 'Cinsault', 'Corvina', 'Dolcetto', 'Fiano',
       'Friulano', 'Fruit Wine', 'Furmint', 'Gamay', 'Garganega',
       'Gewurztraminer', 'Godello', 'Greco', 'Grenache', 'Grenache Blanc',
       'Gruner Veltliner', 'Lagrein', 'Madeira', 'Malbec', 'Malvasia',
       'Marsanne', 'Melon de Bourgogne', 'Mencia', 'Merlot',
       'Montepulciano', 'Mourvedre', 'Muscat', 'Nebbiolo',
       'Nerello Mascalese', "Nero d'Avola", 'Non-Vintage Sparkling Wine',
       'Other Dessert', 'Other Red Blends', 'Other Red Wine',
       'Other White Blends', 'Other White Wine', 'Petit Verdot',
       'Petite Sirah', 'Pinot Blanc', 'Pinot Gris/Grigio', 'Pinot Noir',
       'Pinotage', 'Port', 'Red Sparkling W

### Cleaning 

The following items have been indentified as tidiness/ quality issues:
- 'origin' is missing three values
- Seperate 'origin' into 'region' and 'country'
- 'country' has a lot of regions. extrct country from region name
- Remove non wines from varietals: 'Port' 'Non-Vintage Sparkling Wine' 'Vintage Sparkling Wine'
- Seperate 'year' out of 'product_name' and drop the ones that don't have a year?
- turn remove commas and turn price into int

### 'origin' is missing three values 

In [467]:
df = df[df['origin'].notna()]

In [468]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23819 entries, 0 to 23821
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  23819 non-null  object 
 1   price         23819 non-null  object 
 2   varietal      23819 non-null  object 
 3   origin        23819 non-null  object 
 4   rating        23819 non-null  float64
 5   rating_count  23819 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 1.3+ MB


###  separate 'origin' into 'region' and 'country'

In [670]:
df_or = df.copy()

In [671]:
df_or = df_or.origin.str.rsplit(',').str[-2:]

In [672]:
df_or = df_or.to_frame()

In [673]:
df_or = pd.DataFrame(df.origin.str.rsplit(',', 1).tolist(),
                                 columns = ['origin','region'])

In [674]:
df_or2 = df_or.origin.str.rsplit(',').str[-1:]

In [676]:
df_or2 = df_or2.to_frame()

In [677]:
df_or2['origin'] = df_or2['origin'].astype(str)

In [678]:
type(df_or2['origin'])

pandas.core.series.Series

In [679]:
df_merged = df_or.assign(origin=df_or2['origin'])

In [683]:
df.drop('origin', axis = 1, inplace = True)

In [684]:
df = pd.concat([df, df_merged], axis=1)

In [685]:
df.head(2)

Unnamed: 0,product_name,price,varietal,rating,rating_count,origin,region
0,Dom Perignon Vintage with Gift Box 2010,199,Vintage Sparkling Wine,4.5,42.0,['Champagne'],France
1,Veuve Clicquot Yellow Label Brut,59,Non-Vintage Sparkling Wine,4.4,1138.0,['Champagne'],France


In [686]:
df.dropna(inplace = True)

In [687]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22685 entries, 0 to 23818
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  22685 non-null  object 
 1   price         22685 non-null  object 
 2   varietal      22685 non-null  object 
 3   rating        22685 non-null  float64
 4   rating_count  22685 non-null  float64
 5   origin        22685 non-null  object 
 6   region        22685 non-null  object 
dtypes: float64(2), object(5)
memory usage: 1.4+ MB


### 'country' has a lot of regions. extrct country from region name 

In [277]:
df['country'].sort_values().unique()

array([' Argentina', ' Australia', ' Austria',
       ' Barossa, South Australia, Australia', ' Bordeaux, France',
       ' Burgundy, France', ' California', ' Central Coast, California',
       ' Chianti, Tuscany, Italy', ' Chile',
       ' Columbia Valley, Washington',
       ' Cote Chalonnaise, Burgundy, France',
       " Cote d'Or, Burgundy, France",
       " Cote de Beaune, Cote d'Or, Burgundy, France",
       " Cote de Nuits, Cote d'Or, Burgundy, France",
       ' Côtes du Roussillon-Villages, Roussillon, South of France, France',
       ' France', ' Friuli-Venezia Giulia, Italy', ' Germany', ' Greece',
       ' Italy', ' Lake County, North Coast, California',
       ' Languedoc, South of France, France', ' Loire, France',
       ' Maconnais, Burgundy, France', ' Mendocino, California',
       ' Mendoza, Argentina',
       ' Minervois, Languedoc, South of France, France',
       ' Monterey, Central Coast, California', ' Napa Valley, California',
       ' New South Wales, Australi

### remove non wines from varietals: 'Port' 'Non-Vintage Sparkling Wine' 'Vintage Sparkling Wine' 

In [147]:
df_varietals = df.copy()

In [148]:
non_varietals = ['Port', 'Non-Vintage Sparkling Wine', 'Vintage Sparkling Wine']
# Delete these row indexes from dataFrame
df_varietals = df_varietals[~df_varietals['varietal'].isin(non_varietals)]

In [149]:
# this is the df with the non-wine varietals
df_varietals['varietal'].sort_values().unique()

array(['Agiorgitiko', 'Aglianico', 'Albarino', 'Alicante Bouschet',
       'Arneis', 'Assyrtiko', 'Baga', 'Barbera', 'Blaufrankisch', 'Bobal',
       'Bordeaux Red Blends', 'Bordeaux White Blends', 'Cabernet Franc',
       'Cabernet Sauvignon', 'Carignan', 'Carmenere', 'Chardonnay',
       'Chenin Blanc', 'Corvina', 'Dolcetto', 'Fiano', 'Friulano',
       'Fruit Wine', 'Furmint', 'Gamay', 'Garganega', 'Gewurztraminer',
       'Godello', 'Greco', 'Grenache', 'Grenache Blanc',
       'Gruner Veltliner', 'Lagrein', 'Madeira', 'Malbec', 'Marsanne',
       'Melon de Bourgogne', 'Mencia', 'Merlot', 'Montepulciano',
       'Mourvedre', 'Muscat', 'Nebbiolo', 'Nerello Mascalese',
       "Nero d'Avola", 'Other Dessert', 'Other Red Blends',
       'Other Red Wine', 'Other White Blends', 'Other White Wine',
       'Petit Verdot', 'Petite Sirah', 'Pinot Blanc', 'Pinot Gris/Grigio',
       'Pinot Noir', 'Pinotage', 'Red Sparkling Wine', 'Rhone Red Blends',
       'Rhone White Blends', 'Riesling', 'R

In [253]:
df_varietals.shape

(21928, 7)

### seperate 'year' out of 'product_name' and drop the ones that don't have a year? 

In [246]:
df['year']= df.product_name.str.extract('(\d+)')

In [258]:
df.tail(90)

Unnamed: 0,product_name,price,varietal,rating,rating_count,region,country,year
23726,Greenock Creek Roennfeldt Road Shiraz 2001,399,Syrah/Shiraz,0.0,0.0,Pauillac,"Bordeaux, France",2001
23727,Domaine Francois Raveneau Chablis Valmur Grand...,319,Chardonnay,0.0,0.0,Santa Cruz Mountains,California,1997
23728,Weingut Willi Schaefer Graacher Domprobst Ries...,169,Riesling,0.0,0.0,Sonoma County,California,1403
23729,Chateau Mouton Rothschild 1959,2499,Bordeaux Red Blends,0.0,0.0,Burgundy,France,1959
23730,Rhys Vineyards Alpine Vineyard Pinot Noir (1.5...,249,Pinot Noir,0.0,0.0,St-Julien,"Bordeaux, France",1
...,...,...,...,...,...,...,...,...
23813,Chateau Leoville Barton (375ML half-bottle) 2016,109,Bordeaux Red Blends,0.0,0.0,Pauillac,"Bordeaux, France",375
23814,Domaine Leroy Vosne Romanee les Beaux Monts 2005,599,Pinot Noir,0.0,0.0,Mendocino,California,2005
23815,Chateau Doisy Daene (375ML half-bottle) 2005,29,Other Dessert,0.0,0.0,Sonoma County,California,375
23816,Chateau Latour (1.5 Liter Magnum) 2000,2999,Bordeaux Red Blends,0.0,0.0,Russian River,"Sonoma County, California",1


In [250]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22579 entries, 0 to 23818
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  22579 non-null  object 
 1   price         22579 non-null  object 
 2   varietal      22579 non-null  object 
 3   rating        22579 non-null  float64
 4   rating_count  22579 non-null  float64
 5   region        22579 non-null  object 
 6   country       22579 non-null  object 
 7   year          22579 non-null  object 
dtypes: float64(2), object(6)
memory usage: 1.6+ MB


In [249]:
df.dropna(inplace = True)

### remove commas and turn price into int 

In [267]:
df['price'] = df['price'].str.replace(',', '')

In [269]:
df['price'].min(), df['price'].max()

('10', '9999')

In [270]:
df['price'] = df['price'].astype(int)

In [272]:
df.dtypes

product_name     object
price             int64
varietal         object
rating          float64
rating_count    float64
region           object
country          object
year             object
dtype: object

## Master Dataframe Below 

In [273]:
df.shape

(22579, 8)

In [274]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22579 entries, 0 to 23818
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  22579 non-null  object 
 1   price         22579 non-null  int64  
 2   varietal      22579 non-null  object 
 3   rating        22579 non-null  float64
 4   rating_count  22579 non-null  float64
 5   region        22579 non-null  object 
 6   country       22579 non-null  object 
 7   year          22579 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.6+ MB


In [275]:
df.head()

Unnamed: 0,product_name,price,varietal,rating,rating_count,region,country,year
0,Dom Perignon Vintage with Gift Box 2010,199,Vintage Sparkling Wine,4.5,42.0,Champagne,France,2010
2,Duckhorn Napa Valley Cabernet Sauvignon 2017,78,Cabernet Sauvignon,4.3,62.0,Napa Valley,California,2017
3,Caymus Special Selection Cabernet Sauvignon 2016,180,Cabernet Sauvignon,4.6,108.0,Napa Valley,California,2016
4,Quintessa 2016,199,Cabernet Sauvignon,4.7,73.0,Rutherford,"Napa Valley, California",2016
5,Joseph Phelps Insignia 2016,300,Bordeaux Red Blends,4.6,81.0,Napa Valley,California,2016


In [276]:
df.to_csv('wine_master_dataset.csv')