In [861]:
import pandas as pd
import numpy as np
import re

In [898]:
df = pd.read_csv('wine_scraped_dataset.csv', index_col=0)

## Assess 

In [829]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23822 entries, 0 to 23821
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  23822 non-null  object 
 1   price         23822 non-null  object 
 2   varietal      23822 non-null  object 
 3   origin        23819 non-null  object 
 4   rating        23822 non-null  float64
 5   rating_count  23822 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 1.3+ MB


In [830]:
df.head()

Unnamed: 0,product_name,price,varietal,origin,rating,rating_count
0,Dom Perignon Vintage with Gift Box 2010,199,Vintage Sparkling Wine,"Champagne, France",4.5,42
1,Veuve Clicquot Yellow Label Brut,59,Non-Vintage Sparkling Wine,"Champagne, France",4.4,1138
2,Duckhorn Napa Valley Cabernet Sauvignon 2017,78,Cabernet Sauvignon,"Napa Valley, California",4.3,62
3,Caymus Special Selection Cabernet Sauvignon 2016,180,Cabernet Sauvignon,"Napa Valley, California",4.6,108
4,Quintessa 2016,199,Cabernet Sauvignon,"Rutherford, Napa Valley, California",4.7,73


In [831]:
df['varietal'].sort_values().unique()

array(['Agiorgitiko', 'Aglianico', 'Albarino', 'Alicante Bouschet',
       'Arneis', 'Assyrtiko', 'Baga', 'Barbera', 'Blaufrankisch', 'Bobal',
       'Bordeaux Red Blends', 'Bordeaux White Blends', 'Cabernet Franc',
       'Cabernet Sauvignon', 'Carignan', 'Carmenere', 'Chardonnay',
       'Chenin Blanc', 'Cinsault', 'Corvina', 'Dolcetto', 'Fiano',
       'Friulano', 'Fruit Wine', 'Furmint', 'Gamay', 'Garganega',
       'Gewurztraminer', 'Godello', 'Greco', 'Grenache', 'Grenache Blanc',
       'Gruner Veltliner', 'Lagrein', 'Madeira', 'Malbec', 'Malvasia',
       'Marsanne', 'Melon de Bourgogne', 'Mencia', 'Merlot',
       'Montepulciano', 'Mourvedre', 'Muscat', 'Nebbiolo',
       'Nerello Mascalese', "Nero d'Avola", 'Non-Vintage Sparkling Wine',
       'Other Dessert', 'Other Red Blends', 'Other Red Wine',
       'Other White Blends', 'Other White Wine', 'Petit Verdot',
       'Petite Sirah', 'Pinot Blanc', 'Pinot Gris/Grigio', 'Pinot Noir',
       'Pinotage', 'Port', 'Red Sparkling W

### Cleaning 

The following items have been indentified as tidiness/ quality issues:
- 'origin' is missing three values
- separate 'origin' into 'region' and 'appelation'
- Remove non wines from varietals: 'Port' 'Non-Vintage Sparkling Wine' 'Vintage Sparkling Wine'
- Seperate 'year' out of 'product_name' and drop the ones that don't have a year?
- turn remove commas and turn price into int

### 'origin' is missing three values 

In [832]:
df = df[df['origin'].notna()]

In [833]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23819 entries, 0 to 23821
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  23819 non-null  object 
 1   price         23819 non-null  object 
 2   varietal      23819 non-null  object 
 3   origin        23819 non-null  object 
 4   rating        23819 non-null  float64
 5   rating_count  23819 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 1.3+ MB


###  separate 'origin' into 'region' and 'appelation'

In [834]:
df_or = df.copy()

In [835]:
df_or = df_or.origin.str.rsplit(',').str[-2:]

In [836]:
df_or = df_or.to_frame()

In [837]:
df_or = pd.DataFrame(df.origin.str.rsplit(',', 1).tolist(),
                                 columns = ['appellation','region'])

In [838]:
df_or2 = df_or.appellation.str.rsplit(',').str[-1:]

In [839]:
df_or2 = df_or2.to_frame()

In [840]:
df_or2['appellation'] = df_or2['appellation'].astype(str)

In [841]:
type(df_or2['appellation'])

pandas.core.series.Series

In [842]:
df_merged = df_or.assign(appellation=df_or2['appellation'])

In [843]:
df.drop('origin', axis = 1, inplace = True)

In [844]:
df = pd.concat([df, df_merged], axis=1)

In [845]:
df.head(2)

Unnamed: 0,product_name,price,varietal,rating,rating_count,appellation,region
0,Dom Perignon Vintage with Gift Box 2010,199,Vintage Sparkling Wine,4.5,42.0,['Champagne'],France
1,Veuve Clicquot Yellow Label Brut,59,Non-Vintage Sparkling Wine,4.4,1138.0,['Champagne'],France


In [846]:
df.dropna(inplace = True)

In [847]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22685 entries, 0 to 23818
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  22685 non-null  object 
 1   price         22685 non-null  object 
 2   varietal      22685 non-null  object 
 3   rating        22685 non-null  float64
 4   rating_count  22685 non-null  float64
 5   appellation   22685 non-null  object 
 6   region        22685 non-null  object 
dtypes: float64(2), object(5)
memory usage: 1.4+ MB


In [848]:
df['appellation'].sort_values().unique()

array(['["Valle d\'Aosta"]', "[' Bordeaux']", "[' Burgundy']",
       "[' Central Coast']", "[' Columbia Valley']",
       "[' Friuli-Venezia Giulia']", "[' Loire']", "[' Mendocino']",
       "[' Mendoza']", "[' Napa Valley']", "[' New South Wales']",
       "[' New York']", "[' North Coast']", "[' Piedmont']",
       "[' Prosecco']", "[' Rapel Valley']", "[' Rhone']", "[' Salta']",
       "[' San Antonio Valley (Chile)']", "[' Sierra Foothills']",
       "[' Sonoma County']", "[' South Australia']",
       "[' South of France']", "[' Southwest']",
       "[' Trentino-Alto Adige']", "[' Tuscany']", "[' Veneto']",
       "[' Victoria']", "[' Western Australia']",
       "[' Willamette Valley']", "['Abruzzo']", "['Aconcagua Valley']",
       "['Alentejo']", "['Alsace']", "['Auckland']", "['Basilicata']",
       "['Bierzo']", "['Bordeaux']", "['Burgenland']", "['Burgundy']",
       "['Campania']", "['Canterbury']", "['Carneros']",
       "['Casablanca Valley']", "['Central Coast']", "['Ce

In [849]:
df['region'].sort_values().unique()

array([' Argentina', ' Australia', ' Austria', ' California', ' Chile',
       ' France', ' Germany', ' Greece', ' Italy', ' New Zealand',
       ' Oregon', ' Other U.S.', ' Portugal', ' South Africa', ' Spain',
       ' Washington'], dtype=object)

### seperate 'year' out of 'product_name' and drop the ones that don't have a year? 

In [899]:
# this removes the parenthesis and what's between them. made it easier to extract year. 
df['product_name'] = df['product_name'].str.replace(r"\(.*?\)", "")

In [900]:
# the following only extract the digit from left to right. 
df['year']= df.product_name.str.extract('(\d+)')

In [902]:
df['product_name'] = df['product_name'].str[:-5] #.str[-1:]

In [905]:
df.dropna(inplace = True)

In [906]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23637 entries, 0 to 23821
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  23637 non-null  object 
 1   price         23637 non-null  object 
 2   varietal      23637 non-null  object 
 3   origin        23637 non-null  object 
 4   rating        23637 non-null  float64
 5   rating_count  23637 non-null  int64  
 6   year          23637 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 1.4+ MB


In [907]:
# turn the string year into an integer
df['year'] = df['year'].astype(int)

In [908]:
df['year'].sort_values().unique()

array([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   20,   21,   22,   23,
         24,   25,   27,   28,   29,   30,   32,   36,   40,   41,   42,
         43,   45,   50,   51,   55,   60,   66,   75,   81,   84,   88,
         94,  100,  101,  115,  150,  170,  203,  239,  281,  302,  337,
        375,  389,  407,  459,  620,  707,  890,  902,  904, 1003, 1102,
       1147, 1206, 1302, 1403, 1406, 1522, 1614, 1752, 1792, 1830, 1843,
       1850, 1855, 1860, 1863, 1875, 1898, 1902, 1906, 1908, 1920, 1927,
       1929, 1937, 1951, 1955, 1959, 1961, 1963, 1964, 1965, 1966, 1967,
       1968, 1970, 1971, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2134, 3103, 40

In [923]:
# search for the weird year entries above
df.loc[df['year'] == 1961]

Unnamed: 0,product_name,price,varietal,origin,rating,rating_count,year
11503,Pewsey Vale 1961 Block Riesling,49,Riesling,"Eden Valley, Barossa, South Australia, Australia",0.0,0,1961


In [915]:
# drop anything that is older than 
df = df[df['year'] > 1937]  

In [897]:
df.shape

(23104, 7)

### remove commas and turn price into int 

In [791]:
df['price'] = df['price'].str.replace(',', '')

In [792]:
df['price'].min(), df['price'].max()

('10', '9999')

In [793]:
df['price'] = df['price'].astype(int)

In [794]:
df.dtypes

product_name     object
price             int64
varietal         object
rating          float64
rating_count    float64
appellation      object
region           object
year             object
dtype: object

### look for zero and non values in the ratings

In [928]:
df['rating'].sort_values().unique()

array([0. , 1.6, 1.9, 2.2, 2.4, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3,
       3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6,
       4.7, 4.8, 4.9, 5. ])

In [931]:
df.groupby('rating').size()

rating
0.0    18789
1.6        1
1.9        1
2.2        1
2.4        1
2.6        3
2.7        1
2.8        2
2.9        4
3.0       10
3.1       16
3.2       21
3.3       29
3.4       44
3.5       48
3.6      100
3.7      122
3.8      187
3.9      250
4.0      300
4.1      358
4.2      462
4.3      401
4.4      403
4.5      370
4.6      341
4.7      256
4.8      204
4.9      170
5.0      202
dtype: int64

In [929]:
df['rating_count'].sort_values().unique()

array([  0,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,
        17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
        30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
        43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
        69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
        82,  83,  84,  85,  86,  87,  88,  89,  90,  92,  93,  94,  95,
        97,  99, 100, 101, 102, 103, 105, 106, 107, 108, 109, 110, 112,
       113, 114, 115, 116, 118, 119, 121, 122, 123, 125, 127, 128, 129,
       130, 131, 132, 136, 137, 138, 139, 140, 143, 144, 146, 147, 148,
       149, 150, 151, 152, 156, 157, 158, 160, 164, 166, 168, 169, 172,
       174, 176, 177, 180, 183, 184, 185, 186, 188, 196, 199, 200, 202,
       203, 206, 208, 211, 213, 216, 224, 227, 231, 235, 238, 243, 244,
       247, 248, 251, 257, 263, 267, 271, 276, 278, 281, 283, 29

In [938]:
df.groupby('rating_count').size()

rating_count
0      18789
5        581
6        462
7        372
8        284
       ...  
556        1
614        1
663        3
672        3
750        1
Length: 201, dtype: int64

## Dataframes

### Master  

In [795]:
df.shape

(22579, 8)

In [796]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22579 entries, 0 to 23818
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  22579 non-null  object 
 1   price         22579 non-null  int64  
 2   varietal      22579 non-null  object 
 3   rating        22579 non-null  float64
 4   rating_count  22579 non-null  float64
 5   appellation   22579 non-null  object 
 6   region        22579 non-null  object 
 7   year          22579 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.6+ MB


In [797]:
df.head()

Unnamed: 0,product_name,price,varietal,rating,rating_count,appellation,region,year
0,Dom Perignon Vintage with Gift Box 2010,199,Vintage Sparkling Wine,4.5,42.0,['Champagne'],France,2010
2,Duckhorn Napa Valley Cabernet Sauvignon 2017,78,Cabernet Sauvignon,4.3,62.0,['Napa Valley'],California,2017
3,Caymus Special Selection Cabernet Sauvignon 2016,180,Cabernet Sauvignon,4.6,108.0,['Napa Valley'],California,2016
4,Quintessa 2016,199,Cabernet Sauvignon,4.7,73.0,[' Napa Valley'],California,2016
5,Joseph Phelps Insignia 2016,300,Bordeaux Red Blends,4.6,81.0,['Napa Valley'],California,2016


In [798]:
df.to_csv('wine_master_dataset.csv')

###  Wines with Ratings 

In [939]:
df_ratings_only = df.copy()

In [940]:
df_ratings_only = df_ratings_only[df_ratings_only['rating_count'] > 0]  

In [941]:
df_ratings_only.shape

(4308, 7)

In [942]:
df_ratings_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4308 entries, 0 to 23786
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  4308 non-null   object 
 1   price         4308 non-null   object 
 2   varietal      4308 non-null   object 
 3   origin        4308 non-null   object 
 4   rating        4308 non-null   float64
 5   rating_count  4308 non-null   int64  
 6   year          4308 non-null   int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 269.2+ KB


In [943]:
df_ratings_only.head()

Unnamed: 0,product_name,price,varietal,origin,rating,rating_count,year
0,Dom Perignon Vintage with Gift Box,199,Vintage Sparkling Wine,"Champagne, France",4.5,42,2010
2,Duckhorn Napa Valley Cabernet Sauvignon,78,Cabernet Sauvignon,"Napa Valley, California",4.3,62,2017
3,Caymus Special Selection Cabernet Sauvignon,180,Cabernet Sauvignon,"Napa Valley, California",4.6,108,2016
4,Quintessa,199,Cabernet Sauvignon,"Rutherford, Napa Valley, California",4.7,73,2016
5,Joseph Phelps Insignia,300,Bordeaux Red Blends,"Napa Valley, California",4.6,81,2016


### Wines with Bonafide Varietals

In [947]:
df_varietals = df.copy()

In [948]:
non_varietals = ['Port', 'Non-Vintage Sparkling Wine', 'Vintage Sparkling Wine']
# Delete these row indexes from dataFrame
df_varietals = df_varietals[~df_varietals['varietal'].isin(non_varietals)]

In [949]:
# this is the df with the non-wine varietals
df_varietals['varietal'].sort_values().unique()

array(['Agiorgitiko', 'Aglianico', 'Albarino', 'Alicante Bouschet',
       'Arneis', 'Assyrtiko', 'Baga', 'Barbera', 'Blaufrankisch', 'Bobal',
       'Bordeaux Red Blends', 'Bordeaux White Blends', 'Cabernet Franc',
       'Cabernet Sauvignon', 'Carignan', 'Carmenere', 'Chardonnay',
       'Chenin Blanc', 'Cinsault', 'Corvina', 'Dolcetto', 'Fiano',
       'Friulano', 'Fruit Wine', 'Furmint', 'Gamay', 'Garganega',
       'Gewurztraminer', 'Godello', 'Greco', 'Grenache', 'Grenache Blanc',
       'Gruner Veltliner', 'Lagrein', 'Madeira', 'Malbec', 'Malvasia',
       'Marsanne', 'Melon de Bourgogne', 'Mencia', 'Merlot',
       'Montepulciano', 'Mourvedre', 'Muscat', 'Nebbiolo',
       'Nerello Mascalese', "Nero d'Avola", 'Other Dessert',
       'Other Red Blends', 'Other Red Wine', 'Other White Blends',
       'Other White Wine', 'Petit Verdot', 'Petite Sirah', 'Pinot Blanc',
       'Pinot Gris/Grigio', 'Pinot Noir', 'Pinotage',
       'Red Sparkling Wine', 'Rhone Red Blends', 'Rhone White

In [950]:
df_varietals.shape

(22442, 7)

In [951]:
df_varietals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22442 entries, 2 to 23821
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  22442 non-null  object 
 1   price         22442 non-null  object 
 2   varietal      22442 non-null  object 
 3   origin        22442 non-null  object 
 4   rating        22442 non-null  float64
 5   rating_count  22442 non-null  int64  
 6   year          22442 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 1.4+ MB


In [952]:
df_varietals.head()

Unnamed: 0,product_name,price,varietal,origin,rating,rating_count,year
2,Duckhorn Napa Valley Cabernet Sauvignon,78,Cabernet Sauvignon,"Napa Valley, California",4.3,62,2017
3,Caymus Special Selection Cabernet Sauvignon,180,Cabernet Sauvignon,"Napa Valley, California",4.6,108,2016
4,Quintessa,199,Cabernet Sauvignon,"Rutherford, Napa Valley, California",4.7,73,2016
5,Joseph Phelps Insignia,300,Bordeaux Red Blends,"Napa Valley, California",4.6,81,2016
6,Opus One,364,Bordeaux Red Blends,"Oakville, Napa Valley, California",4.2,8,2017
