In [1]:
import pandas as pd
import glob

In [2]:
files = glob.glob('../data/*.csv')

In [3]:
wines = pd.concat((pd.read_csv(i) for i in files))

In [4]:
wines.head()

Unnamed: 0,winery,wine,year,rating,num_review,region,price,type,grapes
0,Sonsierra,Tempranillo Blanco 2016,2016,34,33,España / Rioja / Rioja Alta,890,Vino blanco,100 % de Tempranillo Blanco
1,Terra i Vins,Brúixola Blanc 2016,2016,38,32,España / Cataluña / Priorato,1645,Vino blanco,"Garnacha Blanca, Pedro Ximenez, Macabeo"
2,Agro de Bazán,Granbazán Limousin Albariño 2017,2017,42,193,España / Galicia / Rías Baixas,2370,Vino blanco,100 % de Albariño
3,Azpilicueta,Crianza 2018,2018,37,133,España / Rioja,831,Vino tinto,"85 % de Tempranillo, 10 % de Graciano, 5 % de ..."
4,Jané Ventura,Finca Els Camps Macabeu 2015,2015,38,35,España / Cataluña / Penedès,1590,Vino blanco,100 % de Macabeo


In [5]:
wines.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8505 entries, 0 to 975
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   winery      8505 non-null   object
 1   wine        8505 non-null   object
 2   year        8505 non-null   int64 
 3   rating      8505 non-null   object
 4   num_review  8505 non-null   int64 
 5   region      8505 non-null   object
 6   price       8351 non-null   object
 7   type        8505 non-null   object
 8   grapes      8505 non-null   object
dtypes: int64(2), object(7)
memory usage: 4.6 MB


In [6]:
wines.shape

(8505, 9)

In [7]:
wines.drop_duplicates(inplace=True) #drop duplicates

In [8]:
wines.shape

(6915, 9)

In [9]:
wines.rating = [str(wine).replace(",",".") for wine in wines.rating]

In [10]:
wines.rating = wines.rating.astype("float64") #change to numerical (rating)

In [11]:
wines.price = [str(wine).replace(",",".") for wine in wines.price]

In [12]:
wines.rating = wines.rating.astype("float64") #change to numerical (price)

In [13]:
wines.type.value_counts()

Vino tinto                                   4567
Vino blanco                                  1427
Vino espumoso                                 325
Vino fortificado                              246
Vino rosado                                   183
Vino de España                                 55
Vino de postre                                 48
Más Que Vinos                                  11
Vinos del Paseante                              6
Tierra del Vino de Zamora                       5
Vinos Sanz                                      4
Beso de Vino                                    4
Bodegas 6º Elemento - Vino Sexto Elemento       4
Reserva Grandes Vinos L'Héritage Cariñena       2
Gran Vino Blanco                                2
Vino de Finca                                   2
Compañía de Vinos Tricó                         2
Vinos en Voz Baja                               2
Gran Vino Tinto                                 1
Pateiro Vinos de Guarda                         1


In [14]:
other = list(wines.groupby('type').filter(lambda x : len(x)<=11).index)

In [15]:
len(other)

64

In [16]:
for wine in other:
    wines.loc[wine,"type"] = "Otro"

In [17]:
for wine in wines[wines.type == "Vino de España"].index:
    wines.loc[wine,"type"] = "Otro"

In [18]:
wines.region.value_counts()

España / Rioja                                 1349
España / Castilla y León / Ribera del Duero     766
España / Cataluña / Priorato                    354
España / Cataluña / Penedès                     316
España / Castilla y León / Rueda                251
                                               ... 
España / Islas Baleares / Ibiza                   1
España / Islas Baleares                           1
Costers del Segre                                 1
España / Valencia / Castelló                      1
Somontano                                         1
Name: region, Length: 130, dtype: int64

In [19]:
wines.year.value_counts()

2018    1077
2019     947
2017     777
2020     716
2016     704
        ... 
6730       1
3887       1
1570       1
9568       1
3964       1
Name: year, Length: 398, dtype: int64

In [20]:
future_wines = list(wines[wines.year > 2022].index)

In [21]:
len(future_wines)

293

In [22]:
wines.drop(future_wines,inplace=True) #drop 293 wines with year over 2022 (current year)

In [23]:
past_wines = list(wines[wines.year < 1800].index)

In [24]:
len(past_wines)

52

In [25]:
wines.drop(past_wines,inplace=True) #drop 52 wines with year older than 1800 (needs research on specific wines)

In [26]:
wines.grapes.value_counts()

100 % de Tempranillo                       797
Tempranillo                                388
100 % de Garnacha                          198
100 % de Verdejo                           132
100 % de Mencia                            111
                                          ... 
Petit Manseng, Hondarrabi Zuri               1
Albariño, Loureiro, Godello, Treixadura      1
Garnacha Roja (Gris)                         1
70 % de Viura, 30 % de Garnacha Blanca       1
70 % de Garnacha, 30 % de Shiraz/Syrah       1
Name: grapes, Length: 957, dtype: int64

In [27]:
def quick_fix_grapes():
    
    wines.grapes = ["100 % de Tempranillo" if wine == "Tempranillo" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Garnacha" if wine == "Garnacha" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Verdejo" if wine == "Verdejo" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Xarel-lo" if wine == "Xarel-lo" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Albariño" if wine == "Albariño" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Monastrell" if wine == "Monastrell" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Grenache" if wine == "Grenache" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Chardonnay" if wine == "Chardonnay" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Mencia" if wine == "Mencia" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Shiraz/Syrah" if wine == "Shiraz/Syrah" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Cabarnet Sauvignon" if wine == "Cabernet Sauvignon" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Bobal" if wine == "Bobal" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Godello" if wine == "Godello" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Tinto Fino" if wine == "Tinto Fino" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Cariñena" if wine == "Cariñena" else wine for wine in wines.grapes]
    
    wines.grapes = ["100 % de Macabeo" if wine == "Macabeo" else wine for wine in wines.grapes]  

In [28]:
quick_fix_grapes()

In [29]:
wines.region.value_counts().head(20)

España / Rioja                                 1055
España / Castilla y León / Ribera del Duero     594
España / Cataluña / Priorato                    266
España / Cataluña / Penedès                     248
España / Castilla y León / Rueda                188
España / Galicia / Rías Baixas                  152
España / Cataluña / Tarragona / Montsant        142
España / Castilla y León                        135
España / Castilla y León / Bierzo               132
España / Castilla y León / Toro                 131
España / Cataluña / Empordà                     109
España / Cava                                   100
España / Navarra                                 97
España / Cataluña                                89
España / Murcia / Jumilla                        83
España / Aragón / Somontano                      75
España / Valencia / Alicante                     73
España / Valencia                                71
España / Cataluña / Costers del Segre            66
España / Cas

In [30]:
wines.shape

(5020, 9)

In [31]:
#new column for map in pandas
wines['region_map'] = wines['region']

In [32]:
wines.region_map = [wine.split("/")[1].strip() if "/" in wine else wine for wine in wines.region_map]

In [33]:
wines.region_map.value_counts()

Castilla y León        1223
Rioja                  1126
Cataluña               1060
Galicia                 297
Aragón                  225
Valencia                189
Castilla                188
Murcia                  126
Navarra                 111
Cava                    101
Andalucía                97
Islas Baleares           93
Madrid                   46
Vino de España           43
Islas Canarias           40
País Vasco               24
Extremadura              17
Terra Alta                2
Empordà                   2
Priorato                  2
Valle de la Orotava       1
Ribeira Sacra             1
Montsant                  1
Ribera del Guadiana       1
Asturias                  1
Cigales                   1
Bierzo                    1
Somontano                 1
Name: region_map, dtype: int64

In [34]:
wines['region_map2'] = wines['region_map']

In [None]:
#SQL database

In [None]:
from sqlalchemy import create_engine

In [None]:
with open('../data/password.txt', 'r') as file:
    pass_=file.readlines()[0]

str_conn=f'mysql+pymysql://root:{pass_}@localhost:3306'

In [None]:
cursor=create_engine(str_conn)

In [None]:
cursor.execute('create database vivino')

In [None]:
str_conn=f'mysql+pymysql://root:{pass_}@localhost:3306/vivino'

In [None]:
cursor_sql=create_engine(str_conn)

In [None]:
wines.to_sql(name='wines', index=False, con=cursor_sql)

In [None]:
wines.shape

In [None]:
#to excel for the connection with Tableau

In [35]:
wines.to_csv("../data/WINES.csv", index=False)

In [36]:
wines.shape

(5020, 11)

In [37]:
wines

Unnamed: 0,winery,wine,year,rating,num_review,region,price,type,grapes,region_map,region_map2
0,Sonsierra,Tempranillo Blanco 2016,2016,3.4,33,España / Rioja / Rioja Alta,8.90,Vino blanco,100 % de Tempranillo Blanco,Rioja,Rioja
1,Terra i Vins,Brúixola Blanc 2016,2016,3.8,32,España / Cataluña / Priorato,16.45,Vino blanco,"Garnacha Blanca, Pedro Ximenez, Macabeo",Cataluña,Cataluña
2,Agro de Bazán,Granbazán Limousin Albariño 2017,2017,4.2,193,España / Galicia / Rías Baixas,23.70,Vino blanco,100 % de Albariño,Galicia,Galicia
3,Azpilicueta,Crianza 2018,2018,3.7,133,España / Rioja,8.31,Vino tinto,"85 % de Tempranillo, 10 % de Graciano, 5 % de ...",Rioja,Rioja
4,Jané Ventura,Finca Els Camps Macabeu 2015,2015,3.8,35,España / Cataluña / Penedès,15.90,Vino blanco,100 % de Macabeo,Cataluña,Cataluña
...,...,...,...,...,...,...,...,...,...,...,...
969,Vivanco,Selección de Familia Reserva 2014,2014,3.9,1811,España / Rioja,13.03,Vino tinto,100 % de Tempranillo,Rioja,Rioja
971,Arroyo,Gran Reserva Tinto 2012,2012,4.1,182,España / Castilla y León / Ribera del Duero,22.40,Vino tinto,100 % de Tempranillo,Castilla y León,Castilla y León
973,Jiménez-Landi,Sotorrondero 2017,2017,3.6,82,España / Castilla / Méntrida,18.45,Vino tinto,"70 % de Garnacha, 30 % de Shiraz/Syrah",Castilla,Castilla
974,Mar de Frades,Rias Baixas Albariño Finca Valiñas 2016,2016,4.2,222,España / Galicia / Rías Baixas,24.33,Vino blanco,100 % de Albariño,Galicia,Galicia
