In [1]:
#imports, uploads, inspection of data
import pandas as pd
import numpy as np
# Seed NumPy's global random state so the random sampling later in the notebook is repeatable.
np.random.seed(5)
# Location of the wine-review CSV on the author's machine.
file = r'/Users/cecylia/Desktop/Notes/winedata.csv'
df = pd.read_csv(filepath_or_buffer=file)
# Summarise columns, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


In [2]:
# Value counts for province and variety, printed in full (no row/column truncation).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print(df['province'].value_counts(), df['variety'].value_counts(), sep='\n')

California                         36247
Washington                          8639
Bordeaux                            5941
Tuscany                             5897
Oregon                              5373
Burgundy                            3980
Northern Spain                      3851
Piedmont                            3729
Mendoza Province                    3264
Veneto                              2716
New York                            2688
Alsace                              2440
Northeastern Italy                  2138
Loire Valley                        1856
Sicily & Sardinia                   1797
Champagne                           1613
Southwest France                    1503
South Australia                     1349
Southern Italy                      1349
Provence                            1346
Douro                               1281
Central Italy                       1233
Catalonia                           1164
Rhône Valley                        1081
Beaujolais      

In [3]:
# Number of missing values per column (description has none).
df.isna().sum()

id                           0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# The dataset is large, so trim it: first discard every row with a null
# country, price, province or variety.
df = df.dropna(subset=['country', 'price', 'province', 'variety'])
# Then discard varieties that occur 4 times or fewer ...
value_counts = df['variety'].value_counts()
to_remove = value_counts.loc[value_counts.le(4)].index
df = df.loc[~df['variety'].isin(to_remove)]
# ... and, among the rows that remain, provinces that occur 4 times or fewer.
value_counts = df['province'].value_counts()
to_remove = value_counts.loc[value_counts.le(4)].index
df = df.loc[~df['province'].isin(to_remove)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120007 entries, 1 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     120007 non-null  int64  
 1   country                120007 non-null  object 
 2   description            120007 non-null  object 
 3   designation            85481 non-null   object 
 4   points                 120007 non-null  int64  
 5   price                  120007 non-null  float64
 6   province               120007 non-null  object 
 7   region_1               101013 non-null  object 
 8   region_2               50208 non-null   object 
 9   taster_name            95601 non-null   object 
 10  taster_twitter_handle  90751 non-null   object 
 11  title                  120007 non-null  object 
 12  variety                120007 non-null  object 
 13  winery                 120007 non-null  object 
dtypes: float64(1), int64(2), object(11)


In [6]:
# The filtered frame is still large, so work on a random sample of 40,000 rows.
# random_state is passed explicitly (same value as the np.random.seed call at the top of the
# notebook) so that re-running this cell always draws the same rows, instead of depending on
# how much of the global NumPy random stream earlier executions have already consumed.
# reset_index() keeps the original row labels in an 'index' column and gives df1 a fresh
# 0..n-1 index, so label-based and positional lookups on df1 agree.
df1 = df.sample(n=40000, random_state=5).reset_index()
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  40000 non-null  int64  
 1   id                     40000 non-null  int64  
 2   country                40000 non-null  object 
 3   description            40000 non-null  object 
 4   designation            28427 non-null  object 
 5   points                 40000 non-null  int64  
 6   price                  40000 non-null  float64
 7   province               40000 non-null  object 
 8   region_1               33742 non-null  object 
 9   region_2               16787 non-null  object 
 10  taster_name            31879 non-null  object 
 11  taster_twitter_handle  30247 non-null  object 
 12  title                  40000 non-null  object 
 13  variety                40000 non-null  object 
 14  winery                 40000 non-null  object 
dtypes:

In [7]:
# First step of a simple content-based recommender:
# a TF-IDF vectorizer that ignores English stop words.
tfidf = TfidfVectorizer(stop_words="english")

In [8]:
# Learn the vocabulary from the sampled descriptions and encode each one as a TF-IDF row.
tfidf_matrix = tfidf.fit_transform(raw_documents=df1['description'])

In [9]:
# Pairwise dot products between all description vectors, used as the similarity score.
# NOTE: the result is a dense (n_wines x n_wines) array, which is large for 40,000 wines.
cosine_sim = linear_kernel(tfidf_matrix, Y=tfidf_matrix)

In [10]:
# Reverse lookup: wine title -> row position in df1.
# Series.drop_duplicates() compares the *values* (the row positions, which are all unique), so
# it never removed repeated titles; a repeated title then made indices[title] return several
# positions instead of one. De-duplicate on the index (the titles), keeping the first occurrence.
indices = pd.Series(df1.index, index=df1['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [11]:
# Function that takes in a wine title as input and outputs the most similar titles
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the wines whose descriptions are most similar to ``title``.

    Parameters
    ----------
    title : str
        Wine title to look up in ``indices``.
    cosine_sim : array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of wine ``i``
        against every wine in ``df1``. Defaults to the notebook-level ``cosine_sim``.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``df1['title']`` entries of the ``top_n`` highest-scoring wines other
        than the queried one, best match first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Row position of the wine that matches the title
    idx = indices[title]
    # A title that occurs more than once yields several positions; use the first one
    # so the lookup below always selects a single row of scores.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Similarity of every wine to the queried wine
    scores = np.asarray(cosine_sim[idx])

    # Positions ordered by descending score; the stable sort keeps ties in row order
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried wine explicitly instead of assuming it sorts first: wines with an
    # identical description tie with it, and a description with no scored terms has a
    # self-similarity of 0, so it is not guaranteed to be at the head of the ranking.
    ranked = ranked[ranked != idx][:top_n]

    return df1['title'].iloc[ranked]

In [37]:
# Pick one title at random from the sample to try the recommender on.
wine = df1['title'].sample(n=1).iloc[0]
wine

'Billsboro 2012 Sawmill Creek Vineyard Sauvignon Blanc (Finger Lakes)'

In [38]:
# Recommendations for someone who enjoyed the wine picked above.
get_recommendations(title=wine)

38200           Château de Parenchère 2014  Bordeaux Blanc
23396    Doña Paula 2015 Los Cardos Sauvignon Blanc (Me...
12933    Peter Mertes 2011 Bernkasteler-Kues Kabinett F...
33951    Conte Brandolini 2010 D'Adda Pinot Grigio (Fri...
34939    Château Paradis 2015 White (Coteaux d'Aix-en-P...
6003     Duckhorn 2013 Rector Creek Vineyard Merlot (Na...
9682     Echeverria 2014 Reserva Sauvignon Blanc (Curic...
22705        Frog's Leap 2011 Sauvignon Blanc (Rutherford)
2248       Casa Montes 2014 Ampakama Chardonnay (San Juan)
22055    Domaine D'en Ségur 2015 Sauvignon Blanc (Côtes...
Name: title, dtype: object