In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import pickle

from IPython.display import display, HTML
import pandas as pd

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...


In [4]:
wine_reviews = pd.read_csv('datasets\winemag-data-130k-v2.csv').drop(['Unnamed: 0', 'region_2'], axis=1).dropna().drop_duplicates()
wine_reviews.head()
wine_reviews.shape

(47673, 12)

In [5]:
#for memory optimization
wine_reviews_sampled = wine_reviews.sample(25000).reset_index()
wine_reviews_sampled.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,taster_name,taster_twitter_handle,title,variety,winery
0,52223,US,"Faint cranberry and incense aromas show on the nose of this bottling. The palate offers more flavors, including coffee, black tar, pomegranate and peppercorns.",Sierra Mar Vineyard,88,55.0,California,Santa Lucia Highlands,Matt Kettmann,@mattkettmann,Bernardus 2014 Sierra Mar Vineyard Pinot Noir (Santa Lucia Highlands),Pinot Noir,Bernardus
1,94020,US,"A reluctant nose slowly offers hard-to-tease aromas of blueberry, dark chocolate and loam. The palate also works to show its stuff, offering dried cherries and cedar. It's straight ahead and very drinkable, yet neither flashy nor complex.",Pear Valley Vineyard,85,25.0,California,Paso Robles,Matt Kettmann,@mattkettmann,Pear Valley 2010 Pear Valley Vineyard Cabernet Sauvignon (Paso Robles),Cabernet Sauvignon,Pear Valley
2,63462,Italy,"A blend of Cabernet Franc, Merlot, Cabernet Sauvignon, Petit Verdot, Sangiovese, Alicante and Syrah, this juicy wine opens with aromas of ripe plum, baking spice and a balsamic note. The concentrated palate doles out ripe black cherry, clove and licorice alongside fine-grained tannins.",Aria di Caiarossa,90,40.0,Tuscany,Toscana,Kerin O’Keefe,@kerinokeefe,Caiarossa 2012 Aria di Caiarossa Red (Toscana),Red Blend,Caiarossa
3,108910,US,"The aromas seem a bit muddled, with notes of jasmine and pear occasionally poking through and the alcohol showing itself. It's medium sweet (2.7% residual sugar) finishing a touch bitter.",Colter's Creek Estate,84,10.0,Idaho,Idaho,Sean P. Sullivan,@wawinereport,Colter's Creek 2011 Colter's Creek Estate Riesling (Idaho),Riesling,Colter's Creek
4,4966,Spain,"This is a really good showing for the Garnacha Blanca grape. The bouquet is fresh and not the least bit blowsy, with an emphasis on apple and pear aromas. The wine is plump, fruity and showing modest acidity, while flavors of apple and baking spices precede a finish that's easygoing. Drink now.",Els Ameliers,90,35.0,Catalonia,Terra Alta,Michael Schachner,@wineschach,Lafou 2011 Els Ameliers Garnacha Blanca (Terra Alta),Garnacha Blanca,Lafou


In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Tokenize and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Remove stopwords and lemmatize the words
    processed_text = ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
    return processed_text

wine_reviews_sampled['processed_description'] = wine_reviews_sampled['description'].apply(preprocess_text)

In [7]:
max_features = 700  # Adjust this value based on your available memory
tfidf = TfidfVectorizer(max_features=max_features)
description_matrix = tfidf.fit_transform(wine_reviews_sampled['processed_description'])

In [8]:
similarity_matrix = cosine_similarity(description_matrix)

In [9]:
def recommend_wines(wine_title, n_recommendations=5):
    # Find the index of the wine with the given title
    wine_index = wine_reviews_sampled[wine_reviews_sampled['title'] == wine_title].index[0]
    
    # Get similarity scores for the given wine
    similarity_scores = list(enumerate(similarity_matrix[wine_index]))
    # Sort the wines based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # Get the indices of the most similar wines (excluding the input wine itself)
    most_similar_indices = [score[0] for score in similarity_scores[1:n_recommendations+1]]
    # Return the names of the most similar wines
    return wine_reviews['title'].iloc[most_similar_indices]

In [10]:
wine_reviews_sampled['title']

0         Bernardus 2014 Sierra Mar Vineyard Pinot Noir (Santa Lucia Highlands)
1        Pear Valley 2010 Pear Valley Vineyard Cabernet Sauvignon (Paso Robles)
2                                Caiarossa 2012 Aria di Caiarossa Red (Toscana)
3                    Colter's Creek 2011 Colter's Creek Estate Riesling (Idaho)
4                          Lafou 2011 Els Ameliers Garnacha Blanca (Terra Alta)
                                          ...                                  
24995                 J. Bookwalter 2011 Protagonist Red (Columbia Valley (WA))
24996                Drappier NV Brut Nature Zero Dosage Pinot Noir (Champagne)
24997           Delectus 2012 Bear Crossing Cabernet Sauvignon (Knights Valley)
24998                      Soos Creek 2012 Palisades Red (Columbia Valley (WA))
24999                        Maison Bleue 2009 Soleil Roussanne (Yakima Valley)
Name: title, Length: 25000, dtype: object

In [11]:
wine_title = 'Condado de Oriza 2006 Roble  (Ribera del Duero)'  # You can search by wine title
recommendations = recommend_wines(wine_title)
display(recommendations)

31237         Palmer 1998 Estate Chardonnay (North Fork of Long Island)
57991    Château des Landes 2014 Cuvée Prestige  (Lussac Saint-Émilion)
6312      Fenestra 2012 Semonnay Chardonnay-Semillon (Livermore Valley)
31451                Castello di Amorosa 2015 Gioia Rosato (California)
43455                      Tenuta Carretta 2010 Garassino  (Barbaresco)
Name: title, dtype: object

In [12]:
import pickle


with open("web_app/wine_recommender.pkl", "wb") as f:
    pickle.dump((wine_reviews_sampled['title'], similarity_matrix), f)