## ML Model

### Libraries & Calling the dataset(s)

In [155]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from scipy.sparse import hstack
import xgboost as xgb

In [156]:
# Load the cleaned wine table and the cheese/wine pairing table.
# `wine_list_read_data` keeps the values exactly as read from disk.
wine_list_read_data = pd.read_csv('clean_dataset.csv')
wine_cheese_data = pd.read_csv('paired_cheese_wine_data.csv')
# Working copy for feature engineering. Copying the frame already in memory
# avoids parsing 'clean_dataset.csv' a second time and still keeps the two
# frames independent (the working copy is modified in place further down).
wine_list = wine_list_read_data.copy()
wine_list.head(5)

Unnamed: 0,country,description,points,price_USD,province,region,wine_name,variety,winery,year
0,Italy,"Aromas include tropical fruit, broom, brimston...",87.0,17,Sicily & Sardinia,Etna,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013
1,Portugal,"This is ripe and fruity, a wine that is smooth...",87.0,15,Douro,Unknown,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011
2,US,"Tart and snappy, the flavors of lime flesh and...",87.0,14,Oregon,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013
3,US,"Pineapple rind, lemon pith and orange blossom ...",87.0,13,Michigan,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013
4,US,"Much like the regular bottling from 2012, this...",87.0,65,Oregon,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012


### Vectorizing and normalizing the text columns

In [157]:
# Start with the free-text 'description' column; it is turned into token counts below.
descriptions = wine_list['description']

In [158]:
# Vectorizing description of the wines (bag-of-words token counts).
vectorizer_desc = CountVectorizer()
# Fit once and keep the result: the corpus was previously tokenized twice,
# with the first result thrown away.
desc_counts = vectorizer_desc.fit_transform(descriptions)
vocabulary_description = vectorizer_desc.get_feature_names_out()

# Dense copy of the count matrix, kept under its original name.
# NOTE(review): the sparse matrix has shape (115198, 30752), i.e. roughly
# 3.5 billion cells once densified. Nothing visible in this notebook reads
# `vocabulary_d`; consider dropping it if memory is tight.
vocabulary_d = desc_counts.toarray()

In [159]:
# The second important feature is 'variety', which also needs to be vectorized.
variety = wine_list['variety']

In [160]:
# Vectorizing variety of the wines (token counts over the variety names).
vectorizer_wine = CountVectorizer()
# Fit once and keep the result instead of fitting twice and discarding the first pass.
variety_counts = vectorizer_wine.fit_transform(variety)
vocabulary_wine = vectorizer_wine.get_feature_names_out()

# Dense copy of the count matrix, kept under its original name.
vocabulary_v = variety_counts.toarray()

In [161]:
# The third important feature is 'region', which needs to be one-hot encoded.
# Double brackets keep `region` a one-column DataFrame (2-D input for the encoder).

region = wine_list[['region']]
encoder = OneHotEncoder()

In [162]:
# The fourth important feature is 'country', which needs to be one-hot encoded.
# NOTE: this rebinds `encoder` to a fresh OneHotEncoder, replacing the previous instance.
country = wine_list[['country']]
encoder = OneHotEncoder()

In [163]:
# Fit and transform all vectorized and encoded data into sparse matrices.
X_desc_sparse = vectorizer_desc.fit_transform(descriptions)
X_variety_sparse = vectorizer_wine.fit_transform(variety)

# Region gets its own encoder. Fitting the shared `encoder` on region and then
# refitting it on country discards the learned region categories, so the region
# encoding could never be re-applied to new rows.
region_encoder = OneHotEncoder()
X_region_sparse = region_encoder.fit_transform(region)

# `encoder` still ends up fitted on country, exactly as before.
X_country_sparse = encoder.fit_transform(country)

In [164]:
# Sanity check: (rows, vocabulary size) of the description count matrix.
X_desc_sparse.shape

(115198, 30752)

In [165]:
# Sanity check: (rows, vocabulary size) of the variety count matrix.
X_variety_sparse.shape

(115198, 546)

In [166]:
# Stack all transformed blocks side by side into one sparse feature matrix.
# Variety carries the most importance, so its counts are weighted 3x
# (a factor chosen based on ML results).
variety_weighted = X_variety_sparse * 3
X = hstack([X_desc_sparse, variety_weighted, X_region_sparse, X_country_sparse])

In [167]:
# Linear dimensionality reduction by means of truncated singular value decomposition.
# random_state is pinned because TruncatedSVD's default 'randomized' solver is
# stochastic. Without a seed, the components (and every neighbour list derived
# from them) change between runs.
svd = TruncatedSVD(n_components=10, random_state=42)

In [168]:
# Checking the shape of X before the SVD transformation: (rows, total stacked feature columns).
X.shape

(115198, 32560)

In [169]:
# Fit the SVD on X and project X onto the fitted components in one step.
X_svd = svd.fit_transform(X)

In [170]:
# Total fraction of the variance retained by the kept components (summed over components).
svd.explained_variance_ratio_.sum()

0.2795936191105247

In [171]:
# Checking the shape of the reduced matrix: one row per wine, one column per SVD component.
X_svd.shape

(115198, 10)

In [172]:
# Converting X_svd to a NumPy array (np.array copies its input by default).
X_svd_array = np.array(X_svd)

In [173]:
# Wrap the SVD output in a DataFrame; columns get default integer labels.
svd_df = pd.DataFrame(X_svd_array)

In [174]:
# Rescale each row of the SVD output to unit L2 norm for better ML performance,
# then wrap the result back into a DataFrame under the same name.
normalizer = Normalizer(norm='l2')
svd_df = pd.DataFrame(normalizer.fit_transform(svd_df))

In [175]:
# Standardize 'points' and 'price_USD' in the main dataframe for better ML performance.
# NOTE: this overwrites both columns of `wine_list` in place; the unscaled
# values remain available in `wine_list_read_data`.
scaler = StandardScaler()
columns_to_scale = ['points', 'price_USD']
wine_list[columns_to_scale] = scaler.fit_transform(wine_list[columns_to_scale])

In [176]:
# Combine the main dataframe with the SVD-feature dataframe, column-wise.
# pd.concat(axis=1) aligns rows on the index labels of both frames.

concatenated_wine_list = pd.concat([wine_list, svd_df], axis=1)

### Running ML model

In [177]:
# Build the model input by removing the text/categorical columns, plus 'year'
# and 'price_USD', from the combined frame.
non_feature_columns = ['country', 'description', 'province', 'region', 'variety',
                       'winery', 'wine_name', 'year', 'price_USD']
numerical_features = concatenated_wine_list.drop(columns=non_feature_columns)

In [178]:
# Plain NumPy matrix of the model features, used to fit the neighbour search.
numerics = numerical_features.to_numpy()

In [179]:
# Nearest Neighbor ML model: brute-force search, returning 6 neighbours per query.
# When the query is itself a fitted row, the first hit is that row at distance 0.
nearest_neighbors = NearestNeighbors(algorithm='brute', n_neighbors=6).fit(numerics)

In [180]:
# Checking the result of the model: take one row as the query.
# Slicing (rather than .iloc[40855]) keeps it a one-row DataFrame, i.e. 2-D.
query_wine = numerical_features.iloc[40855:40856]

In [181]:
# Show the query wine's row. 'points' and 'price_USD' appear standardized here
# because `wine_list` was scaled in place.
wine_list.iloc[40855:40856]

Unnamed: 0,country,description,points,price_USD,province,region,wine_name,variety,winery,year
40855,France,"A beautiful, pure wine that combines freshnes...",3.077465,2.722388,Burgundy,Clos de Lambrays,Domaine des Lambrays 2005 Clos de Lambrays,Pinot Noir,Domaine des Lambrays,2005


In [182]:
# Query the fitted model: returns the distances to, and row positions of,
# the nearest neighbours of the query wine.
distances, indices = nearest_neighbors.kneighbors(query_wine.to_numpy())

In [183]:
# Display both arrays; the first neighbour is the query row itself (distance 0).
distances, indices

(array([[0.        , 0.19912522, 0.28684633, 0.40565365, 0.41916352,
         0.42960276]]),
 array([[ 40855,  40856,  38181,  39577, 108400,  40858]]))

In [184]:
# Checking the result of the ML model on a hand-picked list of row labels.
# NOTE: this rebinds `indices`, replacing the array returned by kneighbors.
indices = [ 30372, 77271, 97735, 100763, 67662, 51360 ]

# Look the rows up in the unscaled frame so points/price show their original values.
selected_rows = wine_list_read_data.loc[indices]
selected_rows

Unnamed: 0,country,description,points,price_USD,province,region,wine_name,variety,winery,year
30372,France,This is a fabulous wine from the greatest Cham...,100.0,259,Champagne,Champagne,Krug 2002 Brut (Champagne),Champagne Blend,Krug,2002
77271,France,This latest incarnation of the famous brand is...,100.0,250,Champagne,Champagne,Louis Roederer 2008 Cristal Vintage Brut (Cham...,Champagne Blend,Louis Roederer,2008
97735,Italy,It takes only a few moments before you appreci...,100.0,270,Tuscany,Brunello di Montalcino,Casanova di Neri 2007 Cerretalto (Brunello di ...,Sangiovese Grosso,Casanova di Neri,2007
100763,Portugal,The sweetness of the wine vies with the acidit...,99.0,268,Port,Unknown,Taylor Fladgate 2011 Vargellas Vinhas Velhas V...,Port,Taylor Fladgate,2011
67662,Italy,Even better than the highly acclaimed 2001 vin...,99.0,250,Tuscany,Toscana,Tenuta dell'Ornellaia 2004 Masseto Merlot (Tos...,Merlot,Tenuta dell'Ornellaia,2004
51360,Italy,Sperss (the name is inspired by the local word...,98.0,255,Piedmont,Langhe,Gaja 2006 Sperss Nebbiolo (Langhe),Nebbiolo,Gaja,2006


In [185]:
# Wine names to look up in the wine table; matching below is by exact string.
# presumably these are the wines listed in the cheese pairing table (verify against the CSV)
wine_names = ["Domaine des Lambrays 2005 Clos de Lambrays",
              "Krug 2002 Brut (Champagne)",
              "Domaine Leflaive 2010 Bâtard-Montrachet",
              "Jean-Michel Dupré 2015 1935 Vieilles Vignes (Morgon)",
              "Artner 2012 Steinäcker Zweigelt (Carnuntum)",
              "Robert Biale 2014 Limerick Lane Vineyard Zinfandel (Russian River Valley)",
              "Bisol 2007 Cartizze (Prosecco Superiore di Cartizze)",
              "Domaine Zind-Humbrecht 2012 Clos Saint Urbain Rangen de Thann Grand Cru Gewurztraminer (Alsace)",
              "Bodegas Roda 2009 Cirsion (Rioja)",
              "Pittacum 2012 La Prohibición Garnacha Tintorera (Vino de la Tierra de Castilla y León)",
              "Bodega del Abad 2011 Carracedo Mencía (Bierzo)",
              "Domaine Vacheron 2014 L'Enclos des Remparts Sauvignon Blanc (Vin de France)",
              "Domaine Huët 2009 Clos du Bourg Première Trie Moelleux (Vouvray)",
              "Schloss Gobelsburg 2015 Eiswein Grüner Veltliner (Niederösterreich)",
              "Alpha Omega 2012 Stagecoach Vineyard Cabernet Sauvignon (Atlas Peak)",
              "François Lurton 2011 Alka Carmenère (Colchagua Valley)",
              "Masciarelli 2005 Villa Gemma (Montepulciano d'Abruzzo)",
              "Château Vignelaure 2016 Rosé (Coteaux d'Aix-en-Provence)",
              "Pietradolce 2016 Rosato (Etna)",
              "Müller-Catoir 2007 Breumel in den Mauren Trockenbeerenauslese Riesling (Pfalz)",
              "Château La Nerthe 2012 Clos de Beauvenir White (Châteauneuf-du-Pape)",
              "Capichera 2013 VT Vermentino (Isola dei Nuraghi)",
              "Bucci 2013 Villa Bucci Riserva (Verdicchio dei Castelli di Jesi Classico Superiore)",
              "Château du Cèdre 2012 GC Malbec (Cahors)",
              "Penfolds 2008 Grange Shiraz (South Australia)",
              "Bodegas Gutiérrez de la Vega 1999 Casta Diva Fondillón Sweet Monastrell (Alicante)",
              "Krutzler 2012 Perwolff Blaufränkisch (Südburgenland)",
              "Château Pontet-Canet 2009 Barrel sample (Pauillac)",
              "Tenuta dell'Ornellaia 2007 Masseto Merlot (Toscana)"]

# Keep only the rows of the unscaled wine table whose wine_name is in the list.
filtered_df = wine_list_read_data[wine_list_read_data["wine_name"].isin(wine_names)]

# Row labels of the matching wines, in the order they appear in the wine table.
index_wine_cheese = filtered_df.index.tolist()


In [186]:
# Expose the original row labels as an 'index' column, then keep just that
# label and the wine name.
filtered_df = filtered_df.reset_index()[['index', 'wine_name']]



In [187]:
# Attach each paired wine's row label ('index') to the pairing table, then
# order the rows by that label. merge() defaults to an inner join, so only
# pairing rows whose wine_name appears in filtered_df are kept.
wine_cheese_data = wine_cheese_data.merge(filtered_df, on='wine_name').sort_values('index', ascending=True)

In [188]:
# Renumber the rows 0..n-1 after sorting and discard the old row labels.
wine_cheese_data = wine_cheese_data.reset_index(drop=True)

In [189]:
# Find the nearest neighbours of every wine that appears in the pairing table.
# All query rows go through one batched kneighbors call rather than one call
# per wine; the result is still one list of neighbour row positions per wine,
# in the order of `index_wine_cheese`.
# NOTE(review): exact ties could in principle be ordered differently from the
# per-row queries because of floating-point rounding.
query_rows = numerical_features.iloc[index_wine_cheese].to_numpy()
if len(query_rows) > 0:
    _, indices = nearest_neighbors.kneighbors(query_rows)
    neighbors = indices.tolist()
else:
    # kneighbors rejects an empty query matrix; no paired wines means no neighbours.
    neighbors = []

neighbors
                  

[[11895, 112692, 77273, 34861, 102757, 56817],
 [12447, 17981, 54848, 17976, 87776, 103864],
 [14971, 66088, 103506, 77037, 13556, 1165],
 [21093, 23297, 103524, 22839, 34494, 52669],
 [27741, 74921, 93121, 38363, 102753, 45530],
 [30372, 35163, 99777, 77271, 97735, 6032],
 [32731, 38178, 108992, 67662, 100762, 38195],
 [34862, 34861, 77273, 89070, 106522, 28102],
 [36418, 74921, 82207, 37527, 32480, 102753],
 [37207, 2529, 38373, 71492, 45488, 45476],
 [38188, 90660, 45472, 25008, 8283, 63831],
 [40855, 40856, 38181, 39577, 108400, 40858],
 [47634, 49980, 89976, 75018, 47946, 41240],
 [48391, 32014, 61033, 110819, 36003, 112777],
 [65181, 89401, 12606, 112544, 22839, 16076],
 [65757, 51360, 100764, 30374, 95519, 108401],
 [66097, 44220, 26809, 30349, 38455, 39537],
 [79241, 66931, 106527, 65346, 50568, 38493],
 [85265, 30373, 47791, 39575, 77272, 103712],
 [90875, 89751, 24298, 69588, 100983, 10065],
 [93122, 81041, 13553, 85294, 12893, 80966],
 [93696, 12897, 12892, 93130, 66772, 169

In [190]:
# One row per queried wine, one column per returned neighbour.
# The six column names must match the number of neighbours in each list.
neighbors_df = pd.DataFrame(neighbors, columns=["neighbor_1", "neighbor_2", "neighbor_3", "neighbor_4", "neighbor_5", "neighbor_6"])

In [191]:
# Attach each paired wine's neighbour list to its row in the pairing table.
# pd.concat(axis=1) aligns rows on the index, so both frames must have the
# same number of rows. Otherwise the shorter one is silently padded with NaN
# and cheeses end up next to the wrong (or missing) neighbours.
if len(wine_cheese_data) != len(neighbors_df):
    raise ValueError(
        f"Cannot align pairings with neighbours: wine_cheese_data has "
        f"{len(wine_cheese_data)} rows but neighbors_df has {len(neighbors_df)}."
    )
wine_cheese_neigbors = pd.concat([wine_cheese_data, neighbors_df], axis=1)
# Persist the combined table so the recommender can be used without refitting.
wine_cheese_neigbors.to_csv('wine_cheese_neigbors.csv')

In [192]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Interactive recommender: ask for a cheese, look up the wines paired with it,
# and print the stored nearest-neighbour wines. Keeps prompting until the
# input matches a known cheese exactly (ignoring case and outer whitespace).
recommendation_counter = 1
while True:
    input_cheese = input("Let me know your favorite cheese to recommend you the best-fitted wine!")

    # Compare case-insensitively and ignore accidental leading/trailing spaces.
    input_cheese = input_cheese.strip().lower()

    available_cheeses_lower = wine_cheese_neigbors["cheese"].str.lower()
    if input_cheese in available_cheeses_lower.values:
        filtered_df = wine_cheese_neigbors[available_cheeses_lower == input_cheese]
        # Collect the neighbour row labels of every pairing row for this cheese.
        neighbors_list = filtered_df[['neighbor_1', 'neighbor_2', 'neighbor_3', 'neighbor_4', 'neighbor_5', 'neighbor_6']].values.flatten()
        # Look the wines up in the unscaled table so points/price print as original values.
        selected_rows = wine_list_read_data[wine_list_read_data.index.isin(neighbors_list)]
        selected_rows = selected_rows.drop(columns=['province', 'winery'])

        for _, row in selected_rows.iterrows():
            print(f"Recommendation-{recommendation_counter}")
            print("Country:", row['country'])
            print("Description:", row['description'])
            print("Wine_Name:", row['wine_name'])
            print("Variety:", row['variety'])
            print("Year:", row['year'])
            print("Points:", row['points'])
            print("Price: ", row['price_USD'])
            print()
            recommendation_counter += 1
        break

    # No exact match: suggest close spellings, or list everything available.
    # Missing cheese names are dropped so they cannot break matching or join().
    cheese_options = available_cheeses_lower.dropna().unique()
    matches = process.extract(input_cheese, cheese_options, scorer=fuzz.token_set_ratio)
    matches = [match[0] for match in matches if match[1] >= 70]  # Minimum similarity ratio of 70

    if matches:
        print("Did you mean one of the following cheeses?")
        print(", ".join(matches))
    else:
        print("Sorry, this cheese is not available. Please try another type of cheese.")
        print("Available cheeses are:")
        # Previously referenced the undefined name `available_cheeses`,
        # which raised NameError whenever this branch was reached.
        print(", ".join(cheese_options))

    print()
    continue


Recommendation-1
Country: France
Description: A beautiful wine, with the firmest tannins surrounded by perfumed fruit. It is dense, of course, but this density is balanced with great elegance, blackberry fruits, sweetness and final juicy acidity. The wine is structured, a powerhouse of concentration while preserving this complete style.
Wine_Name: Château Palmer 2009 Margaux
Variety: Bordeaux-style Red Blend
Year: 2009
Points: 98.0
Price:  380

Recommendation-2
Country: France
Description: A big, bold wine with unbelievable power and concentration. Low yields and a dominance of Cabernet Franc have produced an immensely concentrated wine. Still very young, this magnificent wine holds the promise of great aging.
Wine_Name: Château Ausone 2010 Saint-Émilion
Variety: Bordeaux-style Red Blend
Year: 2010
Points: 99.0
Price:  340

Recommendation-3
Country: France
Description: Stern, almost severe initially, this great wine takes time to show its immense fruit power. Black currant and blackber