In [4]:
%pip install plotly.express


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd 
import difflib
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly_express as px

## Reading Data

In [5]:
data = pd.read_csv("aa.csv")

## Cleaning Data

In [6]:
def data_clean(data):
    data['Format'] = data['Format'].str.replace('L', '').str.replace(',', '.').astype(float)
    data['Alcohol content'] = data['Alcohol content'].apply(lambda a: float(str(a).replace(',', '.').replace('%', '')))
    data['Temperature'] = data['Temperature'].str.replace('°C', '')
    data['Average Rating'] = data['Average Rating'].str.replace(',', '.')
    data['Average Rating'] = data['Average Rating'].astype(str).apply(lambda a: float(a.strip("(''),."))).astype(float)
    data['Price'] = data['Price'].str.replace(',', '.').astype(str).apply(lambda a: float(a.strip("(''),."))).astype(float)
    data['Full name'] = data['Full name'].drop_duplicates()
    data.dropna(subset=['Full name'], inplace=True)
    return data
data = data_clean(data)

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2211 entries, 0 to 2487
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Image            2211 non-null   object 
 1   Type of wine     2209 non-null   object 
 2   Body             1937 non-null   object 
 3   Origin           1923 non-null   object 
 4   Ideal with       1903 non-null   object 
 5   Aroma            1918 non-null   object 
 6   Full name        2211 non-null   object 
 7   Appellation      1885 non-null   object 
 8   Vintage          1336 non-null   float64
 9   Dosage           1273 non-null   object 
 10  Country          1273 non-null   object 
 11  Region           1272 non-null   object 
 12  Grape            1265 non-null   object 
 13  Ageing           1031 non-null   object 
 14  Format           1031 non-null   float64
 15  Alcohol content  1024 non-null   float64
 16  Temperature      1008 non-null   object 
 17  Sulfites         10

In [7]:
new_data = data[['Image', 'Type of wine', 'Ideal with', 'Aroma', 'Full name', 'Price', 'Description']]

In [8]:
for feature in new_data:
    new_data[feature] = new_data[feature].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data[feature] = new_data[feature].fillna('')


In [9]:
combined_features = new_data['Type of wine'] + ' ' + new_data['Ideal with'] + ' ' + new_data['Aroma'] + ' ' + new_data['Full name'] + ' ' + new_data['Description']

## Visualization

In [27]:
type_counts = data['Type of wine'].value_counts().reset_index().head(10)
type_counts.columns = ['Type of wine', 'Count']

fig = px.histogram(type_counts, x='Type of wine', y='Count', 
             title='Count of Each Type of Wine', height=500, width=1100, text_auto=True, barmode='group')
fig.update_layout(title="The most type of wine in the store")
fig.show()

In [26]:
type_counts = data['Aroma'].value_counts().reset_index().head(10)
type_counts.columns = ['Aroma', 'Count']

fig = px.pie(type_counts, names='Aroma', values='Count', 
             title='Count of Each Type of Wine', height=800, width=800)
fig.update_layout(title="Distribution of aromas")
fig.show()

In [25]:
filtered_data = data[data['Type of wine'].notna()]

fig = px.sunburst(filtered_data.dropna(subset=['Country']), path=['Type of wine', 'Country'], values='Average Rating', color='Type of wine')
fig.update_layout(title='Average Rating of type of wines')
fig.show()


In [20]:
type_counts = data['Country'].value_counts().reset_index()
type_counts.columns = ['Country', 'Count']

fig = px.histogram(type_counts, x='Country', y='Count', 
             title='Count of Each Type of Wine', height=500, width=1100, text_auto=True, barmode='group')
fig.update_layout(title="Distribution of wines count by Country")
fig.show()

## Machine Learning

In [14]:
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)
similarity = cosine_similarity(feature_vectors)
vino_name = input('Enter vino name: ')
list_of_all_full_names = data['Full name'].tolist()
filtered_list = [name for name in list_of_all_full_names if isinstance(name, str)]
find_close_match = difflib.get_close_matches(vino_name, filtered_list)
close_match = find_close_match[0]

In [15]:
if vino_name in new_data['Full name'].values:
    index = new_data[new_data['Full name'] == vino_name].index[0]
    similarity_score = list(enumerate(similarity[index]))
    sorted_similar_vines = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    print("Suggested vines for you: \n ")

    i = 1
    vines = []

    for vine in sorted_similar_vines:
        index = vine[0]
        if index in new_data.index:
            title_from_index = new_data.loc[index, 'Full name']
            if pd.notna(title_from_index):
                vines.append((title_from_index, vine[1]))
        else:
            pass
        
    for vine, score in vines[:5]:
        print(f"{i}. {vine} - Similarity Score: {score*100:.1f}%")
        i += 1
else:
    print("The specified 'vino_name' does not exist in the DataFrame.")

Suggested vines for you: 
 
1. Brunello di Montalcino DOCG 2019 Fattoria dei Barbi - Similarity Score: 100.0%
2. Brunello di Montalcino DOCG 2018 Fattoria dei Barbi - Similarity Score: 99.3%
3. Brunello di Montalcino DOCG 2018 Geografico - Similarity Score: 47.6%
4. Brunello di Montalcino DOCG Collezione del Conte 2018 Villa da Filicaja - Similarity Score: 45.1%
5. Brunello di Montalcino Riserva DOCG 2017 Pian del Prete - Similarity Score: 43.5%


## Loading Data

In [16]:
pickle.dump(new_data, open('wines.pkl', 'wb'))

In [17]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))