## Importing necessary libraries

In [None]:
import pandas as pd 
import difflib
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly_express as px

## Reading Data

In [None]:
# Load the raw wine data set; the relative path is resolved against the
# current working directory.
data = pd.read_csv("full_data.csv")

## Cleaning Data

In [None]:
def _strip_to_float(value):
    """Return *value* as a float after trimming stray ( ) ' , . characters."""
    return float(str(value).strip("(''),."))


def data_clean(data):
    """Convert the text columns to usable types and de-duplicate the wines.

    The frame is modified in place and the same object is returned.

    Steps:
      * ``Format``: remove the ``L`` unit, use ``.`` as the decimal separator
        and cast to float.
      * ``Alcohol content``: remove ``%``, use ``.`` as the decimal separator
        and cast to float.
      * ``Temperature``: remove the ``°C`` unit (values remain strings).
      * ``Average Rating`` / ``Price``: use ``.`` as the decimal separator,
        trim surrounding ``(``, ``)``, ``'``, ``,`` and ``.`` characters and
        cast to float.
      * Keep only the first row of every ``Full name`` and drop rows that
        have no name.
      * Renumber the index 0..n-1 so that row labels equal row positions.

    Missing numeric values stay NaN.

    Args:
        data: DataFrame holding the columns listed above.

    Returns:
        The cleaned DataFrame (the same object that was passed in).

    Raises:
        ValueError: if an alcohol, rating or price cell is not numeric after
            the clean-up.
    """
    data['Format'] = (
        data['Format']
        .str.replace('L', '', regex=False)
        .str.replace(',', '.', regex=False)
        .astype(float)
    )
    data['Alcohol content'] = data['Alcohol content'].apply(
        lambda a: float(str(a).replace(',', '.').replace('%', ''))
    )
    data['Temperature'] = data['Temperature'].str.replace('°C', '', regex=False)

    # Cast to str *before* using .str so that cells that are already numeric
    # are kept instead of being turned into NaN by the .str accessor.
    for column in ('Average Rating', 'Price'):
        data[column] = (
            data[column]
            .astype(str)
            .str.replace(',', '.', regex=False)
            .apply(_strip_to_float)
            .astype(float)
        )

    # drop_duplicates() keeps the first occurrence; assigning it back aligns
    # on the index, so repeated names become NaN and are removed by dropna.
    data['Full name'] = data['Full name'].drop_duplicates()
    data.dropna(subset=['Full name'], inplace=True)

    # Rows were removed above; without this the index has gaps and no longer
    # matches positional structures (e.g. a similarity matrix) built from
    # the frame.
    data.reset_index(drop=True, inplace=True)
    return data
# Run the clean-up; data_clean modifies the frame it receives and returns it,
# so `data` refers to the cleaned table from here on.
data = data_clean(data)

In [None]:
# Columns kept for the text features and for the exported table.
# .copy() makes this an independent frame, so assigning to its columns later
# neither warns about chained assignment nor depends on `data`.
new_data = data[['Image', 'Type of wine', 'Ideal with', 'Aroma', 'Full name', 'Price', 'Description']].copy()

In [None]:
# Replace missing values in every column (Price included) with an empty
# string. A single DataFrame.fillna builds a new frame, which avoids
# assigning column by column into a frame derived from a slice of `data`.
new_data = new_data.fillna('')

In [None]:
# One space-separated text per wine, built from the descriptive columns in
# this order; this is the text the vectorizer is fitted on.
text_columns = ['Type of wine', 'Ideal with', 'Aroma', 'Full name', 'Description']
combined_features = new_data[text_columns[0]]
for text_column in text_columns[1:]:
    combined_features = combined_features + ' ' + new_data[text_column]

## Visualizations

In [None]:
# Ten most frequent wine types as a two-column frame (type, number of rows).
type_counts = (
    data['Type of wine']
    .value_counts()
    .head(10)
    .rename_axis('Type of wine')
    .reset_index(name='Count')
)

fig = px.histogram(
    type_counts,
    x='Type of wine',
    y='Count',
    title='Count of Each Type of Wine',
    height=600,
    width=1300,
    text_auto=True,
    barmode='group',
)
# The title set here replaces the one passed to px.histogram above.
fig.update_layout(title="What is the most type of wine in the store?")
fig.show()

In [None]:
# Ten most frequent aromas as a two-column frame (aroma, number of rows).
type_counts = data['Aroma'].value_counts().reset_index().head(10)
type_counts.columns = ['Aroma', 'Count']

# The title is passed once, directly to the constructor; the previous
# constructor title was copied from the wine-type chart and did not describe
# this figure.
fig = px.pie(type_counts, names='Aroma', values='Count',
             title="Distribution of vino aromas", height=800, width=800)
fig.show()

In [None]:
# Both levels of the sunburst path must be present: rows without a wine type
# are removed here, rows without a country just before plotting.
filtered_data = data.dropna(subset=['Type of wine'])

# NOTE(review): px.sunburst sizes each sector from the `values` column of the
# rows it contains (summed, per the Plotly docs), so this presumably shows
# summed ratings rather than averages - confirm that matches the title.
fig = px.sunburst(
    filtered_data.dropna(subset=['Country']),
    path=['Type of wine', 'Country'],
    values='Average Rating',
    color='Type of wine',
)
fig.update_layout(title='Average Rating of type of wines')
fig.show()


In [None]:
# Number of rows per country as a two-column frame (country, count).
type_counts = data['Country'].value_counts().reset_index()
type_counts.columns = ['Country', 'Count']

# The title is passed once, directly to the constructor; the previous
# constructor title was copied from the wine-type chart and did not describe
# this figure.
fig = px.histogram(type_counts, x='Country', y='Count',
                   title="Distribution of wines count by Country",
                   height=600, width=1300, text_auto=True, barmode='group')
fig.show()

## Machine Learning

In [None]:
# Vectorise the combined text of every wine with TF-IDF and compute the
# pairwise cosine similarity; row/column i of `similarity` corresponds to the
# i-th entry of `combined_features`.
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)
similarity = cosine_similarity(feature_vectors)

# Ask for a wine name and look up the closest known names.
vino_name = input('Enter vino name: ')
list_of_all_full_names = data['Full name'].tolist()
# difflib compares strings only, so anything that is not a str is left out.
filtered_list = [name for name in list_of_all_full_names if isinstance(name, str)]
find_close_match = difflib.get_close_matches(vino_name, filtered_list)
# get_close_matches returns an empty list when no name is similar enough;
# indexing it unconditionally raised IndexError in that case.
close_match = find_close_match[0] if find_close_match else None

In [None]:
# Print the five wines most similar to `vino_name` (exact name match).
#
# `similarity` is indexed by row POSITION (it was computed from the rows of
# `new_data` in order), whereas new_data.index holds LABELS that can have gaps
# after rows were dropped. Everything below therefore works with positions in
# a plain list of names instead of mixing labels and positions.
full_names = new_data['Full name'].tolist()

if vino_name in full_names:
    # Position of the first wine carrying exactly this name.
    index = full_names.index(vino_name)
    similarity_score = list(enumerate(similarity[index]))
    # Highest score first; sorted() is stable, so ties keep row order.
    sorted_similar_vines = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    print("Suggested vines for you: \n ")

    vines = [
        (full_names[position], score)
        for position, score in sorted_similar_vines
        if pd.notna(full_names[position])
    ]

    for i, (vine, score) in enumerate(vines[:5], start=1):
        print(f"{i}. {vine} - Similarity Score: {score*100:.1f}%")
else:
    print("The specified 'vino_name' does not exist in the DataFrame.")

## Saving Data

In [None]:
# Write the wine table to disk; the context manager closes the file handle
# (and flushes the data) even if pickling fails.
with open('wines.pkl', 'wb') as wines_file:
    pickle.dump(new_data, wines_file)

In [None]:
# Write the similarity matrix to disk; the context manager closes the file
# handle (and flushes the data) even if pickling fails.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)