# Recommendation Models

### Imports

In [19]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Loading in Data

In [40]:
df = pd.read_csv('fivek_subset_data.csv')

### Baseline Model (based on random recommendations)

In [41]:
# Function to generate random recommendations
def random_recommender(df, num_recommendations=5):
    # Shuffle the DataFrame randomly and select the first num_recommendations rows
    random_recs = df.sample(n=num_recommendations, random_state=42)  
    return random_recs

#### Since the baseline recommender system above is generating random recommendations, it doesn't make sense to evaluate it using traditional recommendation metrics like precision or recall. In this case, the most appropriate evaluation would involve qualitative assessment and comparison with the other methods I will employ, cosine similarity & Euclidian Distance.

### Cosine Similarity Model

In [46]:
# Function to recommend based on cosine similarity

def cosine_recommender(df, accord, num_recommendations=5):
    filtered_df = df[df['main_accords'].str.contains(accord, case=False, na=False)]
    if filtered_df.empty:
        return pd.DataFrame()
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(filtered_df['main_accords'])
    similarities = cosine_similarity(tfidf_matrix)
    similar_indices = similarities.argsort(axis=1)[:, ::-1][:, 1:]
    cosine_recs = pd.DataFrame(columns=filtered_df.columns)
    for indices in similar_indices:
        similar_perfumes = filtered_df.iloc[indices[:num_recommendations]]
        cosine_recs = pd.concat([cosine_recs, similar_perfumes])
    return cosine_recs

### Euclidian Distance Model

In [47]:
# Function to recommend based on euclidean distance

def euclidean_recommender(df, accord, num_recommendations=5):
    filtered_df = df[df['main_accords'].str.contains(accord, case=False, na=False)]
    if filtered_df.empty:
        return pd.DataFrame()
    main_accords_list = filtered_df['main_accords'].str.split(', ')
    main_accords_array = main_accords_list.apply(lambda x: [1 if accord in x else 0])
    main_accords_matrix = np.vstack(main_accords_array)
    distances = euclidean_distances(main_accords_matrix)
    similar_indices = distances.argsort(axis=1)[:, :num_recommendations]
    euclidean_recs = pd.DataFrame(columns=filtered_df.columns)  
    for indices in similar_indices:
        similar_perfumes = filtered_df.iloc[indices]
        euclidean_recs = pd.concat([euclidean_recs, similar_perfumes])
    return euclidean_recs

### Model Comparison: Baseline vs. Euclidian vs. Cosine Similarity

In [48]:
# Function to compare the output of the baseline, euclidian, and cosine models

def compare_recommendations(df):
    # Prompt the user for their preferred fragrance accord
    accord = input("Enter your preferred fragrance accord: ")

    # Convert the input to lowercase for consistency
    accord = accord.lower()

    # Get recommendations based on Euclidean Distance
    euclidean_recs = euclidean_recommender(df, accord).drop_duplicates().head(5)

    # Get recommendations based on Cosine Similarity
    cosine_recs = cosine_recommender(df, accord).drop_duplicates().head(5)

    # Get random recommendations
    random_recs = random_recommender(df, 5).drop_duplicates().head(5)

    # Create a DataFrame for comparison
    comparison_df = pd.DataFrame({
        "Euclidean Recs": euclidean_recs["perfume"].tolist(),
        "Cosine Recs": cosine_recs["perfume"].tolist(),
        "Random Recs": random_recs["perfume"].tolist()
    })

    return comparison_df

# Load your DataFrame here
df = pd.read_csv('fivek_subset_data.csv')

# Call the comparison function
comparison_result = compare_recommendations(df)
print(comparison_result)

Enter your preferred fragrance accord:  wine


    Euclidean Recs      Cosine Recs                    Random Recs
0         Une Rose    Malaga Cooler               Best Free Lander
1    Malaga Cooler       Blood Kiss                        Trigger
2       Blood Kiss  The Black Tower                    My Torrente
3  The Black Tower      Leatherwood  Marc Jacobs Autumn Splash Ivy
4      Leatherwood         Une Rose                          V Ete


### Accords Within Recommended Perfumes

In [45]:
# List of selected perfumes
selected_perfumes = ["Une Rose", "Best Free Lander", "Malaga Cooler", "Trigger", "Blood Kiss",
                     "My Torrente", "The Black Tower", "Marc Jacobs Autumn Splash Ivy",
                     "Leatherwood", "V Ete"]

# Filter the DataFrame based on the selected perfumes
filtered_df = df[df['perfume'].isin(selected_perfumes)]

# Display the associated accords for each perfume
for perfume in selected_perfumes:
    accords = filtered_df[filtered_df['perfume'] == perfume]['main_accords'].values[0]
    print(f"{perfume}: {accords}")

Une Rose: rose, fresh spicy, aromatic, wine, floral
Best Free Lander: green, aromatic, citrus, woody, earthy
Malaga Cooler: citrus, wine, fresh, fresh spicy, conifer
Trigger: woody, floral, sweet, powdery, amber
Blood Kiss: wine, sweet, woody, cherry, vanilla, aromatic
My Torrente: fruity, floral, woody, fresh, sweet, nutty
The Black Tower: woody, green, wine, fresh, ozonic, balsamic
Marc Jacobs Autumn Splash Ivy: woody, aromatic, powdery, warm spicy, citrus
Leatherwood: wine, warm spicy, tuberose, leather, tobacco, animalic
V Ete: woody, powdery, floral, rose, fresh


### Comparison Analysis
* None of the randomly recommended perfumes contained the "wine" accord.
* The recommended perfumes based on Cosine Similarity and Euclidean Distance were the exact same, but in a different order. 
* Therefore, both methods seem to agree on the similar perfumes for the input accord "wine".

### Which Model Should I Choose? 

* The Euclidian Distance model proved to be much less computationally intensive than Cosine Similarity.
* The Cosine Similarity Matrix was too large of a file to run through streamlit, even when I utilized a smaller subset of the data.
* For the above reasons, I will be using Euclidian Distance for my final recommendation system.
* The code for this can be found in "Recommender_System.py" 