In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
%matplotlib inline

# Building a Recommendation Engine. 

## Content-based filtering
* This is a method that uses only information about the description and attributes of the items users has previously consumed to model user's preferences.  It is a very popular method for recommender systems.
* The method is based on the idea that if a user likes an item, he or she will also like items that are similar to it.
* It uses a series of discrete characteristics of an item in order to recommend additional items with similar properties.

In [15]:
df = pd.read_csv('df.csv', nrows=5000)
df.head()

Unnamed: 0,UID,PRODUCT_NAME,IMAGE_URL,PRODUCT_PRICE,PRODUCT_PRICE_TREATED_OUTLIERS,PRODUCT_LINK,PRODUCT_INFORMATION_T,CATEGORY,SUB_CATEGORY,PRICE_PER_KG/L,...,PROTEIN,SALT,FAT,INGREDIENTS,STORAGE_INFORMATION,STORE_NAME,PRICE_PER_KG/L_UNIT,ENERGY_KJ,NUTRITIONAL_LABEL,combo
0,138,Almdudler Original Herb Lemonade,https://imageproxy.wolt.com/menu/menu-images/6...,2.24,2.24,https://wolt.com/en/deu/berlin/venue/flink-kar...,Alpine herbal lemonade,Food & Beverage,Soft Drinks,1.99,...,6.540362,6.13861,12.989936,"Water, sugar, carbon dioxide, acidifier citric...",Please store in a dry place and protect from h...,Wolt: Flink Karl Liebknecht,Kg,596.656412,high_fiber,Food & Beverage Soft Drinks Almdudler Original...
1,151,Almdudler Sugar Free Herb Lemonade,https://imageproxy.wolt.com/menu/menu-images/6...,2.24,2.24,https://wolt.com/en/deu/berlin/venue/flink-kar...,Alpine herb lemonade without sugar with sweete...,Food & Beverage,Soft Drinks,1.99,...,6.540362,6.13861,12.989936,"Water, carbon dioxide, acidifier citric acid, ...",Please store in a dry place and protect from h...,Wolt: Flink Karl Liebknecht,Kg,596.656412,high_fiber,Food & Beverage Soft Drinks Almdudler Sugar Fr...
2,267,almond butter brown 250g,https://static.mueller.de/markant_041044201797...,5.99,5.99,https://www.mueller.de/p/alnatura-mandelmus-br...,Product information An intensely aromatic pure...,Food & Beverage,Sweet spreads,23.96,...,22.0,0.01,53.0,ALMONDS* unpeeled *from organic farming. May c...,"Best before: see lid, summer de preference ava...",Muller,Kg,2559.002825,very unhealthy,Food & Beverage Sweet spreads almond butter br...
3,270,almond butter white 250g,https://static.mueller.de/markant_041044201811...,6.99,6.99,https://www.mueller.de/p/alnatura-mandelmus-we...,Product information A fine puree made from 100...,Food & Beverage,Sweet spreads,27.96,...,22.0,0.01,61.0,ALMONDS* peeled *from organic farming. May con...,"Best before: see lid, summer de preference ava...",Muller,Kg,2781.00307,high_fiber,Food & Beverage Sweet spreads almond butter wh...
4,275,Almond Cranberry Fruit Bar 75G,https://static.mueller.de/markant_000000424067...,1.69,1.69,https://www.mueller.de/p/alnatura-mandel-cranb...,Product information Two ideally harmonizing la...,Food & Beverage,Sweet & salty,22.53,...,7.0,0.08,11.0,"Rice syrup*, banana flakes* 20%, WHOLEMEAL OAT...",Please store in a dry place and protect from h...,Muller,Kg,1670.001844,healthy,Food & Beverage Sweet & salty Almond Cranberry...


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   UID                             5000 non-null   int64  
 1   PRODUCT_NAME                    5000 non-null   object 
 2   IMAGE_URL                       5000 non-null   object 
 3   PRODUCT_PRICE                   5000 non-null   float64
 4   PRODUCT_PRICE_TREATED_OUTLIERS  5000 non-null   float64
 5   PRODUCT_LINK                    5000 non-null   object 
 6   PRODUCT_INFORMATION_T           5000 non-null   object 
 7   CATEGORY                        5000 non-null   object 
 8   SUB_CATEGORY                    5000 non-null   object 
 9   PRICE_PER_KG/L                  5000 non-null   float64
 10  SATURATED_FATTY_ACIDS           5000 non-null   float64
 11  CARBOHYDRATES                   5000 non-null   float64
 12  SUGAR                           50

In [17]:
print(f'The dataset has {df.shape[0]} rows and {df.shape[1]} columns', end='\n\n')
print(f'The dataset has {df.isnull().sum().sum()} missing values', end='\n\n')
print(f'The dataset has {df.duplicated().sum()} duplicated rows', end='\n\n')

The dataset has 5000 rows and 23 columns

The dataset has 0 missing values

The dataset has 0 duplicated rows



In [18]:
# Impute missing values of numeric columns with the mean with 50% threshold
for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().sum() > 0.5 * df.shape[0]:
        df.drop(col, axis=1, inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

# Impute missing values of categorical columns with the mode with 50% threshold
for col in df.select_dtypes(include='object').columns:
    if df[col].isnull().sum() > 0.5 * df.shape[0]:
        df.drop(col, axis=1, inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)
        

df.info()
    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   UID                             5000 non-null   int64  
 1   PRODUCT_NAME                    5000 non-null   object 
 2   IMAGE_URL                       5000 non-null   object 
 3   PRODUCT_PRICE                   5000 non-null   float64
 4   PRODUCT_PRICE_TREATED_OUTLIERS  5000 non-null   float64
 5   PRODUCT_LINK                    5000 non-null   object 
 6   PRODUCT_INFORMATION_T           5000 non-null   object 
 7   CATEGORY                        5000 non-null   object 
 8   SUB_CATEGORY                    5000 non-null   object 
 9   PRICE_PER_KG/L                  5000 non-null   float64
 10  SATURATED_FATTY_ACIDS           5000 non-null   float64
 11  CARBOHYDRATES                   5000 non-null   float64
 12  SUGAR                           50

In [19]:
# Choose the features to use for similarity. We'll use the ingredients, product name and brand name
df['combo'] = df['CATEGORY'] + ' ' + df['SUB_CATEGORY'] + ' ' + df['PRODUCT_NAME'] + ' ' + df['INGREDIENTS']

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=50, max_df=.9)
count_matrix = cv.fit_transform(df['combo'])

* We will use the cosine similarity to calculate a numeric quantity that denotes the similarity between two products. Mathematically, it is defined as follows:
	* $cos(x, y) = \frac{x \cdot y}{\left\Vert x \right\Vert \left\Vert y \right\Vert}$

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix) 

In [21]:
def get_recommendations(PRODUCT_NAME, top_n=3):
    product_index = df[df['CATEGORY']==PRODUCT_NAME].index[0]
    similar_products = cosine_sim[product_index].argsort()[:-top_n:-1]
    return df.iloc[similar_products].PRODUCT_NAME.values 

In [22]:
get_recommendations('Food & Beverage')

array(['Almdudler Original Herb Lemonade',
       'Almdudler Original herbal lemonade 1l'], dtype=object)

* The above function is an example of a content-based recommender system. It recommends products based on the similarity of their features. It is known as a content-based recommender because the features of the products are used to recommend other products.
* The function takes in a product name and returns the top 5 similar products.


* We could also use the Pearson Correlation Coefficient to calculate a numeric quantity that denotes the similarity
between two products. Mathematically, it is defined as follows:
    * $\rho_{X,Y} = \frac{cov(X,Y)}{\sigma_X \sigma_Y} = \frac{E[(X-\mu_X)(Y-\mu_Y)]}{\sigma_X \sigma_Y}$
    * $\rho_{X,Y} = \frac{\sum_{i=1}^{n}(x_i - \mu_X)(y_i - \mu_Y)}{\sqrt{\sum_{i=1}^{n}(x_i - \mu_X)^2}\sqrt{\sum_{i=1}^{n}(y_i - \mu_Y)^2}}$
    

* We will use the Euclidean Distance to calculate the distance between two vectors. Mathematically, it is defined as follows:
    * $d(p, q) = \sqrt{\sum_{i=1}^{n}(p_i - q_i)^2}$

In [10]:
# Saving the model
import pickle
pickle.dump(cosine_sim, open('cosine_sim.pkl', 'wb'))

## Topic-Based Recommender Systems
* We will use the Latent Dirichlet Allocation (LDA) to recommend products based on the topics. LDA is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. For example, if observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word's presence is attributable to one of the document's topics. LDA is an example of a topic model.

In [23]:
# Building LDA model
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(count_matrix)

# Saving the model
# pickle.dump(lda, open('lda.pkl', 'wb'))

In [24]:
# Transforming the count matrix
count_matrix_lda = lda.transform(count_matrix)

# Saving the transformed matrix
#pickle.dump(count_matrix_lda, open('count_matrix_lda.pkl', 'wb'))

# Building a recommendation system using LDA
def get_recommendations_lda(PRODUCT_NAME, top_n=3):
    product_index = df[df['CATEGORY']==PRODUCT_NAME].index[0]
    similar_products = count_matrix_lda[product_index].argsort()[:-top_n:-1]
    return df.iloc[similar_products].PRODUCT_NAME.values

get_recommendations_lda('Food & Beverage')

array(['almond butter brown 250g', 'Almdudler Sugar Free Herb Lemonade'],
      dtype=object)

## Incorporating LLMs.
* For our case, we will use the Claude LLM API from Anthropic using API keys.

In [31]:
api_key = ''

import anthropic
client = anthropic.Client(api_key)

client

<anthropic.api.Client at 0x2bd559e9720>

In [42]:

response = client.completion(
    prompt=f"{anthropic.HUMAN_PROMPT}Get groceries recommendations{anthropic.AI_PROMPT}",
    model = 'claude-v1',
    max_tokens_to_sample=100,
    stop_sequences=[anthropic.HUMAN_PROMPT]
)
