# Similarity Algorithms and Matrix
- Comparing and exploring different algorithms to measure the similarity between two recipeIDs

### Part 1)
- Create a concatonated text field that contains the important features that describe each recipeID

### Part 2)
- Exploring different algorithms

### Part 3) 
- Create a table with recipe pairs

### Part 4) 
- Create a similarity score for each pair using the algorithms created in Part 2 and functions in appendix

### Part 5) 
- Create similarity matrix

### Part 6)
- Join with similarity score dataset

### Appendix
- Each algorithm created in part 2) as a function 

In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import en_core_web_sm
%matplotlib inline
pd.options.display.max_colwidth=500
import re
import networkx as nx
import sys
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
import nltk, string

nltk.download('punkt') # if necessary...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\robert.lowe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

_____
### Part 1)
- Create a concatonated text field that contains the important features that describe each recipeID
____

In [2]:
df = pd.read_csv('recipes_info.csv')

In [3]:
df = df.replace(np.nan, '')

In [4]:


df['All'] = df['country'].astype(str)+' '+df['dish_category']+' '+df['dish_type']+' '+df['carbohydrate_category']+' '+df['carbohydrate_base'] + ' '+df['protein_cut'] +' '+df['protein'] +' '+df['spice_level'] +' '+df['protein_type']
        
        

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 15 columns):
recipe_id                260 non-null int64
country                  260 non-null object
country_secondary        260 non-null object
dish_category            260 non-null object
dish_type                260 non-null object
diet_type                260 non-null object
carbohydrate_base        260 non-null object
carbohydrate_category    260 non-null object
protein                  260 non-null object
protein_cut              260 non-null object
protein_type             260 non-null object
family_friendly          260 non-null object
spice_level              260 non-null object
prep_time                260 non-null object
All                      260 non-null object
dtypes: int64(1), object(14)
memory usage: 30.5+ KB


In [6]:
df.head(2).to_clipboard()

In [7]:
# Create a list of combined information (index for all is -1)
recipes = df.iloc[:,-1].tolist()
recipes_ids = df.iloc[:,0].tolist()

_____
### Part 2)
- Exploring different algorithms:
- 2a) Jaccard Similarity
- 2b) Cosine Similarity (using count vectorizer and tf-idf, also using stemming approach)
- 2c) SpaCy Similarity
____

In [8]:
print(recipes[11])
recipes[12]

Italy Stove top / bowl food Pasta Pasta White pasta Steak Pork Pork No Spice Poultry & Meat


'Thailand Stove top / bowl food Stir Fry Noodles Thai rice noodles Breast Chicken Chicken Mild Poultry & Meat'

In [9]:


def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))



In [10]:
get_jaccard_sim(recipes[11], recipes[12])

0.3076923076923077

In [11]:

# List of content
corpus = [recipes[11], recipes[12]]
vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
print(cosine_sim[0][1])

0.1559289254870836


In [12]:
# List of content
corpus = [recipes[11], recipes[12]]

# Initialise countvectorizer
Cvectorizer = CountVectorizer()

# Generate the vectors for the corpus
tfidf_matrix = Cvectorizer.fit_transform(corpus)

# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

print(cosine_sim[0][1])

0.26726124191242445


In [13]:
# remove punctuation, lowercase, stem
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

# Create tokenizer
def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

# User above as an argument for Tf-idf vectorizer
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

# Compute cosine similarity
def cosine_sim(text1, text2):
    corpus = [text1,text2]
    tfidf_matrix = vectorizer.fit_transform(corpus)
    cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
    return cosine_sim[0,1]

print(cosine_sim(recipes[11], recipes[12]))

0.13679150108596264


In [14]:
print(recipes[11])
recipes[12]

Italy Stove top / bowl food Pasta Pasta White pasta Steak Pork Pork No Spice Poultry & Meat


'Thailand Stove top / bowl food Stir Fry Noodles Thai rice noodles Breast Chicken Chicken Mild Poultry & Meat'

#### 2c) SpaCy similarity
- En_core_web are available pretrained statistical models for English
- Similarity is determined by comparing word vectors or “word embeddings”, multi-dimensional meaning representations of a word.

In [62]:
# Use spaCy to compare recipes
nlp = en_core_web_sm.load()
nlp_recipe1 = nlp(recipes[11])
nlp_recipe2 = nlp(recipes[12])
spacy_similarity = nlp_recipe2.similarity(nlp_recipe1)
spacy_similarity

0.9471484539013452

___
### Part 3) 
- Create a table with recipe pairs
____

In [16]:
# Number of recipes
num_recipes = len(recipes)
print("A total of " + str(num_recipes) + " recipes")

A total of 260 recipes


In [17]:
# label recipes as recipeid1_1, recipeid_2 .. recipeid_n
recipeids = ["RecipeID" + str(i) for i in recipes_ids]
index_ids = ["_" + str(i) for i in range(num_recipes)]
d = pd.DataFrame({'recipeids': recipeids, 'index_ids': index_ids})

#concat two columns
d['All']=d['recipeids'].astype(str)+' '+d['index_ids']

d.head(1)

Unnamed: 0,recipeids,index_ids,All
0,RecipeID2,_0,RecipeID2 _0


In [18]:
#recipeid list
recipeids = d['All'].tolist()

# create a dictionary
recipe_dict = dict(zip(recipeids, recipes))

# get all the book ids in a list
ids = list(recipe_dict.keys())

# create all possible pairs
pairs = []
# create a list of tuples
for i, v in enumerate(ids):
    for j in ids[i+1:]:
        pairs.append((ids[i], j))
        
print("There are a total of " + str(len(pairs)) + " pairs")
display(pairs[:5])

There are a total of 33670 pairs


[('RecipeID2 _0', 'RecipeID5 _1'),
 ('RecipeID2 _0', 'RecipeID9 _2'),
 ('RecipeID2 _0', 'RecipeID16 _3'),
 ('RecipeID2 _0', 'RecipeID19 _4'),
 ('RecipeID2 _0', 'RecipeID20 _5')]

____
### Part 4) 
- Create a similarity score for each pair using the algorithms created in Part 2 and functions in appendix
____

In [29]:

#pairwise_spacy_similarity = [compute_spacy_similarity(pair) for pair in pairs]
compute_jaccard_similarity = [compute_jaccard_similarity(pair) for pair in pairs]
compute_countv_cos_similarity = [compute_countv_cos_similarity(pair) for pair in pairs]
compute_cosine_similarity_stem = [compute_cosine_similarity_stem(pair) for pair in pairs]
compute_cosine_similarity = [compute_cosine_similarity(pair) for pair in pairs]

# create a dataframe
data = pd.DataFrame({ 
    
                    'Recipe_IDs': pairs, 
                    #'spacy_similarity': pairwise_spacy_similarity,
                    'jaccard_similarity': compute_jaccard_similarity,
                    'countv_cos_similarity': compute_countv_cos_similarity,
                    'cosine_similarity_stem': compute_cosine_similarity_stem,
                    'cosine_similarity': compute_cosine_similarity
    
                    })



In [30]:
data = data.sort_values(by = 'cosine_similarity',  ascending=False)
data.head(40).to_clipboard()

___

### Part 5) 
- Create Similarity Matrix
___

In [31]:
# split recipe ides into two columns
data['Recipe_IDs'] = data['Recipe_IDs'].astype(np.str)
data[['Recipe_a','Recipe_b']] = data.Recipe_IDs.str.split(",",expand=True,)

# clean values
data['Recipe_a'] = data['Recipe_a'].str.replace("'", "")
data['Recipe_a'] = data['Recipe_a'].str.replace("(", "")
data['Recipe_b'] = data['Recipe_b'].str.replace("'", "")
data['Recipe_b'] = data['Recipe_b'].str.replace(")", "")

# extract recipe ids
data['b'] = data['Recipe_b'].apply(lambda x : re.findall(r'\d+', x)[0])
data['a'] = data['Recipe_a'].apply(lambda x : re.findall(r'\d+', x)[0])


In [32]:
# Duplicate tables and apend to create a data set for matrix
data2 = data[['a', 'b', 'cosine_similarity']]
data1 = data[['a', 'b', 'cosine_similarity']]
data1.rename(columns={'a': 'bb'}, inplace=True)
data1.rename(columns={'b': 'a'}, inplace=True)
data1.rename(columns={'bb': 'b'}, inplace=True)
data3 = data1.append(data2)

In [33]:

table1 = pd.pivot_table(data3, index = ['a'], columns = ['b'],
                       values = ['cosine_similarity'], 
                        aggfunc ={'cosine_similarity':np.mean}).rename(columns ={'cosine_similarity':'recipe_id'} )
table1



Unnamed: 0_level_0,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id,recipe_id
b,101,1086,1090,1091,1094,1096,1097,1098,1099,1100,...,944,945,947,949,954,967,969,971,975,980
a,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
101,,0.000000,0.095302,0.036306,0.092321,0.000000,0.000000,0.000000,0.088703,0.037328,...,0.000000,0.000000,0.000000,0.259484,0.000000,0.048185,0.000000,0.000000,0.041007,0.000000
1086,0.000000,,0.049083,0.000000,0.230096,0.442981,0.044080,0.442981,0.047231,0.318784,...,0.136276,0.542158,0.234796,0.156541,0.132330,0.053551,0.000000,0.000000,0.095554,0.442981
1090,0.095302,0.049083,,0.000000,0.000000,0.037585,0.165242,0.000000,0.000000,0.116640,...,0.000000,0.037585,0.000000,0.000000,0.033276,0.044164,0.040479,0.114951,0.123785,0.037585
1091,0.036306,0.000000,0.000000,,0.000000,0.099786,0.210062,0.317868,0.390168,0.028126,...,0.283193,0.317868,0.085666,0.032021,0.027355,0.000000,0.191840,0.244237,0.030897,0.317868
1094,0.092321,0.230096,0.000000,0.000000,,0.220340,0.000000,0.220340,0.080983,0.197919,...,0.152346,0.168983,0.143646,0.229696,0.147805,0.203006,0.040479,0.035235,0.000000,0.220340
1096,0.000000,0.442981,0.037585,0.099786,0.220340,,0.033754,0.366513,0.116744,0.273494,...,0.181439,0.431613,0.258964,0.116744,0.098687,0.041007,0.000000,0.000000,0.072256,0.503103
1097,0.000000,0.044080,0.165242,0.210062,0.000000,0.033754,,0.000000,0.000000,0.000000,...,0.174606,0.033754,0.000000,0.000000,0.000000,0.000000,0.162847,0.575108,0.242464,0.033754
1098,0.000000,0.442981,0.000000,0.317868,0.220340,0.366513,0.000000,,0.384315,0.225765,...,0.225765,0.876351,0.307135,0.116744,0.135984,0.085479,0.037585,0.032716,0.072256,0.766932
1099,0.088703,0.047231,0.000000,0.390168,0.080983,0.116744,0.000000,0.384315,,0.068050,...,0.105612,0.384315,0.099746,0.219510,0.032021,0.139127,0.000000,0.000000,0.074981,0.384315
1100,0.037328,0.318784,0.116640,0.028126,0.197919,0.273494,0.000000,0.225765,0.068050,,...,0.091940,0.225765,0.155459,0.145874,0.256112,0.231417,0.161656,0.000000,0.101631,0.273494


___
### Part 6)

- Joining with dataset1
___

In [34]:
dataset1 = pd.read_csv('pairwaise_dataset1.csv')

In [35]:
dataset1 = dataset1[['recipe_a','recipe_b','score']]

In [45]:
table2 = pd.pivot_table(dataset1, index = ['recipe_a', 'recipe_b'],
                       values = ['score'], aggfunc ={'score':np.mean}).reset_index()

table2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482 entries, 0 to 1481
Data columns (total 3 columns):
recipe_a    1482 non-null object
recipe_b    1482 non-null object
score       1482 non-null float64
dtypes: float64(1), object(2)
memory usage: 34.8+ KB


In [36]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67340 entries, 1723 to 0
Data columns (total 3 columns):
a                    67340 non-null object
b                    67340 non-null object
cosine_similarity    67340 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.1+ MB


In [47]:
table2.head()

Unnamed: 0,recipe_a,recipe_b,score
0,115,2,1.0
1,117,19,1.0
2,16,322,1.0
3,16,386,1.0
4,16,392,1.0


In [39]:
data3.rename(columns={'b': 'recipe_b'}, inplace=True)
data3.rename(columns={'a': 'recipe_a'}, inplace=True)

In [48]:
table2['recipe_a'] = table2['recipe_a'].astype(np.str)
table2['recipe_b'] = table2['recipe_b'].astype(np.str)

In [49]:
data3['recipe_a'] = data3['recipe_a'].astype(np.str)
data3['recipe_b'] = data3['recipe_b'].astype(np.str)

In [63]:
joined = pd.merge(data3, table2, on=['recipe_a', 'recipe_b'], how='outer')
joined

Unnamed: 0,recipe_a,recipe_b,cosine_similarity,score
0,954,30,1.000000,
1,954,257,1.000000,
2,348,348,1.000000,
3,348,348,1.000000,
4,257,30,1.000000,
5,1244,886,1.000000,
6,197,197,1.000000,
7,197,197,1.000000,
8,1222,1184,1.000000,
9,1283,577,1.000000,


___
# Appendix
___

In [57]:

def compute_spacy_similarity(pair):

    # extract the indexes from the pair
    recipe1, recipe2 = pair

    # split on _ and get index
    recipe1_index = int(recipe1.split("_")[1])
    recipe2_index = int(recipe2.split("_")[1])

    # get the detail of the document
    recipe1_detail = recipes[recipe1_index]
    recipe2_detail = recipes[recipe2_index]

    nlp_recipe1 = nlp(recipe1_detail)
    nlp_recipe2 = nlp(recipe2_detail)
    spacy_similarity = nlp_recipe2.similarity(nlp_recipe1)

    return spacy_similarity

In [58]:
pair = ('RecipeID77 _11', 'RecipeID89 _12')
compute_spacy_similarity(pair)

0.9471484539013452

In [21]:
def compute_jaccard_similarity(pair):

    # extract the indexes from the pair
    recipe1, recipe2 = pair

    # split on _ and get index
    recipe1_index = int(recipe1.split("_")[1])
    recipe2_index = int(recipe2.split("_")[1])

    # get the detail of the document
    recipe1_detail = recipes[recipe1_index]
    recipe2_detail = recipes[recipe2_index]

    jacarrd_similarity = get_jaccard_sim(recipe1_detail,recipe2_detail)

    return jacarrd_similarity

In [22]:
pair = ('RecipeID77 _11', 'RecipeID89 _12')
compute_jaccard_similarity(pair)

0.3076923076923077

In [23]:
def compute_countv_cos_similarity(pair):

    # extract the indexes from the pair
    recipe1, recipe2 = pair

    # split on _ and get index
    recipe1_index = int(recipe1.split("_")[1])
    recipe2_index = int(recipe2.split("_")[1])

    # get the detail of the document
    recipe1_detail = recipes[recipe1_index]
    recipe2_detail = recipes[recipe2_index]

    corpus = [recipe1_detail,recipe2_detail]
    
    # Initialise countvectorizer
    Cvectorizer = CountVectorizer()
    
    # Generate the vectors for the corpus
    tfidf_matrix = Cvectorizer.fit_transform(corpus)
    cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
    
    return cosine_sim[0][1]

In [24]:
pair = ('RecipeID77 _11', 'RecipeID89 _12')
compute_countv_cos_similarity(pair)

0.26726124191242445

In [25]:
def compute_cosine_similarity_stem(pair):

    # extract the indexes from the pair
    recipe1, recipe2 = pair
   
    # split on _ and get index
    recipe1_index = int(recipe1.split("_")[1])
    recipe2_index = int(recipe2.split("_")[1])
    
    # get the detail of the document
    recipe1_detail = recipes[recipe1_index]
    recipe2_detail = recipes[recipe2_index]
    
    corpus = [recipe1_detail,recipe2_detail]

    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def stem_tokens(tokens):
        return [stemmer.stem(item) for item in tokens]

    '''remove punctuation, lowercase, stem'''
    def normalize(text):
        return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')
    
    
    tfidf_matrix = vectorizer.fit_transform(corpus)
    cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

    return cosine_sim[0][1]

In [26]:
pair = ('RecipeID77 _11', 'RecipeID89 _12')
compute_cosine_similarity_stem(pair)

0.13679150108596264

In [27]:
def compute_cosine_similarity(pair):

    # extract the indexes from the pair
    recipe1, recipe2 = pair
   
    # split on _ and get index
    recipe1_index = int(recipe1.split("_")[1])
    recipe2_index = int(recipe2.split("_")[1])
    
    # get the detail of the document
    recipe1_detail = recipes[recipe1_index]
    recipe2_detail = recipes[recipe2_index]
    
    corpus = [recipe1_detail,recipe2_detail]

    tfidf_matrix = vectorizer.fit_transform(corpus)
    cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

    return cosine_sim[0][1]

In [28]:
pair = ('RecipeID77 _11', 'RecipeID89 _12')
compute_cosine_similarity(pair)

0.13679150108596264