# Recommendation System

In [74]:
# import modules
import numpy as np 
import pandas as pd 
from collections import Counter 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import networkx as nx 
import plotly.offline as py
import plotly.io as pio

from plot import plot_network_graph

In [75]:
# Load the training recipes and keep only the first 1000 rows
# (presumably to keep the pairwise similarity matrix small -- confirm).
df = pd.read_json('../data/train.json').head(1000)

In [76]:
# Lift the column-width limit so the ingredient lists are shown in full.
pd.options.display.max_colwidth = None
df.head(5)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles]"
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, ground black pepper, thyme, eggs, green tomatoes, yellow corn meal, milk, vegetable oil]"
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, green chilies, grilled chicken breasts, garlic powder, yellow onion, soy sauce, butter, chicken livers]"
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pepper, onions, garlic paste, milk, butter, salt, lemon juice, water, chili powder, passata, oil, ground cumin, boneless chicken skinless thigh, garam masala, double cream, natural yogurt, bay leaf]"


## Preprocess the data and compute similarities between recipes

In [77]:
def compute_similarities(df, THRESHOLD):
    """Compute pairwise cosine similarity between recipes from their ingredients.

    Two token-count matrices are built from ``df['ingredients']`` and placed
    side by side:

    * one where the whitespace inside each ingredient is removed first, so a
      multi-word ingredient reaches the vectorizer as one whitespace-free string;
    * one where the ingredient strings are joined unchanged, so every word of
      an ingredient is counted on its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ingredients`` column holding, per row, an iterable of
        ingredient strings.
    THRESHOLD : float
        Similarity that a pair of recipes has to exceed to be reported.

    Returns
    -------
    similarity_csr : array
        Recipe-by-recipe cosine similarity. Despite the name, the input passed
        to ``cosine_similarity`` is dense; per the scikit-learn documentation
        ``dense_output`` only matters for sparse input, so this is expected to
        be a dense array.
    sim_recipes : numpy.ndarray
        Rows ``[i, j]`` with ``i != j`` whose similarity is greater than
        ``THRESHOLD``.
    """
    recipes = df['ingredients'].tolist()

    def _count_matrix(corpus):
        # A vectorizer is fitted from scratch for each corpus.
        counter = CountVectorizer(lowercase=True, min_df=1, analyzer='word', stop_words=None)
        return counter.fit_transform(corpus).toarray()

    # Corpus 1: each ingredient collapsed into a single whitespace-free string.
    fused_corpus = [
        " ".join("".join(ingredient.split()) for ingredient in recipe)
        for recipe in recipes
    ]
    # Corpus 2: ingredient strings joined as they are.
    word_corpus = [" ".join(recipe) for recipe in recipes]

    # Columns of both count matrices next to each other, one row per recipe.
    dtm = np.hstack([_count_matrix(fused_corpus), _count_matrix(word_corpus)])

    similarity_csr = cosine_similarity(dtm, dense_output=False)

    # Index pairs above the threshold, without each recipe's match with itself.
    above_threshold = np.argwhere(similarity_csr > THRESHOLD)
    is_other_recipe = above_threshold[:, 0] != above_threshold[:, 1]

    return similarity_csr, above_threshold[is_other_recipe]

## Build the network graph of similar recipes

In [78]:
def build_graph(sim_recipes, RECIPE_INDEX, similarity_csr, THRESHOLD):
    """Collect the recipes within three similarity hops of a recipe and graph them.

    Parameters
    ----------
    sim_recipes : iterable of (int, int)
        Pairs ``[i, j]`` of recipe indices considered similar (as returned by
        ``compute_similarities``).
    RECIPE_INDEX : int
        Index of the starting recipe in the similarity matrix.
    similarity_csr : array-like
        Square recipe-by-recipe similarity matrix. A selection that exposes
        ``toarray`` (e.g. a SciPy sparse matrix) is densified before use.
    THRESHOLD : float
        Two selected recipes are connected when their similarity is greater
        than this value.

    Returns
    -------
    all_recommendations : list of int
        Sorted original indices of the starting recipe and every recipe found.
    original_recipe_idx : int
        Position of the starting recipe inside ``all_recommendations``.
    first, second, third : list of int
        Positions (inside ``all_recommendations``) of the recipes reached after
        one, two and three hops, each in ascending order.
    G : networkx.Graph
        Graph whose node ``k`` is ``all_recommendations[k]``.
    """
    # Adjacency sets give constant-time lookups instead of rescanning
    # sim_recipes with list-membership tests for every hop.
    neighbours = {}
    for source, target in sim_recipes:
        neighbours.setdefault(int(source), set()).add(int(target))

    def _reachable_from(sources):
        """Union of the neighbours of every recipe in ``sources``."""
        found = set()
        for recipe in sources:
            found |= neighbours.get(recipe, set())
        return found

    root = int(RECIPE_INDEX)
    first_order = set(neighbours.get(root, ()))
    # Each further level drops the starting recipe and anything already
    # assigned to a closer level.
    second_order = _reachable_from(first_order) - first_order - {root}
    third_order = _reachable_from(second_order) - second_order - first_order - {root}

    # get list of all recommended recipes by index
    all_recommendations = sorted({root} | first_order | second_order | third_order)

    # keep only those recipes of interest
    # - note that the narrowed matrix renumbers the recipes 0..len-1
    selected = np.array(all_recommendations)
    recommendation_csr = similarity_csr[selected[:, None], selected]
    if hasattr(recommendation_csr, "toarray"):
        recommendation_csr = recommendation_csr.toarray()

    # for the connected nodes keep only those pairs that have a similarity > THRESHOLD
    # NOTE(review): the diagonal is kept, so a recipe whose self-similarity
    # exceeds THRESHOLD gets a self-loop in the graph -- confirm this is wanted.
    direct_recommendation_csr = np.asarray(recommendation_csr) > THRESHOLD

    # map original recipe index -> position in the narrowed matrix
    position = {recipe: pos for pos, recipe in enumerate(all_recommendations)}
    original_recipe_idx = position[root]

    # positions are monotonic in the (sorted) original indices, so sorting the
    # positions reproduces ascending order
    first = sorted(position[recipe] for recipe in first_order)
    second = sorted(position[recipe] for recipe in second_order)
    third = sorted(position[recipe] for recipe in third_order)

    # convert adjacency recommendation matrix to graph
    # (nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is
    # its replacement and is available since NetworkX 2.0)
    G = nx.from_numpy_array(direct_recommendation_csr)

    return all_recommendations, original_recipe_idx, first, second, third, G

## Create the visualization

In [79]:
# map a color to the recommendation level
def create_visualization(original_recipe_idx, first, second, third, df, all_recommendations, G, THRESHOLD,
                         output_path='../figures/jupyter_notebook_graph_output.html'):
    """Plot the recommendation graph, save it as HTML and show it inline.

    Parameters
    ----------
    original_recipe_idx : int
        Node number of the starting recipe (level 0).
    first, second, third : iterable of int
        Node numbers of the recipes at recommendation level 1, 2 and 3.
    df : pandas.DataFrame
        Recipe table; its ``id`` column supplies the node labels.
    all_recommendations : list
        Index labels of ``df`` for the graph nodes, in node order.
    G : graph object
        Passed unchanged to ``plot_network_graph``.
    THRESHOLD : float
        Only used in the figure title.
    output_path : str, optional
        Where the HTML copy of the figure is written. Defaults to the path
        this notebook has always used.

    Returns
    -------
    Whatever ``plotly.offline.iplot`` returns for the figure.
    """
    # Later levels overwrite earlier ones if a node number appears twice.
    level_by_node = {original_recipe_idx: 0}
    level_by_node.update({node: 1 for node in first})
    level_by_node.update({node: 2 for node in second})
    level_by_node.update({node: 3 for node in third})

    # One level per node, ordered by node number.
    node_colors_by_position = [level_by_node[node] for node in sorted(level_by_node)]
    # Recipe IDs as node labels, in the same order as all_recommendations.
    node_text_by_position = list(df.loc[all_recommendations, 'id'].values)

    fig = plot_network_graph(
        G,
        TITLE="Recommended recipes by distance with threshold of {}".format(THRESHOLD),
        list_of_colors_by_order_of_nodes=node_colors_by_position,
        list_of_text_by_order_of_nodes=node_text_by_position,
    )

    pio.write_html(fig, output_path)

    return py.iplot(fig)

## main

In [80]:
# Choose a recipe by recipe ID
RECIPE = 41995
# get index by recipe ID
# Only the loaded rows of df are searched, so an ID outside of them has no
# match; fail with a clear message instead of a bare IndexError.
_matching_rows = df[df['id'] == RECIPE].index.values
if len(_matching_rows) == 0:
    raise ValueError("Recipe ID {} was not found in the loaded data".format(RECIPE))
# NOTE(review): this is an index *label*; build_graph uses it as a row position
# in the similarity matrix, which matches only for a default 0..n-1 index -- confirm.
RECIPE_INDEX = _matching_rows[0]
# Two recipes count as similar when their cosine similarity exceeds this value
THRESHOLD = 0.5

def recommend_recipes():
    """Run the whole pipeline for the module-level ``df``, ``RECIPE_INDEX`` and ``THRESHOLD``.

    Computes the recipe similarities, builds the graph around the chosen
    recipe and hands it to ``create_visualization``.

    Returns
    -------
    The value returned by ``create_visualization``.
    """
    similarities, similar_pairs = compute_similarities(df, THRESHOLD)

    graph_parts = build_graph(similar_pairs, RECIPE_INDEX, similarities, THRESHOLD)
    recommended, root_position, level_one, level_two, level_three, graph = graph_parts

    return create_visualization(
        root_position, level_one, level_two, level_three,
        df, recommended, graph, THRESHOLD,
    )

In [81]:
# Run the pipeline for the configured RECIPE / THRESHOLD: computes similarities,
# builds the graph, writes the HTML figure and displays it via plotly's iplot.
recommend_recipes()