In [7]:
import csv
import json
import os
import shlex
import time

import pandas as pd
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer

#### Load Recipes Data

In [8]:
# Parse the training recipes. Later cells read the 'cuisine' and 'ingredients'
# fields of every entry. The context manager closes the file handle as soon as
# parsing is done instead of leaving it open until garbage collection.
with open('./data/recipe_train.json') as recipes_file:
    recipes = json.load(recipes_file)

#### Convert recipes to text documents

In [9]:
# Map each cuisine to the list of its recipes, where every recipe is turned
# into one lower-cased string made of its ingredients separated by spaces.
recipes_as_doc = {}
for recipe in recipes:
    cuisine_recipes = recipes_as_doc.setdefault(recipe['cuisine'], [])
    cuisine_recipes.append(' '.join(recipe['ingredients']).lower())

# One document per cuisine (all of its recipes joined together), listed in the
# same order as the keys of recipes_as_doc.
all_docs = [' '.join(cuisine_recipes) for cuisine_recipes in recipes_as_doc.values()]

#### Tf-idf vectorizer for text data

In [10]:
# Approach adapted from the Programming Historian lesson:
# https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf
#
# Learn the vocabulary and idf weights from the per-cuisine documents, then
# turn those same documents into their tf-idf representation.
vectorizer = TfidfVectorizer(use_idf=True)
vectorizer.fit(all_docs)
transformed_documents = vectorizer.transform(all_docs)

#### Analyze results

In [11]:
# Dense copy of the tf-idf matrix: one row per document in all_docs.
transformed_documents_as_array = transformed_documents.toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# Looked up once here instead of on every loop iteration.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# all_docs was built by iterating recipes_as_doc, so row i of the matrix
# belongs to the i-th cuisine key.
cuisine_names = list(recipes_as_doc.keys())

# cuisine -> DataFrame with columns ['term', 'score'], highest score first.
docs_scores_dfs = {}
for cuisine_name, doc in zip(cuisine_names, transformed_documents_as_array):
    docs_scores_dfs[cuisine_name] = (
        pd.DataFrame({'term': feature_names, 'score': doc})
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # progress indicator: print each cuisine as soon as it is processed
    print(cuisine_name, end=' ')

greek southern_us filipino indian jamaican spanish italian mexican chinese british thai vietnamese cajun_creole brazilian french japanese irish korean moroccan russian 

#### Statistics of Scores

In [12]:
# Per-cuisine summary: [mean, max, min] of the strictly positive tf-idf scores,
# followed by the number of non-zero scores.
statistics = {}
for cuisine_name, cuisine_df in docs_scores_dfs.items():
    all_scores = cuisine_df['score']
    positive_scores = all_scores[all_scores > 0.0]
    statistics[cuisine_name] = [
        positive_scores.mean(),
        positive_scores.max(),
        positive_scores.min(),
        all_scores.astype(bool).sum(axis=0),
    ]


#### Load Baskets Market Data

In [13]:
# ! this parameter will be also used for outputting basketScore and clustering
basketsFilename = "synthetic20000.csv"

# The csv module expects files opened with newline='' so that it can handle
# line endings (including newlines embedded in quoted fields) itself.
with open('./data/' + basketsFilename, newline='') as f:
    # One list per CSV row; spaces inside every item name are replaced by hyphens.
    baskets = [[elem.replace(' ', '-') for elem in row] for row in csv.reader(f)]

#### Optimize computation
To speed up the scoring step, we precompute the character trigrams of the top-scoring tf-idf terms of each cuisine.


In [14]:
# Only terms whose tf-idf score exceeds this threshold are kept for a cuisine.
CUISINE_TOP_SCORE_THRESHOLD = 0.1

# cuisine -> list of [character trigrams of the term, tf-idf score, term],
# ordered by decreasing score.
top_scores_ngram = {}
for cuisine, cuisine_scores in docs_scores_dfs.items():
    above_threshold = cuisine_scores['score'] > CUISINE_TOP_SCORE_THRESHOLD
    top_terms = cuisine_scores[above_threshold].sort_values(by=['score'], ascending=False)

    top_scores_ngram[cuisine] = [
        [list(ngrams(top_term, 3)), top_score, top_term]
        for top_term, top_score in zip(top_terms['term'], top_terms['score'])
    ]

#### Define the similarity metric (Jaccard)

In [15]:
def jaccard_distance(a, b):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Despite the historical name, the value is a similarity: 1.0 means the two
    sets are identical and 0.0 means they share no element.

    Parameters
    ----------
    a, b : iterable of hashable
        Collections to compare; duplicates are ignored.

    Returns
    -------
    float
        Similarity in [0.0, 1.0]. When both collections are empty the union is
        empty too, so 0.0 is returned instead of dividing by zero (this happens
        for strings shorter than three characters, which have no trigrams).
    """
    a = set(a)
    b = set(b)
    union = a | b
    if not union:
        return 0.0
    return 1.0 * len(a & b) / len(union)

#### Optimize computation
In order to optimize computation we calculate the ngrams of unique items in the baskets.

In [16]:
# item -> list of its character trigrams, computed once per distinct item.
# dict.fromkeys drops repeated items while keeping first-seen order.
items_ngrams = {
    unique_item: list(ngrams(unique_item, 3))
    for unique_item in dict.fromkeys(
        basket_item for one_basket in baskets for basket_item in one_basket
    )
}

#### Compute Basket Scores
For each basket we iterate over all items and compute their similarities with the top 
terms in the cuisines docs.

In [17]:
start_time = time.time()
baskets_scores = []

# Cache of trigram similarities, keyed by the (item, term) pair. A tuple key is
# used rather than the concatenated string item+term, because two different
# pairs can concatenate to the same string and would then share a cache entry.
all_similarities = {}

for basket in baskets:
    # cuisine -> score of this basket for that cuisine
    similarities = {}
    for cuisine, cuisine_terms in top_scores_ngram.items():
        value = 0  # sum of similarity * tf-idf score over the retained pairs
        sims = 0   # sum of similarities over the retained pairs
        for item in basket:
            item_ngram = items_ngrams[item]
            for ngram_term, score, term in cuisine_terms:
                pair = (item, term)
                sim = all_similarities.get(pair)
                if sim is None:
                    sim = jaccard_distance(item_ngram, ngram_term)
                    all_similarities[pair] = sim

                # only sufficiently similar (item, term) pairs contribute
                if sim > 0.2:
                    value += sim * score
                    sims += sim

        # similarity-weighted average of the tf-idf scores; 0 when nothing matched
        similarities[cuisine] = value / sims if sims != 0 else 0
    baskets_scores.append(similarities)

# One row per basket, one column per cuisine; each column is then standardized
# by subtracting its mean and dividing by its standard deviation.
baskets_scores_df = pd.DataFrame(baskets_scores)
baskets_scores_df = (baskets_scores_df - baskets_scores_df.mean()) / baskets_scores_df.std()
print("--- %s seconds ---" % (time.time() - start_time))

--- 20.766939640045166 seconds ---


In [18]:
# Write the basket scores to ./data (no header row, no index column); this file
# is the input needed to run the ELKI clustering from the dbscanELKI.sh script.
baskets_scores_df.to_csv(
    f'./data/scores_{basketsFilename}',
    index=False,
    header=False,
)

#### Clustering baskets using DBSCAN in ELKI Data Mining Framework

This is done by calling the `dbscanELKI.sh` script located in the project folder. When the script
completes successfully, the next cell outputs 0.

In [19]:
# if necessary change parameters and paths
#
# The script receives absolute paths. They are derived from the ./data folder
# used by the other cells (the notebook is assumed to run from the project
# root) instead of being hard-coded to one user's home directory.
data_dir = os.path.abspath('./data')

# Every argument goes through shlex.quote so that spaces or other shell
# metacharacters in the directory or file name reach the script as a single
# argument. This replaces the hand-written "\ " escapes, which are invalid
# escape sequences in a regular Python string literal.
elki_args = [
    'eps=0.01',
    'minPoints=1000',
    'data=' + os.path.join(data_dir, 'scores_' + basketsFilename),
    'log=' + os.path.join(data_dir, 'log_' + basketsFilename),
    'output=' + os.path.join(data_dir, 'clusters__' + basketsFilename),
]
elki_command = ' '.join(['./dbscanELKI.sh'] + [shlex.quote(arg) for arg in elki_args])

# The cell displays the exit status reported by os.system (0 on success).
os.system(elki_command)

0