In [1]:
import json
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

#### Read recipe data (a list of recipe dictionaries)

In [2]:
# Load the training recipes. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open for the life of the kernel.
with open('./data/recipe_train.json') as recipe_file:
    recipes = json.load(recipe_file)

#### Convert recipes to one text document per cuisine

In [3]:
# Group recipes by cuisine: each cuisine maps to a list holding one lower-cased,
# space-joined ingredient string per recipe.
recipes_as_doc = {}

for recipe in recipes:
    cuisine = recipe['cuisine']
    # First time this cuisine is seen: start an empty list for it.
    if cuisine not in recipes_as_doc:
        recipes_as_doc[cuisine] = []
    recipes_as_doc[cuisine].append(' '.join(recipe['ingredients']).lower())

# Collapse each cuisine's recipes into one document; the order follows the
# dictionary's key order, so all_docs[i] belongs to the i-th cuisine key.
all_docs = [' '.join(cuisine_recipes) for cuisine_recipes in recipes_as_doc.values()]

#### TF-IDF vectorizer for text data

In [4]:
# Adapted from https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf

# norm=None turns off per-document normalisation of the tf-idf vectors;
# use_idf=True keeps the inverse-document-frequency weighting enabled.
tfidf_settings = {'norm': None, 'use_idf': True}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary from the per-cuisine documents and score them in one pass.
transformed_documents = vectorizer.fit_transform(all_docs)

#### Show results

In [5]:
# Densify the sparse tf-idf matrix so each row can be zipped with the vocabulary.
transformed_documents_as_array = transformed_documents.toarray()

# The vocabulary is the same for every document, so fetch it once outside the loop.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# all_docs was built by iterating recipes_as_doc, so row i of the matrix
# corresponds to the i-th cuisine key. Materialise the keys once.
cuisines = list(recipes_as_doc.keys())

# Walk the documents in order; counter tracks the position of the current one.
for counter, (cuisine, doc) in enumerate(zip(cuisines, transformed_documents_as_array)):
    # Pair every term with its score for this cuisine and rank by score, highest first.
    tf_idf_tuples = list(zip(feature_names, doc))
    one_doc_as_df = (
        pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score'])
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    print(cuisine)
    print(one_doc_as_df)
    # Optional: write each ranking to a CSV named after the enumerated position.
    # one_doc_as_df.to_csv("tf.idf" + str(counter) + ".csv")

greek
             term       score
0           fresh  852.000000
1          pepper  844.000000
2             oil  825.000000
3            salt  747.000000
4           olive  746.000000
5            feta  662.848740
6          ground  650.000000
7           lemon  644.000000
8          garlic  631.000000
9          cheese  581.000000
10        oregano  465.122724
11          juice  446.000000
12          dried  438.000000
13       tomatoes  424.000000
14          black  401.000000
15       crumbles  372.852416
16         cloves  308.000000
17            red  301.000000
18          onion  299.000000
19         yogurt  293.661246
20       cucumber  272.685443
21       kalamata  268.258351
22         onions  264.000000
23        chopped  260.000000
24        parsley  258.000000
25          greek  255.067300
26          extra  236.000000
27         virgin  229.000000
28        chicken  228.000000
29         olives  217.500665
...           ...         ...
2980      grilled    0.000000
2981