Upload the data to MongoDB. Remove duplicate recipes and cocktails then save the recipes as a dataframe and pickle the file to df_01.pkl.

# Imports

In [2]:
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
%matplotlib inline

import json

from pymongo import MongoClient

# MongoDB

## Load JSON

In [None]:
with open('epicurious_recipes/full_format_recipes.json') as json_data:
    recipes_json = json.load(json_data)

## Connect to MongoDB Client

In [None]:
# ssh -NL 12345:localhost:27017 myaws

In [None]:
client = MongoClient(port=12345) # this is the port set by the SSH tunnel

In [None]:
address = client.address
db_names = client.database_names()
host = client.HOST
port = client.PORT

print('address: ', address, '\n',
      'databases: ', db_names, '\n',
      'host: ', host, '\n', 
      'port: ', port)

In [None]:
# assign Kojak database to db
db = client.kojak
db.collection_names()

In [None]:
# initialize new collection for recipe data within Kojak database
recipes = db.recipes

## Upload Recipes to kojak.recipe collection

In [None]:
# try inserting one document from the recipe json file to the recipe collection
recipes.insert_one(recipes_json[0])

In [None]:
# insert the rest of the documents from the recipe json file
for recipe in recipes_json[1:]:
    recipes.insert_one(recipe)

In [None]:
recipes.count(), df.shape

# Dataframe Creation

## Retrieve Data from kojak.recipe collection

In [None]:
# ssh -NL 12345:localhost:27017 myaws

In [3]:
client = MongoClient(port=12345) # this is the port set by the SSH tunnel

In [4]:
# assign Kojak database to db
db = client.kojak
db.collection_names()

['recipes']

In [5]:
recipe_db = db.recipes

In [6]:
df_mongo = pd.DataFrame(list(recipe_db.find()))
df_mongo.head()

Unnamed: 0,_id,calories,categories,date,desc,directions,fat,ingredients,protein,rating,sodium,title
0,5a0e0ce39c2cf2542809b8b8,426.0,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",2006-09-01T04:00:00.000Z,,"[1. Place the stock, lentils, celery, carrot, ...",7.0,"[4 cups low-sodium vegetable or chicken stock,...",30.0,2.5,559.0,"Lentil, Apple, and Turkey Wrap"
1,5a0e0d2e9c2cf2542809b8b9,403.0,"[Food Processor, Onion, Pork, Bake, Bastille D...",2004-08-20T04:00:00.000Z,This uses the same ingredients found in boudin...,[Combine first 9 ingredients in heavy medium s...,23.0,"[1 1/2 cups whipping cream, 2 medium onions, c...",18.0,4.375,1439.0,Boudin Blanc Terrine with Red Onion Confit
2,5a0e0d2e9c2cf2542809b8ba,165.0,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",2004-08-20T04:00:00.000Z,,[In a large heavy saucepan cook diced fennel a...,7.0,"[1 fennel bulb (sometimes called anise), stalk...",6.0,3.75,165.0,Potato and Fennel Soup Hodge
3,5a0e0d2e9c2cf2542809b8bb,,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",2009-03-27T04:00:00.000Z,The Sicilian-style tomato sauce has tons of Me...,[Heat oil in heavy large skillet over medium-h...,,"[2 tablespoons extra-virgin olive oil, 1 cup c...",,5.0,,Mahi-Mahi in Tomato Olive Sauce
4,5a0e0d2e9c2cf2542809b8bc,547.0,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",2004-08-20T04:00:00.000Z,,[Preheat oven to 350°F. Lightly grease 8x8x2-i...,32.0,"[1 12-ounce package frozen spinach soufflé, th...",20.0,3.125,452.0,Spinach Noodle Casserole


## DF

In [13]:
df = df_mongo[['title', 'ingredients', 'directions', 'categories', 'desc']]
df.index.name = 'recipeID'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20130 entries, 0 to 20129
Data columns (total 5 columns):
title          20111 non-null object
ingredients    20111 non-null object
directions     20111 non-null object
categories     20111 non-null object
desc           13495 non-null object
dtypes: object(5)
memory usage: 786.4+ KB


## Drop duplicates and drink recipes

In [19]:
df.drop_duplicates(subset=['title'],keep='first',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [34]:
df.dropna(axis=0, thresh=4,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [35]:
df.reset_index(inplace=True)

In [36]:
cocktail_list = []
for i in range(len(df)):
    if 'Cocktail' in df.iloc[i].categories:
        cocktail_list.append(i)

In [41]:
df.drop(df.index[cocktail_list],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [45]:
df.head()

Unnamed: 0,level_0,index,recipeID,title,ingredients,directions,categories,desc
0,0,0,0,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...","[1. Place the stock, lentils, celery, carrot, ...","[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",
1,1,1,1,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",[Combine first 9 ingredients in heavy medium s...,"[Food Processor, Onion, Pork, Bake, Bastille D...",This uses the same ingredients found in boudin...
2,2,2,2,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",[In a large heavy saucepan cook diced fennel a...,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",
3,3,3,3,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",[Heat oil in heavy large skillet over medium-h...,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",The Sicilian-style tomato sauce has tons of Me...
4,4,4,4,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",[Preheat oven to 350°F. Lightly grease 8x8x2-i...,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",


In [48]:
df.drop(['level_0','index','recipeID'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
df.reset_index(drop=True,inplace=True)

In [57]:
df.to_pickle('df_01.pkl')