# Process retrieved recipe HTML files

Use helper functions imported from the separate `functions` notebook (loaded below via `ipynb.fs.full.functions`).

## Import packages / setup

In [1]:
# import public things

# general / random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipynb
import re # for string parsing / editing
import string # for string parsing / editing
from datetime import datetime
import time
import random
from pathlib import Path
import os
import ast

# for html
import requests # for getting html off the web
from bs4 import BeautifulSoup # for parsing html
import json

# for ML
from wordcloud import WordCloud, STOPWORDS
import snowballstemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF

# import functions from my functions file
import ipynb.fs.full.functions as funcs

# Reload the helper module so that edits made to it since it was first imported take effect.
# (A plain repeated `import` does not re-execute a module that is already loaded.)
# https://support.enthought.com/hc/en-us/articles/204469240-Jupyter-IPython-After-editing-a-module-changes-are-not-effective-without-kernel-restart
import importlib
importlib.reload(funcs)

# pandas display limits: up to 500 columns, but only 5 rows, when a df is shown
pd.options.display.max_columns = 500
pd.options.display.max_rows = 5

## The actual workflow

In [2]:
# Name of the data folder for the round of interest (the folder name contains a timestamp).
# It is substituted into the data paths built in the cells below.

input_timestamp = 'joint_2021-05-11_11-28'

In [3]:
# Loop through all the recipe html files in the directory of interest and extract data from them.
#
# Results of this cell:
#   recipe_dfs_dict        - key = recipe_id (the file name), value = list of sub-df's for that recipe
#   recipes_df             - empty master df, populated in a later cell
#   successful_recipes_ids - ids of recipes whose json could be extracted
#   failed_recipes_ids     - ids of recipes for which no json was found
#   successful_recipes_no / failed_recipes_no - the corresponding counts

# go to the folder of interest
# NOTE: os.chdir changes the working directory for the rest of the session
recipes_folder = f'/home/bkotryna/Allrecipes/data/{input_timestamp}/recipes/'
path_recipes_folder = Path(recipes_folder)
os.chdir(path_recipes_folder)
print(f'We\'ll read in recipes from the folder:\n{os.getcwd()}\nNumber of files in this folder is {len(os.listdir())}.\n')


# create a dictionary with key = recipe_id, value = a list of sub-df's with data from that recipe
recipe_dfs_dict = {}
# create a master df for all recipes
recipes_df = pd.DataFrame()

# keep track of which recipes have (not) been processed successfully
failed_recipes_ids = []
successful_recipes_ids = []

# read in one html file at a time and extract data from it
for filename in os.listdir(path_recipes_folder):
    # build the full path, so that opening the file does not depend on the current working directory
    file_path = path_recipes_folder / filename

    # a sub-directory can't be opened as a file and would abort the whole run, so skip it
    if not file_path.is_file():
        print(f'Skipping {filename}: not a regular file.')
        continue

    # the file name doubles as the recipe id
    recipe_id = filename
    print(recipe_id)

    # parse the file contents into BS; the file is only needed for parsing, so close it straight after
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is installed - consider pinning one
    with open(file_path, 'r', encoding='utf-8') as page_text:
        page = BeautifulSoup(page_text)

    # extract info from json
    recipe_info_df = funcs.extract_info_from_json_on_page_to_df(page)

    # if can't find json, record the failure and move on to the next page
    if recipe_info_df.empty:
        failed_recipes_ids.append(recipe_id)
        continue

    # create pandas df's for bits of useful information
    key_info_df = funcs.extract_key_info(recipe_info_df)
    times_df = funcs.extract_times(recipe_info_df)
    ingredients_df = funcs.extract_ingredients(recipe_info_df)
    steps_df = funcs.extract_method_steps(recipe_info_df)
    nutrition_df = funcs.extract_nutritional_info(recipe_info_df)

    # extract stuff directly from html
    stars_and_reviews_df = funcs.extract_stars_and_review_info(page)
    multimedia_df = funcs.extract_multimedia_info(page)

    # keep the mini sub-df's in a list (eventually we'll make them into one master df)
    # BTW it's way more efficient to not iteratively grow a df!
    # https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it
    recipe_dfs_dict[recipe_id] = [key_info_df, stars_and_reviews_df, multimedia_df, times_df,
                                  ingredients_df, steps_df, nutrition_df]

    successful_recipes_ids.append(recipe_id)

# the counts follow directly from the id lists
failed_recipes_no = len(failed_recipes_ids)
successful_recipes_no = len(successful_recipes_ids)

print(f'\nAll recipes have now been attempted.\nNumber of successful recipes is {successful_recipes_no}.\nNumber of failed recipes is {failed_recipes_no}.\n\nSuccessful recipe ids are {successful_recipes_ids}\nFailed recipe ids are {failed_recipes_ids}')

FileNotFoundError: [Errno 2] No such file or directory: '/home/bkotryna/ML_practice/allrecipes_project/data/joint_2021-05-11_11-28/recipes'

In [None]:
# use recipe_dfs_dict to populate recipes_df

# for each recipe, join its sub-df's side by side into a single df
one_recipe_dfs = [pd.concat(content, axis=1) for content in recipe_dfs_dict.values()]

# fail with a clear message if there is nothing to combine
if not one_recipe_dfs:
    raise ValueError('recipe_dfs_dict is empty: no recipes were processed successfully, so there is nothing to combine.')

# stack all the recipes in one go: growing the master df inside a loop copies it on every step,
# and building it from scratch here means re-running this cell can't append the same recipes twice
recipes_df = pd.concat(one_recipe_dfs, axis=0)

# reindex with recipe_id as the index
recipes_df = recipes_df.set_index('recipe_id')

# inspect
display(recipes_df)
display(recipes_df.describe())
# info() prints its summary itself and returns None, so it isn't wrapped in display()
recipes_df.info()

In [None]:
# save the master spreadsheet

# file name for the Excel export
name_to_save = 'processed_recipe_data.xlsx'

# switch to the folder for this round, so that the relative file name above lands there
round_folder = f'/home/bkotryna/Allrecipes/data/{input_timestamp}/'
path_round_folder = Path(round_folder)
os.chdir(path_round_folder)
print(f"We'll save recipes_df as an Excel spreadsheet in the folder for this round:\n{os.getcwd()}")

# write recipes_df out as an Excel file
recipes_df.to_excel(name_to_save)