# Process retrieved recipe HTML files

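This notebook reads the saved recipe HTML files for one scraping round, extracts the data of interest from each page using helper functions imported from the separate `functions` notebook, and saves the combined result as a spreadsheet.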

## Import packages / setup

In [33]:
# import public things

# general / random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipynb
import re # for string parsing / editing
import string # for string parsing / editing
from datetime import datetime
import time
import random
from pathlib import Path
import os
import ast

# for html
import requests # for getting html off the web
from bs4 import BeautifulSoup # for parsing html
import json

# for ML
from wordcloud import WordCloud, STOPWORDS
import snowballstemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF

# import functions from my functions file
import ipynb.fs.full.functions as funcs

# Re-load the helper module so that edits made to it since the first import take effect.
# (A plain `import` of an already-imported module is a no-op, so without the reload
# the kernel would keep using the old version until it is restarted.)
# https://support.enthought.com/hc/en-us/articles/204469240-Jupyter-IPython-After-editing-a-module-changes-are-not-effective-without-kernel-restart
import importlib
importlib.reload(funcs)

# pandas display settings: show up to 500 columns, but truncate long tables to 5 rows
# (this is why the dataframes displayed below are elided with "..." in the middle)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5)

## The actual workflow

In [34]:
# Name of the folder for the scraping round of interest (the name is a timestamp).
# It is interpolated into the data paths that the cells below read from and write to.

input_timestamp = 'joint_2021-05-11_11-28'

In [35]:
# Loop through all the recipe html files in the directory of interest and extract data from them.
#
# Results of this cell:
#   recipe_dfs_dict         - key = recipe_id (the file name), value = list of sub-df's for that recipe
#   successful_recipes_ids  - ids of the recipes whose data was extracted
#   failed_recipes_ids      - ids of the pages on which no recipe json was found
#   successful_recipes_no / failed_recipes_no - the corresponding counts

# go to the folder of interest
recipes_folder = f'/home/bkotryna/ML_practice/allrecipes_project/data/{input_timestamp}/recipes/'
path_recipes_folder = Path(recipes_folder)
os.chdir(path_recipes_folder)
print(f'We\'ll read in recipes from the folder:\n{os.getcwd()}\nNumber of files in this folder is {len(os.listdir())}.\n')


# create a dictionary with key = recipe_id, value = a list of sub-df's with data from that recipe
recipe_dfs_dict = {}
# create a master df for all recipes (it gets populated in a later cell)
recipes_df = pd.DataFrame()

# count recipes that have (not) been processed successfully
failed_recipes_no = 0
successful_recipes_no = 0

failed_recipes_ids = []
successful_recipes_ids = []

# read in one html file at a time and extract data to populate a spreadsheet
# (sorted, because os.listdir returns entries in arbitrary order and we want reproducible runs)
for filename in sorted(os.listdir(path_recipes_folder)):
    # build the full path so that reading doesn't depend on the current working directory
    recipe_path = path_recipes_folder / filename

    # skip anything that isn't a regular file (e.g. a .ipynb_checkpoints directory)
    if not recipe_path.is_file():
        continue

    # the recipe html files are named after their recipe id
    recipe_id = filename
    print(recipe_id)

    # open a file, read the contents and convert them into BS;
    # the whole file is parsed here, so it can be closed before the extraction starts
    # NOTE(review): no parser is named, so BeautifulSoup picks the best one that is installed,
    # which can differ between machines - pin one once it's confirmed which parser these results rely on
    with open(recipe_path, 'r', encoding='utf-8') as page_text:
        page = BeautifulSoup(page_text)

    # extract info from json
    recipe_info_df = funcs.extract_info_from_json_on_page_to_df(page)

    # if can't find json, ignore this page (but keep track of it)
    if recipe_info_df.empty:
        failed_recipes_no += 1
        failed_recipes_ids.append(recipe_id)
        continue

    # create pandas df's for bits of useful information taken from the json
    key_info_df = funcs.extract_key_info(recipe_info_df)
    times_df = funcs.extract_times(recipe_info_df)
    ingredients_df = funcs.extract_ingredients(recipe_info_df)
    steps_df = funcs.extract_method_steps(recipe_info_df)
    nutrition_df = funcs.extract_nutritional_info(recipe_info_df)

    # extract stuff directly from html
    stars_and_reviews_df = funcs.extract_stars_and_review_info(page)
    multimedia_df = funcs.extract_multimedia_info(page)

    # keep the mini sub-df's in a list (eventually we'll make them into one master df)
    # BTW it's way more efficient to not iteratively grow a df!
    # https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it
    recipe_dfs_dict[recipe_id] = [key_info_df, stars_and_reviews_df, multimedia_df, times_df,
                                  ingredients_df, steps_df, nutrition_df]

    # count successfully processed recipes
    successful_recipes_no += 1
    successful_recipes_ids.append(recipe_id)

print(f'\nAll recipes have now been attempted.\nNumber of successful recipes is {successful_recipes_no}.\nNumber of failed recipes is {failed_recipes_no}.\n\nSuccessful recipe ids are {successful_recipes_ids}\nFailed recipe ids are {failed_recipes_ids}')

We'll read in recipes from the folder:
/home/bkotryna/ML_practice/allrecipes_project/data/joint_2021-05-11_11-28/recipes
Number of files in this folder is 11753.

10004
10007
10014
10019
10022
10030
10080
10090
10095
10103
10145
10156
10166
10171
10195
10227
10253
10279
10292
10295
10299
10315
10316
10322
10355
10385
10409
10412
10416
10488
10543
10549
10583
10585
10593
10619
10627
10641
10693
10710
10722
10754
10783
10824
10839
10851
10854
10861
10888
10947
10963
11037
11052
11121
11130
11131
11146
11154
11162
11174
11179
11194
11210
11236
11267
11269
11287
11362
11366
11383
11393
11402
11405
11418
11431
11433
11438
11499
11532
11555
11560
11571
11589
11612
11613
11617
11619
11659
11675
11708
11760
11779
11783
11793
11828
11833
11847
11854
11867
11877
11923
11928
11971
11984
11993
12008
12013
12080
12101
12129
12157
12174
12185
12191
12202
12215
12254
12261
12262
12301
12305
12316
12325
12339
12366
12392
12395
12449
12480
12518
12542
12558
12559
12570
12572
12595
12605
12626
12636
126

25851
25852
25854
25855
25857
25858
25859
25861
25862
25863
25864
25865
25867
25870
25871
25872
25874
25875
25876
25877
25879
25882
25883
25885
25886
25892
25893
25894
25895
25896
25897
25898
25899
25900
25901
25903
25904
25905
25906
25907
25908
25909
25911
25912
25913
25914
25915
25916
25919
25921
25922
25923
25924
25925
25926
25927
25928
25931
25932
25933
25934
25935
25936
25937
25938
25939
25940
25941
25943
25944
25947
25948
25953
25955
25957
25958
25960
25963
25965
25966
25967
25969
25970
25971
25972
25973
25974
25976
25977
25979
25980
25982
25984
25985
25987
25988
25989
25990
25991
25992
25993
25995
25998
26000
26001
26003
26007
26009
26010
26011
26013
26014
26015
26016
26018
26019
26020
26023
26028
26030
26033
26036
26037
26038
26039
26040
26041
26042
26043
26044
26047
26048
26055
26058
26059
26061
26062
26067
260685
26072
26074
26075
26080
26082
26084
26085
26088
26089
26090
26091
26092
26094
26096
26099
26103
26104
26106
26107
26109
26111
26117
26120
26122
26123
26126
26129
261

30095
30097
30098
30099
30105
30108
30115
30118
30119
30122
30125
30126
30132
30135
30136
30138
30139
30144
30151
30152
30162
30163
30166
30169
30171
30175
30180
30188
30190
30195
30203
30208
30209
30218
30220
30228
30231
30258
30267
30271
30282
30292
30294
30299
30301
30310
30325
30327
30330
30331
30336
30342
30374
30380
30384
30390
30396
30408
30463
30466
30472
30473
30475
30479
30480
30483
30485
30486
30490
30491
30493
30494
30495
30497
30501
30503
30504
30505
30515
30516
30518
30519
30522
30523
30528
30539
30555
30575
30576
30577
30578
30580
30597
30601
30609
30616
30623
30626
30628
30629
30638
30644
30647
30656
30657
30658
30679
30690
30693
30702
30703
30737
30749
30773
30782
30794
30795
30815
30822
30826
30831
30834
30840
30862
30918
30923
30936
30938
30975
30983
30989
30992
31026
31028
31042
31044
31045
31059
31061
31064
31065
31066
31070
31072
31077
31078
31084
31087
31093
31094
31096
31099
31109
31144
31147
31160
31200
31230
31231
31239
31258
31259
31272
31276
31278
31312
3132

35553
35554
35559
35561
35563
35564
35570
35571
35572
35578
35580
35588
35589
35590
35592
35593
35598
35600
35604
35605
35606
35607
35611
35620
35622
35632
35633
35634
35636
35638
35639
35642
35651
35654
35656
35658
35689
35691
35698
35705
35706
35707
35709
35710
35717
35723
35724
35726
35729
35731
35735
35738
35739
35740
35743
35750
35753
35756
35762
35764
35767
35770
35773
35776
35777
35778
35781
35787
35789
35790
35791
35798
35802
35803
35805
35806
35810
35815
35817
35822
35825
35830
35831
35834
35837
35838
35842
35848
35851
35854
35855
35856
35857
35859
35862
35866
35869
35870
35871
35873
35874
35882
35887
35889
35890
35892
35894
35895
35896
35899
35908
35909
35910
35911
35914
35915
35920
35924
35937
35938
35940
35941
35945
35946
35949
35951
35958
35960
35961
35963
35966
35975
35976
35985
35986
35988
35990
35991
35993
35995
35999
36002
36005
36008
36018
36020
36021
36022
36029
36033
36036
36039
36044
36047
36053
36055
36057
36058
36062
36064
36065
36068
36069
36071
36072
36079
3608

39617
39618
39621
39624
39625
39632
39638
39641
39642
39643
39649
39650
39662
39666
39669
39670
39671
39676
39677
39681
39683
39695
39699
39701
39704
39712
39713
39715
39726
39734
39736
39739
39741
39743
39745
39746
39747
39748
39749
39762
39768
39772
39774
39775
39776
39778
39779
39780
39783
39784
39785
39788
39789
39790
39792
39795
39797
39798
39807
39811
39813
39815
39820
39823
39825
39826
39828
39830
39835
39839
39842
39844
39845
39846
39847
39852
39857
39864
39868
39869
39871
39872
39875
39876
39877
39878
39879
39883
39884
39885
39887
39888
39890
39891
39893
39894
39895
39897
39899
39905
39906
39909
39912
39918
39921
39933
39935
39939
39945
39948
39949
39951
39952
39954
39959
39961
39964
39965
39966
39971
39973
39975
39977
39978
39983
39985
39988
39990
39995
40001
40003
40014
40015
40017
40018
40019
40021
40023
40024
40025
40027
40029
40030
40031
40033
40042
40049
40052
40053
40057
40058
40062
40068
40069
40071
40075
40079
40080
40083
40084
40086
40087
40088
40091
40092
40095
4009

42617
42618
42619
42621
42623
42624
42625
42626
42629
42630
42631
42633
42634
42635
42636
42637
42638
42639
42642
42643
42644
42645
42646
42648
42650
42651
42652
42654
42657
42658
42659
42661
42663
42665
42666
42668
42671
42674
42680
42681
42685
42686
42687
42688
42689
42690
42691
42694
42696
42697
42698
42700
42702
42703
42704
42706
42707
42708
42709
42714
42715
42716
42717
42718
42720
42722
42723
42724
42725
42726
42728
42729
42731
42732
42733
42736
42739
42740
42741
42742
42743
42744
42745
42747
42749
42750
42751
42753
42754
42756
42758
42759
42760
42761
42763
42764
42765
42766
42768
42769
42771
42773
42774
42775
42776
42777
42779
42780
42781
42782
42783
42784
42785
42791
42792
42793
42794
42795
42796
42797
42801
42802
42803
42804
42805
42806
42807
42808
42809
42810
42811
42812
42813
42814
42815
42817
42818
42819
42820
42821
42822
42824
42825
42826
42827
42830
42831
42832
42833
42834
42835
42836
42837
42838
42840
42841
42842
42843
42844
42845
42847
42849
42853
42854
42855
42856
4285

44673
44674
44675
44676
44677
44680
44682
44683
44686
44687
44690
44694
44695
44696
44697
44698
44699
44700
44701
44704
44708
44709
44712
44713
44714
44716
44717
44718
44720
44721
44722
44724
44725
44726
44729
44730
44731
44733
44735
44738
44739
44741
44742
44744
44745
44747
44748
44749
44750
44751
44752
44753
44754
44755
44757
44759
44760
44761
44762
44763
44768
44769
44770
44772
44773
44774
44775
44776
44779
44783
44789
44790
44794
44796
44797
44798
44799
44802
44803
44805
44806
44807
44808
44810
44812
44813
44814
44816
44819
44820
44821
44822
44823
44824
44825
44826
44827
44829
44830
44831
44833
44834
44836
44838
44840
44841
44842
44843
44844
44845
44846
44847
44848
44849
44850
44852
44853
44856
44857
44858
44859
44860
44862
44864
44865
44867
44869
44870
44871
44873
44874
44876
44881
44882
44884
44886
44887
44888
44889
44890
44891
44894
44895
44896
44899
44900
44902
44903
44904
44905
44906
44907
44908
44909
44910
44911
44914
44915
44916
44917
44920
44921
44923
44925
44928
44929
4493

46693
46694
46696
46697
46699
46700
46701
46702
46703
46705
46706
46708
46710
46712
46713
46714
46715
46719
46720
46721
46722
46723
46725
46726
46729
46730
46731
46732
46734
46735
46736
46737
46738
46739
46740
46741
46742
46743
46745
46746
46747
46748
46749
46750
46751
46753
46754
46755
46756
46757
46758
46759
46762
46763
46764
46765
46766
46767
46771
46774
46776
46777
46778
46780
46781
46782
46785
46788
46790
46793
46794
46797
46799
46800
46801
46803
46804
46805
46806
46807
46809
46810
46811
46812
46814
46818
46819
46821
46823
46825
46826
46827
46828
46829
46830
46832
46835
46836
46837
46838
46839
46840
46843
46845
46848
46849
46851
46852
46853
46854
46859
46861
46862
46863
46865
46866
46868
46870
46871
46873
46876
46877
46878
46879
46880
46881
46884
46885
46888
46890
46891
46893
46894
46896
46898
46899
46900
46904
46907
46910
46913
46915
46916
46917
46918
46919
46922
46925
46926
46927
46929
46930
46931
46932
46933
46934
46936
46941
46945
46946
46948
46949
46950
46952
46953
46955
4695

48781
48783
48784
48787
48788
48789
48790
48791
48792
48793
48796
48798
48799
48800
48801
48803
48806
48810
48811
48812
48814
48815
48818
48819
48821
48822
48823
48824
48825
48827
48828
48829
48830
48831
48832
48833
48836
48838
48843
48844
48846
48847
48848
48850
48851
48853
48854
48855
48856
48857
48858
48859
48860
48861
48863
48866
48868
48870
48871
48872
48874
48875
48876
48877
48878
48879
48880
48881
48882
48884
48885
48886
48887
48888
48890
48891
48892
48893
48894
48896
48899
48900
48902
48903
48905
48908
48909
48910
48912
48913
48914
48915
48916
48917
48918
48922
48923
48926
48927
48931
48932
48933
48934
48936
48937
48939
48940
48941
48943
48944
48945
48946
48948
48952
48953
48954
48955
48956
48960
48961
48963
48966
48968
48971
48972
48974
48975
48976
48981
48982
48983
48985
48986
48989
48990
48991
48993
48994
48995
48996
48997
48998
48999
49000
49001
49003
49004
49005
49006
49008
49010
49013
49014
49019
49020
49021
49022
49024
49026
49027
49030
49031
49032
49035
49037
49039
4904

In [36]:
# Use recipe_dfs_dict to build recipes_df, the master df holding all recipes.
# The df is built from scratch on every run, so this cell can safely be re-run.

# for each recipe, join its mini sub-df's side by side into a single df
one_recipe_dfs = [pd.concat(content, axis=1) for content in recipe_dfs_dict.values()]

if not one_recipe_dfs:
    raise ValueError('recipe_dfs_dict is empty: no recipes were processed successfully, '
                     'so there is nothing to combine into recipes_df.')

# stack all the per-recipe df's in one go (growing a df inside a loop copies it on every iteration)
# and use recipe_id as the index
recipes_df = pd.concat(one_recipe_dfs, axis=0).set_index('recipe_id')

# inspect
display(recipes_df)
display(recipes_df.describe())
# .info() prints its summary itself and returns None, so it isn't wrapped in display()
recipes_df.info()

Unnamed: 0_level_0,title,date_published,description,avg_rating,ratings_no,recipe_cats,5 stars,4 stars,3 stars,2 stars,1 star,reviews_no,video_present,photo_count,prepTime,cookTime,totalTime,ingredients_no,ingredient_names,steps_no,steps_str,steps_words_no,nutrition.calories,nutrition.carbohydrateContent,nutrition.cholesterolContent,nutrition.fatContent,nutrition.fiberContent,nutrition.proteinContent,nutrition.saturatedFatContent,nutrition.servingSize,nutrition.sodiumContent,nutrition.sugarContent,nutrition.transFatContent,nutrition.unsaturatedFatContent
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
10004,Pavlova,2019-04-04,My grandmother's recipe from New Zealand for t...,4.52,25,"[Dessert Recipes, Specialty Dessert Recipes, P...",18,4,1,2,0,19,0,8,0,0,0,6,"[egg whites, distilled white vinegar, cold wat...",4,Preheat oven to 300 degrees F (150 degrees C)....,102,108.1 calories,26 g,,,,1.4 g,,,21.3 mg,25.2 g,,
10007,Evelyn's Cornflake Cookies,1997-09-26,I got this recipe from my ex mother-in-law ...,4,8,"[Dessert Recipes, Cookies]",4,1,2,1,0,8,0,2,0,0,0,9,"[cream cheese, butter, white sugar, all-purpos...",3,Cream together all ingredients except cornflak...,56,195.6 calories,24.6 g,28.1 mg,10.3 g,0.5 g,2 g,6.4 g,,198.6 mg,8.9 g,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9800,Dutch Cinnamon Biscuits,2008-06-30,Tender with a spicy flavor.,2.75,4,"[World Cuisine Recipes, European, Dutch]",0,1,2,0,1,4,0,1,0,0,0,7,"[butter, white sugar, all-purpose flour, cinna...",3,Preheat oven to 350 degrees F (175 degrees C)....,92,159.9 calories,17.5 g,35.8 mg,9 g,0.6 g,2.6 g,5.1 g,,61.1 mg,5.4 g,,
9954,French Butter Cakes (Madeleines),2008-06-30,Sponge cake cookie--in shell shaped molds.,4.52063,315,"[World Cuisine Recipes, European, French]",219,62,20,7,7,247,1,137,0,0,0,8,"[eggs, vanilla extract, salt, white sugar, all...",9,Preheat oven to 375 degrees F (190 degrees C)....,202,108.8 calories,15.3 g,41.2 mg,4.7 g,0.2 g,1.6 g,2.7 g,,63.3 mg,11.2 g,,


Unnamed: 0,5 stars,4 stars,3 stars,2 stars,1 star,reviews_no,video_present,photo_count,prepTime,cookTime,totalTime,steps_no,steps_words_no
count,3391.000000,3391.000000,3391.000000,3391.000000,3391.000000,3391.00000,3391.000000,3391.000000,3391.000000,3391.000000,3391.000000,3391.000000,3391.00000
mean,105.787968,35.176644,10.368033,3.991448,3.232085,119.62312,0.083161,15.745798,18.343851,36.931584,82.878502,3.000295,93.29077
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75%,69.000000,29.000000,9.000000,4.000000,3.000000,89.00000,0.000000,11.000000,20.000000,40.000000,80.000000,4.000000,116.00000
max,9796.000000,2073.000000,755.000000,373.000000,385.000000,10043.00000,1.000000,1433.000000,720.000000,1080.000000,1440.000000,13.000000,723.00000


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3391 entries, 10004 to 9954
Data columns (total 34 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   title                            3391 non-null   object
 1   date_published                   3391 non-null   object
 2   description                      3391 non-null   object
 3   avg_rating                       3391 non-null   object
 4   ratings_no                       3391 non-null   object
 5   recipe_cats                      3391 non-null   object
 6   5 stars                          3391 non-null   int64 
 7   4 stars                          3391 non-null   int64 
 8   3 stars                          3391 non-null   int64 
 9   2 stars                          3391 non-null   int64 
 10  1 star                           3391 non-null   int64 
 11  reviews_no                       3391 non-null   int64 
 12  video_present                 

None

In [37]:
# Write the master spreadsheet to disk.

# file name of the Excel output; it's relative, so it lands in the working directory set below
name_to_save = 'processed_recipe_data.xlsx'

# make the folder of this round the working directory
round_folder = f'/home/bkotryna/ML_practice/allrecipes_project/data/{input_timestamp}/'
path_round_folder = Path(round_folder)
os.chdir(path_round_folder)
print(f'We\'ll save recipes_df as an Excel spreadsheet in the folder for this round:\n{os.getcwd()}')

# export recipes_df as Excel
recipes_df.to_excel(excel_writer=name_to_save)

We'll save recipes_df as an Excel spreadsheet in the folder for this round:
/home/bkotryna/ML_practice/allrecipes_project/data/joint_2021-05-11_11-28
