# Functions for allrecipes.com project

These helper functions are collected here so that the project is not all in one massive, hard-to-navigate file.<br>
Let's see.

## Import packages / setup

In [47]:
# import public things

# general / random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipynb
import re # for string parsing / editing
import string # for string parsing / editing
from datetime import datetime
import time
import random
from pathlib import Path
import os
import ast

# for html
import requests # for getting html off the web
from bs4 import BeautifulSoup # for parsing html
import json

# for ML
from wordcloud import WordCloud, STOPWORDS
import snowballstemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# html retrieval etc

In [48]:
def extract_all_recipe_links_from_html_file(page):
    """Collect the unique recipe links found on a parsed html page.

    input = html page (an object with a BeautifulSoup-style ``find_all``)
    output = a list of href strings that contain '/recipe/', without
             duplicates, in order of first appearance
    """
    links_list = []
    seen_links = set()  # constant-time duplicate check; the list keeps the order

    all_link_elements = page.find_all('a', attrs={'class':'card__titleLink manual-link-behavior'})

    for element in all_link_elements:
        actual_link = element.get('href')

        # an anchor without an href yields None, which cannot be searched
        if not actual_link or '/recipe/' not in actual_link:
            continue

        # don't add a link if it's already in the list
        if actual_link not in seen_links:
            seen_links.add(actual_link)
            links_list.append(actual_link)

    return links_list

In [49]:
def extract_recipe_ids_from_all_links_from_html_file(page):
    """Collect the unique recipe ids referenced by the links on a parsed html page.

    input = html page (an object with a BeautifulSoup-style ``find_all``)
    output = a list of recipe ids (strings), i.e. the path segment that follows
             '/recipe/' in each link, without duplicates, in order of first
             appearance
    """
    ids_list = []
    seen_ids = set()  # constant-time duplicate check; the list keeps the order

    all_link_elements = page.find_all('a', attrs={'class':'card__titleLink manual-link-behavior'})

    for element in all_link_elements:
        actual_link = element.get('href')

        # an anchor without an href yields None, which cannot be searched
        if not actual_link or '/recipe/' not in actual_link:
            continue

        # the id is whatever sits between '/recipe/' and the next slash
        recipe_id = actual_link.split('/recipe/')[1].split('/')[0]

        # don't add an id if it's already in the list
        if recipe_id not in seen_ids:
            seen_ids.add(recipe_id)
            ids_list.append(recipe_id)

    return ids_list

In [50]:
def retrieve_all_pages_from_a_list_of_links(links_list, timeout=30):
    """Download and parse every page in a list of links.

    input = a list of links; timeout = seconds to wait for each response
            (requests waits indefinitely when no timeout is given)
    output = a list of parsed html pages + print the length of the list
    """
    page_list = []

    for link in links_list: # can do the whole links_list if brave / have time
        retrieved = requests.get(link, timeout=timeout)
        page_list.append(BeautifulSoup(retrieved.content))

    print("Length of the recipe list is: ", len(page_list))

    return page_list

In [51]:
def load_html_pages_from_file(filename):
    """Split a file holding several concatenated html pages into parsed pages.

    input = filename / location
    output = (1) list of html pages, page_list, (2) print length of the page list

    Every page is expected to start with '<!DOCTYPE html>'. The content is
    split on that marker once, so identical or empty pages can no longer
    cause the pages after them to be dropped.
    """
    search_phrase = '<!DOCTYPE html>'

    with open(filename, 'r') as file:
        file_content = file.read()

    # element 0 is whatever precedes the first marker, which is not a page
    page_bodies = file_content.split(search_phrase)[1:]

    # put the marker back in front of each body before parsing
    page_list = [BeautifulSoup(search_phrase + body) for body in page_bodies]

    print("Length of the page_list is ", len(page_list))

    return page_list

In [52]:
def retrieve_collections_from_id_list(id_list, timeout=30):
    """Download recipe-collection pages by id and save each one to its own file.

    input = a list of numerical id's for pages on allrecipes.com;
            timeout = seconds to wait for each response
    output =
    (1) page_list = a list of retrieved html pages
    (2) each html page saved as a separate file named after its id
        (in the current working directory)
    """
    print('Start retrieving collections:\n')

    base_url = 'https://www.allrecipes.com/recipes/'
    page_list = []
    for one_id in id_list:
        one_id = str(one_id)

        print(f'Now retrieving collection id {one_id}')

        full_url = f"{base_url}{one_id}"
        retrieved = requests.get(full_url, timeout=timeout)
        page = BeautifulSoup(retrieved.content)
        page_list.append(page)

        # write only this collection's page; looping over page_list here would
        # copy every previously retrieved page into the file as well
        name_to_save = f'{one_id}'
        with open(name_to_save, 'w') as f:
            f.write("%s\n" % page)

    print('\nCollection retrieval now finished.\n')

    return page_list

In [53]:
def extract_a_collection_name_from_an_html_page(page):
    """Read the collection name out of the page's <title>.

    input = an html page for a recipe collection
    output = collection_name = the title text that precedes the word
             'Recipes', with surrounding whitespace removed
    """
    title_text = page.find('title').text

    # keep only what comes before the first 'Recipes'
    # (the whole title when the word is absent)
    name_part, _, _ = title_text.partition('Recipes')

    return name_part.strip()

In [54]:
def retrieve_recipes_from_id_list(id_list, timeout=30):
    """Download recipe pages by id and append each successful one to a file.

    input = a list of numerical id's for pages on allrecipes.com;
            timeout = seconds to wait for each response
    output =
    (1) pages_retrieved = the number of pages that came back with status 200
    (2) each such page appended to a file named after its id
        (in the current working directory)
    """
    print('Start retrieving recipes:\n')

    base_url = 'https://www.allrecipes.com/recipe/'
    pages_tried = 0
    pages_retrieved = 0
    for one_id in id_list:
        one_id = str(one_id)
        full_url = f"{base_url}{one_id}"
        retrieved = requests.get(full_url, timeout=timeout)
        pages_tried += 1
        print(f'\n{one_id} - attempting to retrieve. That makes it {pages_tried} pages tried so far.')

        # only successful responses are saved and counted
        if retrieved.status_code == 200:
            page = BeautifulSoup(retrieved.content)
            name_to_save = f'{one_id}'
            # append mode: an existing file with the same name is extended
            with open(name_to_save, 'a') as f:
                f.write("%s\n" % page)
            pages_retrieved += 1
            print(f'Success => {pages_retrieved} retrieved total')

        # delay so web-crawling doesn't get blocked by the website etc
        delay()

    print('\nRecipe retrieval now finished.\n')

    return pages_retrieved

# Extract useful information from recipe html's: no json

In [55]:
def does_recipe_have_video(page):
    """Report whether the recipe page has a video.

    input = html page
    output = 1 if a 'recipe-play-label' label element is found, 0 if not
    """
    play_label = page.find('label', attrs={'class':'recipe-play-label'})
    return 1 if play_label else 0

In [56]:
def extract_number_of_photos(page):
    """Extract the number of photos the recipe has.

    input = html page
    output = number of photos; 0 when the photo link is missing or its text
             contains no digits
    """
    photo_link = page.find('a', attrs={'class':'ugc-ratings-link ugc-photos-link'})
    if not photo_link:
        return 0

    # only keep the digits (drops words, whitespace and thousands separators)
    digits = re.sub("[^0-9]", "", photo_link.text)

    # int('') would raise, so a digit-free label counts as zero photos
    return int(digits) if digits else 0

# Extract useful information from recipe html's: use json

In [57]:
def cautiously_populate_df(input_df, column_name):
    """Return the first-row value of a column, or zero when it is unavailable.

    input = pandas df, name of the column of interest
    output =
    (either) if the column exists => value of the first row in that column
    (or) if the column doesn't exist, or the df has no rows => number zero
    """
    if column_name not in input_df.columns or len(input_df) == 0:
        return 0

    # positional access: the first row regardless of how the index is labelled
    return input_df[column_name].iloc[0]

In [58]:
def extract_info_from_json_on_page_to_df(page):
    """Flatten the recipe's ld+json block into a one-row pandas df.

    input = BeautifulSoup html page of a recipe
    output = pandas df built from the second element of the parsed
             'application/ld+json' script; an empty df when the page has no
             such script
    """
    script_tag = page.find('script', type='application/ld+json')

    # nothing to parse => hand back an empty frame
    if not script_tag:
        return pd.DataFrame()

    parsed_json = json.loads(script_tag.string)

    # element [1] of the parsed json holds the recipe parameters (eg cook time);
    # nested keys become dotted column names
    return pd.json_normalize(parsed_json[1], max_level=None)

In [59]:
def extract_key_info(recipe_info_df):
    """Pull the headline facts about a recipe into a one-row df.

    input = recipe_info_df with raw data from json
    output = key_info_df with columns recipe_id, title, date_published,
             description, avg_rating, ratings_no, recipe_cats
    """
    # recipe id = the path segment right after 'recipe/' in the page url
    page_url = recipe_info_df['mainEntityOfPage'][0]
    id_segment = page_url.split('recipe/')[1]
    recipe_id = int(id_segment.split('/')[0])

    title = recipe_info_df['name'][0]

    # keep only the date part of the timestamp (what precedes the 'T')
    date_published = recipe_info_df['datePublished'][0].split('T')[0]

    description = recipe_info_df['description'][0]

    # rating columns may be absent, in which case the helper returns zero
    avg_rating = cautiously_populate_df(recipe_info_df, 'aggregateRating.ratingValue')
    ratings_no = cautiously_populate_df(recipe_info_df, 'aggregateRating.ratingCount')

    # official "recipeCategories"
    recipe_cats = recipe_info_df['recipeCategory'][0]

    # gather everything into a single-row df
    column_names = ['recipe_id', 'title', 'date_published', 'description',
                    'avg_rating', 'ratings_no', 'recipe_cats']
    row_values = [recipe_id, title, date_published, description,
                  avg_rating, ratings_no, recipe_cats]
    key_info_df = pd.DataFrame(columns=column_names)
    key_info_df.loc[0] = row_values

    return key_info_df

In [60]:
def extract_times(recipe_info_df):
    """Convert the recipe's prep / cook / total durations into minutes.

    input = recipe_info_df with raw data from json
    output = times_df with one row and the columns prepTime, cookTime,
             totalTime, each in whole minutes

    Durations are parsed as ISO-8601 style strings such as 'P0DT1H30M',
    'PT45M' or 'PT2H'. Day, hour, minute and second parts are all optional;
    days and hours are converted to minutes and seconds are rounded down.
    A missing column, a non-string value or an unparsable string gives 0.
    """
    time_columns = ['prepTime', 'cookTime', 'totalTime']

    # P[nD][T[nH][nM][nS]] - every component is optional
    duration_pattern = re.compile(r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?')

    times_dict = {}
    for time_column in time_columns:
        # if this type of time doesn't exist or can't be read, it stays zero
        time_in_min = 0

        if time_column in recipe_info_df.columns:
            raw_duration = recipe_info_df[time_column].iloc[0]
            match = None
            if isinstance(raw_duration, str):
                match = duration_pattern.fullmatch(raw_duration.strip())
            if match:
                days, hours, minutes, seconds = (int(part) if part else 0
                                                 for part in match.groups())
                time_in_min = days * 24 * 60 + hours * 60 + minutes + seconds // 60

        times_dict[time_column] = time_in_min

    # create a df to store information
    times_df = pd.DataFrame(times_dict, index=[0])

    return times_df

In [61]:
def extract_ingredients(recipe_info_df):
    """Reduce the raw ingredient lines to bare ingredient names.

    input = recipe_info_df with raw data from json
    output = ingredients_df with one row: ingredients_no (number of
             ingredient lines) and ingredient_names (list of cleaned names)
    """
    raw_ingredients = recipe_info_df['recipeIngredient'][0]

    # non-informative fragments: a token is dropped when it CONTAINS any of these
    # (entries that include spaces can never match, because tokens are
    # whitespace-split before the check)
    # BTW word cloud is an easy way to spot common non-informative words that have been left in
    useless_words = ['spoon', 'cup', 'pinch', 'package', 'ounce', ' kg ', ' g ', ' l ', ' ml ', 'pound',
                     'and','dash', 'dice', 'cube', 'chop', 'ground', 'drain', 'beat', 'or more', ' or ', 'to taste', 'taste',
                    'cut', 'piece', 'slice', 'inch', 'into', 'grate', 'peel', 'large', 'medium', 'small']

    cleaned_names = []
    for raw_item in raw_ingredients:
        # keep letters, hyphens and spaces only (drops numbers and punctuation)
        letters_only = re.sub(r'[^-A-Za-z ]+', "", raw_item)

        kept_tokens = []
        for token in letters_only.split():
            lowered = token.lower()
            if any(fragment in lowered for fragment in useless_words):
                continue
            kept_tokens.append(token)

        name = ' '.join(kept_tokens)
        cleaned_names.append(name.strip(string.punctuation).strip())

    # one-row df: count of ingredient lines + list of cleaned names
    ingredients_df = pd.DataFrame(columns=['ingredients_no', 'ingredient_names'])
    ingredients_df.loc[0] = [len(raw_ingredients), cleaned_names]

    return ingredients_df

In [62]:
def extract_method_steps(recipe_info_df):
    """Summarise the recipe's method.

    input = recipe_info_df
    output = steps_df, a one-row df with:
    (1) steps_no = the number of steps in the method
    (2) steps_str = all the steps joined into one string (newlines removed)
    (3) steps_words_no = the number of whitespace-separated words in steps_str
    """
    # de-json the instructions and pull out the text of every step
    instructions = recipe_info_df['recipeInstructions'][0]
    step_texts = pd.json_normalize(instructions)['text'].tolist()

    # one string for the whole method, without newline characters
    joined_steps = ' '.join(step_texts).replace('\n','')

    return pd.DataFrame(
        {
            'steps_no': len(step_texts),
            'steps_str': joined_steps,
            'steps_words_no': len(joined_steps.split()),
        },
        index=[0],
    )

In [63]:
def extract_nutritional_info(recipe_info_df):
    """Select the nutrition columns from the raw recipe df.

    input = recipe_info_df with raw data from json
    output = nutrition_df holding only the twelve 'nutrition.*' columns
             listed below (a KeyError is raised if any of them is missing)
    """
    wanted_columns = [
        'nutrition.calories',
        'nutrition.carbohydrateContent',
        'nutrition.cholesterolContent',
        'nutrition.fatContent',
        'nutrition.fiberContent',
        'nutrition.proteinContent',
        'nutrition.saturatedFatContent',
        'nutrition.servingSize',
        'nutrition.sodiumContent',
        'nutrition.sugarContent',
        'nutrition.transFatContent',
        'nutrition.unsaturatedFatContent',
    ]

    return recipe_info_df[wanted_columns]

In [64]:
def extract_stars_and_review_info(page):
    """Extract the votes per star rating and the number of reviews.

    input = BeautifulSoup html page of the recipe
    output = stars_and_reviews_df, a one-row df with the columns
             '5 stars', '4 stars', '3 stars', '2 stars', '1 star', 'reviews_no'

    Votes are matched to their column by the star value read from the page,
    so rows that are missing or appear in a different order still land in the
    right column; a star level without a row gets 0 votes.
    """
    star_levels = [5, 4, 3, 2, 1]
    votes_by_star = dict.fromkeys(star_levels, 0)

    ratings = page.find_all('li', attrs={'class':'rating'})

    # only the first five rating rows are used, as before
    # (no ratings at all => everything stays at zero)
    for one_rating in ratings[:5]:
        # the star value is the first character of the label text
        stars_text = one_rating.find('span', attrs={'class':'rating-stars'}).text.strip()
        star = int(stars_text[0])

        # keep digits only so that separators or whitespace don't break int()
        votes_text = one_rating.find('span', attrs={'class':'rating-count'}).text
        votes_digits = re.sub("[^0-9]", "", votes_text)

        if star in votes_by_star:
            votes_by_star[star] = int(votes_digits) if votes_digits else 0

    # get the number of reviews (zero if the page shows no review count)
    reviews_no = 0
    review_count_element = page.find('span', attrs={'class':'review-headline-count'})
    if review_count_element:
        review_digits = re.sub("[^0-9]", "", review_count_element.text)
        reviews_no = int(review_digits) if review_digits else 0

    # generate an output df
    column_labels = {5: '5 stars', 4: '4 stars', 3: '3 stars', 2: '2 stars', 1: '1 star'}
    stars_and_reviews_df = pd.DataFrame(
        {column_labels[star]: votes_by_star[star] for star in star_levels},
        index=[0],
    )
    stars_and_reviews_df['reviews_no'] = [reviews_no]

    return stars_and_reviews_df

In [65]:
def extract_multimedia_info(page):
    """Gather the photo and video facts for a recipe page.

    input = BeautifulSoup html page for the recipe
    output = multimedia_df, a one-row df with video_present (1 / 0)
             and photo_count
    """
    media_facts = {
        'video_present': does_recipe_have_video(page),
        'photo_count': extract_number_of_photos(page),
    }

    return pd.DataFrame(media_facts, index=[0])

# Other functions (wordclouds, save df as Excel)

In [66]:
def delay(min_seconds: float = 1, max_seconds: float = 3) -> None:
    """Pause for a random time, for polite web crawling.

    The pause is drawn uniformly between min_seconds and max_seconds.
    The defaults give 1-3 seconds (allrecipes.com/robots.txt says crawl-delay: 1),
    so existing calls without arguments behave as before.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))

In [76]:
def make_wordclouds(column_name, recipes_df, random_color_func, input_timestamp):
    """Make a wordcloud for one text column, save it as a jpeg and display it.

    https://amueller.github.io/word_cloud/index.html

    input = the name of the column of interest, pandas df with recipe data,
            colouring function, timestamp naming the data sub-folder to save into
    output = the wordcloud is saved as '<column_name>.jpeg' and then displayed
    """
    print(f'\n*********\nNow working on {column_name}:')
    text = ' '.join(recipes_df[column_name])
    wordcloud = WordCloud(stopwords=STOPWORDS,
                          background_color='white',
                          width=1200,
                          height=1000,
                          color_func=random_color_func
                          ).generate(text)

    plt.imshow(wordcloud)
    plt.axis('off')

    # make sure the target folder exists before writing into it
    folder_location = f'/home/bkotryna/ML_practice/allrecipes_project/data/{input_timestamp}/wordclouds'
    Path(folder_location).mkdir(parents=True, exist_ok=True)
    name_to_save = f"{folder_location}/{column_name}.jpeg"

    # save BEFORE showing: once plt.show() has run, the current figure may
    # already be released, and savefig would then write an empty image
    plt.savefig(name_to_save)
    plt.show()

In [68]:
def random_color_func(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
    """Pick a random HSL colour for a word in a wordcloud.

    Only random_state is used; the other parameters are accepted and ignored.
    random_state must offer randint(a, b); when it is None a fresh
    random.Random() is used, so the function can also be called without it.

    output = a string such as 'hsl(120, 60%, 45%)'

    HSL colour system - can use Colour Picker to choose values:
    https://www.google.co.uk/search?q=colour+picker
    """
    if random_state is None:
        random_state = random.Random()

    hue = random_state.randint(50, 350) # 0 - 360
    saturation = 60 # 0 - 100
    lightness = random_state.randint(20, 80) # 0 - 100

    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)

# Functions for ML

In [69]:
# convert nutrition specs into float (strings with calories, g, mg => float)

def remove_units(df, column_name, unit):
    """Turn a text column such as '12 g' into floats and put the unit in its name.

    input = pandas df, name of a string column, the unit text to remove
    output = a new df in which the column holds floats and is renamed to
             '<column_name> in <unit>'; the df passed in is left untouched
    """
    print('Now removing units from column: ', column_name)

    # remove units and convert to float
    # regex=False: the unit is removed as literal text, whatever the pandas version
    converted = (df[column_name]
                 .str.replace(unit, '', regex=False)
                 .str.strip()
                 .astype(float))

    # rename the column to include units; rename returns a new df, so the
    # converted values go into that one and not into the caller's df
    column_name_new = f'{column_name} in {unit}'
    df = df.rename(columns={column_name: column_name_new})
    df[column_name_new] = converted

    return df

In [79]:
# do NMF
# express entries in a text column in terms of its top NMF components

def make_nmf_K(col, df, nmf_df, n=4):
    """Express the entries of a text column in terms of its top NMF components.

    input = column of interest, df, nmf_df, number of NMF components to keep
    output = a new df with n columns named '<col>_nmf_1' ... '<col>_nmf_<n>',
             one row per row of df, indexed by df's index (named 'recipe_id')

    nmf_df is not used by this function; it stays in the signature so that
    existing calls keep working.
    Note that the printed component ids start at 0 while the column names
    start at 1.
    """
    print(f"\n************\nNow working on column '{col}':")

    # obtain data
    # cell = a string
    data = df[col].to_numpy()

    # tokenise (make into a bag of words)
    # the counts stay sparse: .todense() would produce an np.matrix, which
    # recent scikit-learn versions refuse, and would cost far more memory
    vectorizer = CountVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(data)
    print(f"vocabulary size: {vectors.shape[1]}")
    print('Tokenising done.')

    # Print vocab items with their frequencies, sorted in descending order by frequency
    total_counts = np.asarray(vectors.sum(axis=0)).ravel()
    word_and_frequency_tuples = [(word, total_counts[index])
                                 for word, index in vectorizer.vocabulary_.items()]
    by_freq = sorted(word_and_frequency_tuples, key=lambda x: x[1], reverse=True)
    print(f'Most freqeunt words are:\n{by_freq[0:5]}')

    # reverse lookup: column index in the count matrix -> word
    indices_to_words = {index : word for word, index in vectorizer.vocabulary_.items()}

    # do NMF
    nmf = NMF(n_components=n)
    nmf_projections = nmf.fit_transform(vectors)
    print('NMF transforming done.')

    # Inspect individual components
    for component_id, component in enumerate(nmf.components_):
        print(f'\n***\nComponent {component_id} for {col}')

        ### Inspect what each component is about (i.e. what the topic is)
        print('\nMost important words are:')

        # five largest weights; stable sort so that ties keep vocabulary-index order
        top_word_indices = np.argsort(-component, kind='stable')[:5]
        for index in top_word_indices:
            word = indices_to_words[int(index)]
            importance = component[index]
            print('{} : {:.3f}'.format(word, importance))

        ### find the sentence that has the largest projection along that component
        strongest_row = int(np.argmax(nmf_projections[:, component_id]))
        print('\nThe entry that most strongly embodies this component is:')
        print(data[strongest_row])

    # generate column names for nmf df
    col_names_list = [f"{col}_nmf_{num}"
                      for num in range(1, nmf_projections.shape[1] + 1)]

    # generate nmf df
    our_col_nmf_df = pd.DataFrame(nmf_projections, columns=col_names_list)
    print('\n***\nnp.array made into pd.df')

    # set index to match the index of the input df
    our_col_nmf_df['recipe_id'] = df.index
    our_col_nmf_df.set_index('recipe_id', inplace=True)
    print("Index now reset back to recipe_id.")

    return our_col_nmf_df

In [82]:
# do stemming, then bag-of-words, then NMF

def stem_make_nmf(col, df, nmf_df, n=4):
    """Stem a text column, make a bag of words, then express it via NMF components.

    input = column of interest, df, nmf_df, number of NMF components to keep
    output = a new df with n columns named '<col>_nmf_0' ... '<col>_nmf_<n-1>',
             one row per row of df, indexed by df's index (named 'recipe_id')

    nmf_df is not used by this function; it stays in the signature so that
    existing calls keep working. The df passed in is not modified (no
    'stemmed' scratch column is added to it).
    """
    print(f"\n************\nNow working on column '{col}':")

    # let's stem
    stemmer = snowballstemmer.stemmer('english')

    # one item per recipe; each item is a string of stemmed words
    stemmed_entries = []
    for item in df[col]:
        # stem and generate a single string per recipe
        data_string = ' '.join(stemmer.stemWords(item.split()))

        # remove strange quotation marks
        # (Ig ideally would know how not to generate them in the first place)
        data_string = data_string.replace("'","")
        data_string = data_string.replace('"','')

        stemmed_entries.append(data_string)
    print('Stemming done')

    # tokenise (make into a bag of words)
    from sklearn.feature_extraction import text

    # add extra stop words
    # passed on as a list: recent scikit-learn versions accept only
    # 'english', a list or None for stop_words, not a frozenset
    extra_stop_words = ['i', 'ii', 'iii']
    stop_words = sorted(text.ENGLISH_STOP_WORDS.union(extra_stop_words))

    # the counts stay sparse: .todense() would produce an np.matrix, which
    # recent scikit-learn versions refuse, and would cost far more memory
    vectorizer = CountVectorizer(stop_words=stop_words)
    vectors = vectorizer.fit_transform(stemmed_entries)
    print(f"vocabulary size: {vectors.shape[1]}")
    print('Tokenising done.')

    # Print vocab items with their frequencies, sorted in descending order by frequency
    total_counts = np.asarray(vectors.sum(axis=0)).ravel()
    word_and_frequency_tuples = [(word, total_counts[index])
                                 for word, index in vectorizer.vocabulary_.items()]
    by_freq = sorted(word_and_frequency_tuples, key=lambda x: x[1], reverse=True)
    print(f'Most freqeunt words are:\n{by_freq[0:5]}')

    # reverse lookup: column index in the count matrix -> word
    indices_to_words = {index : word for word, index in vectorizer.vocabulary_.items()}

    # do NMF
    nmf = NMF(n_components=n)
    nmf_projections = nmf.fit_transform(vectors)
    print('NMF transforming done.')

    # the un-stemmed text, used to show a readable example per component
    original_col = df[col].to_numpy()

    # Inspect individual components
    for component_id, component in enumerate(nmf.components_):
        print(f'\n***\nComponent {component_id} for {col}')

        ### Inspect what each component is about (i.e. what the topic is)
        print('\nMost important words are:')

        # five largest weights; stable sort so that ties keep vocabulary-index order
        top_word_indices = np.argsort(-component, kind='stable')[:5]
        for index in top_word_indices:
            word = indices_to_words[int(index)]
            importance = component[index]
            print('{} : {:.3f}'.format(word, importance))

        ### find the sentence that has the largest projection along that component
        strongest_row = int(np.argmax(nmf_projections[:, component_id]))
        print('\nThe entry that most strongly embodies this component is:')
        print(original_col[strongest_row])

    # generate column names for nmf df
    col_names_list = [f"{col}_nmf_{num}"
                      for num in range(nmf_projections.shape[1])]

    # generate nmf df
    our_col_nmf_df = pd.DataFrame(nmf_projections, columns=col_names_list)
    print('\n***\nnp.array made into pd.df')

    # set index to match the index of the input df
    our_col_nmf_df['recipe_id'] = df.index
    our_col_nmf_df.set_index('recipe_id', inplace=True)
    print("Index now reset back to recipe_id.")

    return our_col_nmf_df

In [73]:
def train_and_evaluate_regressor(regressor, X_train, y_train, X_test, y_test):
    """Fit a regressor, then print its top features and its train / test errors.

    input = a regressor exposing fit, predict and feature_importances_,
            training features (with a .columns attribute) and targets,
            test features and targets
    output = nothing is returned; prints the ten most important features,
             then mean absolute error and mean squared error on train and test
    """
    regressor.fit(X_train, y_train)

    # pair every importance with the name of its feature, biggest first
    feature_names = X_train.columns
    ranked_features = sorted(
        [(feature_names[position], importance)
         for position, importance in enumerate(regressor.feature_importances_)],
        key=lambda pair: pair[1],
        reverse=True,
    )

    for name, importance in ranked_features[:10]:
        print('Feature {}: importance = {:.3f}'.format(name, importance))

    predictions = {
        'train': (y_train, regressor.predict(X_train)),
        'test': (y_test, regressor.predict(X_test)),
    }

    # absolute errors first, then squared errors; train before test each time
    abs_errors = {split: mean_absolute_error(actual, predicted)
                  for split, (actual, predicted) in predictions.items()}
    print('Mean abs error on train = {:.3f}'.format(abs_errors['train']))
    print('Mean abs error on test = {:.3f}'.format(abs_errors['test']))

    sq_errors = {split: mean_squared_error(actual, predicted)
                 for split, (actual, predicted) in predictions.items()}
    print('Mean sq error on train = {:.3f}'.format(sq_errors['train']))
    print('Mean sq error on test = {:.3f}'.format(sq_errors['test']))

# Not used currently, but seem useful

In [74]:
def save_retrieved_html_pages(page_list):
    """Save downloaded html pages into one timestamped file.

    input = a list of html pages
    output = a single file './data/<YYYY-MM-DD_HH-MM>_recipe_pages_html'
             containing every page, each followed by a newline
    """
    timestamp = datetime.today().strftime('%Y-%m-%d_%H-%M')
    target_path = f"./data/{timestamp}_recipe_pages_html"

    with open(target_path, 'w') as out_file:
        out_file.writelines("%s\n" % page for page in page_list)

# Old / backups

In [71]:
# express entries in a text column in terms of its top NMF components

def make_nmf_K_backup(col, df, nmf_df, n=4):
    """Older variant: add a text column's NMF components to a master nmf_df.

    input = column of interest, df, master nmf_df, number of NMF components to keep
    output = nmf_df with n extra columns '<col>_nmf_1' ... '<col>_nmf_<n>'
             concatenated onto it (aligned on the index)

    NOTE(review): df's index is reset to 0..len-1 below, so the 'recipe_id'
    index given to the new columns is positional, not df's original index -
    confirm that nmf_df is indexed the same way before relying on the concat.
    """
    print(f"\n\nNow working on column '{col}':")

    # obtain data
    # cell = a string
    # we reset index since we'll be arranging things by index or sth
    df = df.reset_index(drop=True)
    data = df[col].to_numpy()

    # tokenise (make into a bag of words)
    # the counts stay sparse: .todense() would produce an np.matrix, which
    # recent scikit-learn versions refuse, and would cost far more memory
    vectorizer = CountVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(data)
    print(f"vocabulary size: {vectors.shape[1]}")
    print('Tokenising done')

    # Print vocab items with their frequencies, sorted in descending order by frequency
    total_counts = np.asarray(vectors.sum(axis=0)).ravel()
    word_and_frequency_tuples = [(word, total_counts[index])
                                 for word, index in vectorizer.vocabulary_.items()]
    by_freq = sorted(word_and_frequency_tuples, key=lambda x: x[1], reverse=True)
    print(f'Most freqeunt words are\n{by_freq[0:5]}')

    vocab_size = len(word_and_frequency_tuples)
    print('Number of words in vocab = {}'.format(vocab_size))

    # reverse lookup: column index in the count matrix -> word
    indices_to_words = {index : word for word, index in vectorizer.vocabulary_.items()}

    # do NMF
    nmf = NMF(n_components=n)
    nmf_projections = nmf.fit_transform(vectors)
    print('NMF transforming done')

    # Inspect what each component is about (i.e. what the topic is)
    for component_id, component in enumerate(nmf.components_):
        print(f'\nComponent {component_id} for {col}')
        # five largest weights; stable sort so that ties keep vocabulary-index order
        top_word_indices = np.argsort(-component, kind='stable')[:5]
        for index in top_word_indices:
            word = indices_to_words[int(index)]
            importance = component[index]
            print('{} : {:.3f}'.format(word, importance))

    # For each component, find the sentence that has the largest projection along that component
    for component_id in range(len(nmf.components_)):
        strongest_row = int(np.argmax(nmf_projections[:, component_id]))
        print(f'\nComponent {component_id} for {col}')
        print('The entry that most strongly embodies this component is:')
        print(data[strongest_row])

    # generate column names for nmf df
    col_names_list = [f"{col}_nmf_{num}"
                      for num in range(1, nmf_projections.shape[1] + 1)]

    # generate nmf df
    our_col_nmf_df = pd.DataFrame(nmf_projections, columns=col_names_list)
    print('\nnp made into pd.df')

    # index the new columns by the (reset) index of df
    our_col_nmf_df['recipe_id'] = df.index
    our_col_nmf_df.set_index('recipe_id', inplace=True)
    print("\nSet_index done.")

    # add the nmf columns to the master nmf_df
    nmf_df = pd.concat([nmf_df, our_col_nmf_df], axis=1)
    print('\nMaster df updated.')

    return nmf_df

In [72]:
# express entries in a text column in terms of its top NMF components

def make_nmf_old(col, df, nmf_df, n=15):
    """Old variant: stem a text column, vectorise it, and add its NMF components to nmf_df.

    input = column of interest, df, master nmf_df, number of NMF components to keep
    output = nmf_df with n extra columns '<col>_nmf_1' ... '<col>_nmf_<n>'
             concatenated onto it (aligned on the index)
    """
    print(f"\n\nNow working on column '{col}':")

    # cell = a string
    text_entries = df[col]

    stemmer = snowballstemmer.stemmer('english')

    def stem_entry(entry):
        # stem every word and glue the stems back into one string per recipe
        stemmed = ' '.join(stemmer.stemWords(entry.split()))
        # remove strange quotation marks
        # (Ig ideally would know how not to generate them in the first place)
        return stemmed.replace("'","").replace('"','')

    # one item per recipe
    stemmed_entries = [stem_entry(entry) for entry in text_entries]
    print('Stemming done')

    # tokenise (make into a bag of words)
    count_vect = CountVectorizer(stop_words='english')
    word_counts = count_vect.fit_transform(stemmed_entries)
    print(f"vocabulary size: {word_counts.shape[1]}")
    print('Tokenising done')

    # tf-idf transformer with idf switched off (use_idf=False)
    term_weights = TfidfTransformer(use_idf=False).fit_transform(word_counts)
    print("TF-IDF done")

    # NMF: fit on the weighted counts, then express every recipe via the components
    nmf = NMF(n_components=n, random_state=0)
    nmf.fit(term_weights)
    projections = nmf.transform(term_weights)
    print('NMF transforming done')

    # column names for the nmf df, numbered from 1
    component_columns = [f"{col}_nmf_{num}"
                         for num in range(1, projections.shape[1] + 1)]

    component_df = pd.DataFrame(projections, columns=component_columns)
    print('np made into pd.df')

    # index the new columns by df's index
    component_df['recipe_id'] = df.index
    component_df.set_index('recipe_id', inplace=True)
    print("Set_index done")

    # add the nmf columns to the master nmf_df
    nmf_df = pd.concat([nmf_df, component_df], axis=1)
    print('Master df updated')

    return nmf_df