In [11]:
import numpy as np
import pandas as pd

import ast

In [26]:
def extract_singular_values_from_json(df, col, key):
    """
    This function takes a DataFrame containing a colummn filled with JSON dictionaries 
    and extracts the value for the specified key.
    
    Rows that cause errors are left out.
    
    Input
    -----
    df: DataFrame containing JSON dictionaries
    col: column within DataFrame containing the JSON dictionaries
    key: JSON dictionary key for which to extact the value for
    
    Output
    ------
    temp_df: DataFrame containing extracted values from JSON dictionary
    error_list: list of errors during extraction if any.
    """
    
    # Initiate blank dictionary and lists 
    item_dict = {'row_index' : list(),
                 key : list()
                }
    error_list = list()
    error_count = 0

    # Looping through each row to gather the value for the specified key from JSON object
    for row_index, row in enumerate(df.loc[:,col]):

        try:
            # Convert string entry into list containing JSON
            # Then select first item in list to access JSON dictionary
            json_dict = ast.literal_eval(row)[0]

            item_dict['row_index'].append(row_index)
            item_dict[key].append(json_dict[key])

        except:
            error_count += 1
            print(f"Errors: {error_count}, last error at {row_index}.", end='\r')
            error_list.append(row_index)
            continue
    
    # Convert dictionary into a DataFrame, then examine DataFrame
    temp_df = pd.DataFrame(item_dict).set_index('row_index')
    # print(f"Errors: {error_count}, last error at {row_index}.")
    # print(f"Number of extracted values {len(temp_df)}")
    # display(temp_df.head())
    
    # Return DataFrame with extracted values
    return temp_df, error_list

In [2]:
raw_data_df = pd.read_csv("output.csv")

In [3]:
raw_data_df

Unnamed: 0,recipe_url,title,image,rating_average,rating_count,review_count,description,update_date,ingredient,direction,nutrition_summary,nutrition_detail,time,label,review_dict,description_additional
0,https://www.allrecipes.com/recipe/263037/insta...,Instant Pot Best Beef Stew,['https://www.allrecipes.com/thmb/d911MNd9YViv...,4.6,209,161 Reviews,"This Instant Pot stew recipe is the ultimate, ...","Updated on February 26, 2023","['1 tablespoon butter', '1 pound beef chuck, c...",['Gather all ingredients.\n\n\n\n\n\n\n\n\n\n\...,"{'Calories': '352', 'Fat': '16g ', 'Carbs': '3...","['% Daily Value *', 'Total Fat 16g', 'Saturate...","{'Prep Time:': '20 mins', 'Cook Time:': '55 mi...","['Recipes', 'Soups, Stews and Chili Recipes', ...","[{'@context': 'http://schema.org', '@type': ['...","['Gather all ingredients.', 'Turn on a multi-f..."


In [29]:
# Create a blank DataFrame with the recipe indices to store extracted values:
json_data_df = raw_data_df[['review_dict']].copy().drop(columns = "review_dict")

## Keys with single values
keys_with_single_values = ['name', 'datePublished', 'dateModified', 'description']

for key in keys_with_single_values:
    # Use predefined function to extract data from JSON dictionary
    temp_df, temp_errors = extract_singular_values_from_json(raw_data_df, 'review_dict', key)
    
    # Merge extracted data with DataFrame of extracted data using recipe indices
    json_data_df = pd.merge(
        json_data_df, 
        temp_df, 
        how = 'left', 
        left_index = True, 
        right_index = True,
        validate = '1:1' # validate was used to ensure that only 1 extracted value was matched with 1 recipe
    )

# Generate a new feature for the word count of recipe name and description
json_data_df['name_wc'] = [len(str(name).split(' ')) for name in json_data_df['name'].tolist()]
json_data_df['description_wc'] = [len(str(desc).split(' ')) for desc in json_data_df['description'].tolist()]

# Converting string column to datetime with timezone as UTC
json_data_df['datePublished'] = pd.to_datetime(json_data_df['datePublished'], utc=True)
json_data_df['dateModified'] = pd.to_datetime(json_data_df['dateModified'], utc=True)

## Reviews
temp_dict = {'review' : list(),
             'review_wc' : list(),
             'number_of_review' : list()}

recipe_dict = ast.literal_eval(raw_data_df["review_dict"])[0]
reviews = recipe_dict["review"]

# Initiate a blank string to store concatenated reviews
temp_review = str()

# Initiate zero for the word count of reviews
temp_review_wc = 0

# Iterate over each review within the recipe
for review in reviews:
    # Concatenate the review into a single string
    temp_review += review['reviewBody'] + ' '
    
    # Perform a word count for the review, then add to total word count
    temp_review_wc += len(str(review['reviewBody']).split(' '))

# Append extracted data back to blank dictionary
temp_dict['review'].append(temp_review)
temp_dict['number_of_review'].append(len(reviews))
temp_dict['review_wc'].append(temp_review_wc)

# Merge extracted data with DataFrame of extracted data
json_data_df = pd.merge(
    json_data_df, 
    temp_df, 
    how = 'left', 
    left_index = True, 
    right_index = True,
    validate = '1:1'
)

json_data_df

ValueError: malformed node or string: 0    [{'@context': 'http://schema.org', '@type': ['...
Name: review_dict, dtype: object

In [30]:
ast.literal_eval(raw_data_df["review_dict"])[0]

ValueError: malformed node or string: 0    [{'@context': 'http://schema.org', '@type': ['...
Name: review_dict, dtype: object