In [1]:
import pandas as pd
import re

# Step 1: Read the CSV file
# Assuming the CSV file has a column 'id' for recipe IDs and 'tags' for the associated tags
df = pd.read_csv('data/RAW_recipes.csv')  # Replace with your actual file path

In [2]:
# Define a function to clean the tags
def clean_tag(tag):
    # Remove leading/trailing whitespace and unwanted characters
    tag = re.sub(r"[\[\]']+", "", tag.strip())
    # Replace all hyphens with spaces
    tag = tag.replace('-', ' ')
    return tag

# Apply the function to clean each tag
df['tags'] = df['tags'].apply(clean_tag)

# Remove duplicate tags (assuming each row is a separate record and you want to remove duplicates within a single record)
df['tags'] = df['tags'].drop_duplicates()

In [3]:
df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"60 minutes or less, time to make, course, main...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"30 minutes or less, time to make, course, main...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"time to make, course, preparation, main dish, ...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"60 minutes or less, time to make, course, main...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"weeknight, time to make, course, main ingredie...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [4]:
#Fix possible issue with the data format
df['tags'] = df['tags'].astype(str)

# Step 2: Extract tags and create a unique list of them
all_tags = set()
for tags_str in df['tags']:
    tags = tags_str.split(',')  # Split the tags string into a list
    all_tags.update(tags)
    
# Convert the set to a sorted list to have consistent ID assignment
unique_tags = sorted(list(all_tags))

# Create a mapping of tag names to tag IDs
tag_to_id = {tag: tag_id + 1 for tag_id, tag in enumerate(unique_tags)}  # Adding 1 to start IDs from 1

In [11]:
# Step 3: Create a CSV file with the unique tags
tags_df = pd.DataFrame(unique_tags, columns=['tag_name'])
tags_df.columns

Index(['tag_name'], dtype='object')

In [12]:
tags_df.to_csv('cleaned_data/tags.csv', index_label='tag_id')  # The index will act as the TagID

In [8]:
# Step 4: Create a CSV file with the recipe ID and its associated tags
recipe_tags = []
for index, row in df.iterrows():
    recipe_id = row['id']
    tags = row['tags'].split(',')
    for tag in tags:
        tag_id = tag_to_id[tag]
        recipe_tags.append({'recipe_id': recipe_id, 'tag_id': tag_id})

recipe_tags_df = pd.DataFrame(recipe_tags)
recipe_tags_df.to_csv('cleaned_data/recipe_tags_link.csv', index=False)

In [33]:
# Step 5: Generate SQL INSERT INTO statements
insert_statements = []
for recipe_tag in recipe_tags:
    statement = f"INSERT INTO Tags VALUES ( {recipe_tag['RecipeID']}, '{recipe_tag['TagID']}' );"
    insert_statements.append(statement)

# Optionally, you can write the insert statements to a file
with open('output/insert_statement.sql', 'w') as f:
    for statement in insert_statements:
        f.write(statement + '\n')

#Print out the first few insert statements as a sample
print('\n'.join(insert_statements[:5]))

INSERT INTO Tags VALUES ( 137739, '551' );
INSERT INTO Tags VALUES ( 137739, '513' );
INSERT INTO Tags VALUES ( 137739, '133' );
INSERT INTO Tags VALUES ( 137739, '295' );
INSERT INTO Tags VALUES ( 137739, '142' );
