In [3]:
import pandas as pd
import re

# Step 1: Read the CSV file
# Assuming the CSV file has a column 'id' for recipe IDs and 'tags' for the associated tags
df = pd.read_csv('cleaned_data/recipes.csv')  # Replace with your actual file path

In [4]:
# Define a function to clean the tags
def clean_tag(tag):
    # Remove leading/trailing whitespace and unwanted characters
    tag = re.sub(r"[\[\]']+", "", tag.strip())
    # Replace all hyphens with spaces
    tag = tag.replace('-', ' ')
    return tag

# Apply the function to clean each tag
df['tags'] = df['tags'].apply(clean_tag)

# Remove duplicate tags (assuming each row is a separate record and you want to remove duplicates within a single record)
df['tags'] = df['tags'].drop_duplicates()

In [5]:
df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,description,calories
0,better than sex strawberries,42198,1460,41531,2002-10-03,"weeknight, time to make, course, main ingredie...",simple but sexy. this was in my local newspape...,734.1
1,chinese chop suey,8559,70,4481,2001-01-27,"weeknight, time to make, course, main ingredie...",easy one-pot dinner.,395.4
2,fried potatoes,37073,40,1533,2002-08-13,"60 minutes or less, time to make, course, main...","my husband made these up last week, very tasty...",132.6
3,momma s special marinade,30131,10,41480,2002-06-03,"15 minutes or less, time to make, course, cuis...",we usually use with chicken. my neices ask for...,199.2
4,munch without guilt tomatoes,30300,10,6164,2002-06-04,"15 minutes or less, time to make, course, main...",anytime munchies...another sweat free preparat...,3.0


In [6]:
#Fix possible issue with the data format
df['tags'] = df['tags'].astype(str)

# Step 2: Extract tags and create a unique list of them
all_tags = set()
for tags_str in df['tags']:
    tags = tags_str.split(',')  # Split the tags string into a list
    all_tags.update(tags)
    
# Convert the set to a sorted list to have consistent ID assignment
unique_tags = sorted(list(all_tags))

# Create a mapping of tag names to tag IDs
tag_to_id = {tag: tag_id + 1 for tag_id, tag in enumerate(unique_tags)}  # Adding 1 to start IDs from 1

print(tag_to_id)

{'': 1, ' 1 day or more': 2, ' 15 minutes or less': 3, ' 3 steps or less': 4, ' 30 minutes or less': 5, ' 4 hours or less': 6, ' 5 ingredients or less': 7, ' 60 minutes or less': 8, ' african': 9, ' american': 10, ' amish mennonite': 11, ' appetizers': 12, ' apples': 13, ' argentine': 14, ' asian': 15, ' asparagus': 16, ' australian': 17, ' avocado': 18, ' bacon': 19, ' baja': 20, ' baking': 21, ' bananas': 22, ' bar cookies': 23, ' barbecue': 24, ' bass': 25, ' beans': 26, ' beef': 27, ' beef ribs': 28, ' beef sausage': 29, ' beginner cook': 30, ' berries': 31, ' beverages': 32, ' birthday': 33, ' biscotti': 34, ' bisques cream soups': 35, ' black beans': 36, ' blueberries': 37, ' bread machine': 38, ' breads': 39, ' breakfast': 40, ' broccoli': 41, ' broil': 42, ' brown bag': 43, ' brown rice': 44, ' brownies': 45, ' brunch': 46, ' burgers': 47, ' cajun': 48, ' cake fillings and frostings': 49, ' cakes': 50, ' californian': 51, ' camping': 52, ' canadian': 53, ' candy': 54, ' canning

In [7]:
# Step 3: Create a CSV file with the unique tags
tags_df = pd.DataFrame(tag_to_id.items(), columns=['tag_name', 'tag_id']).dropna()
tags_df

Unnamed: 0,tag_name,tag_id
0,,1
1,1 day or more,2
2,15 minutes or less,3
3,3 steps or less,4
4,30 minutes or less,5
...,...,...
356,danish,357
357,lactose,358
358,,359
359,time to make,360


In [8]:
tags_df.to_csv('cleaned_data/tags.csv', index=False)  # The index will act as the TagID

In [9]:
# Step 4: Create a CSV file with the recipe ID and its associated tags
recipe_tags = []
for index, row in df.iterrows():
    recipe_id = row['id']
    tags = row['tags'].split(',')
    for tag in tags:
        if tag:
            tag_id = tag_to_id[tag]
            recipe_tags.append({'recipe_id': recipe_id, 'tag_id': tag_id})

recipe_tags_df = pd.DataFrame(recipe_tags)
recipe_tags_df.to_csv('cleaned_data/recipe_tags_link.csv', index=False)

In [10]:
# Step 5: Generate SQL INSERT INTO statements
insert_statements = []
for recipe_tag in recipe_tags:
    statement = f"INSERT INTO TagLink VALUES ( {recipe_tag['recipe_id']}, '{recipe_tag['tag_id']}' );"
    insert_statements.append(statement)

# Optionally, you can write the insert statements to a file
with open('../insert_statements/tag_link_insert_statements.txt', 'w') as f:
    for statement in insert_statements:
        f.write(statement + '\n')

#Print out the first few insert statements as a sample
print('\n'.join(insert_statements[:5]))

INSERT INTO TagLink VALUES ( 42198, '361' );
INSERT INTO TagLink VALUES ( 42198, '326' );
INSERT INTO TagLink VALUES ( 42198, '89' );
INSERT INTO TagLink VALUES ( 42198, '183' );
INSERT INTO TagLink VALUES ( 42198, '246' );
