In [1]:
!pip install --upgrade pandas pyarrow
!pip install --upgrade matplotlib seaborn missingno numexpr bottleneck



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import re
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns 
import missingno as msno 
import json
import os 
from IPython.display import display, HTML

# Seed NumPy's legacy global RNG so any later np.random-based step is repeatable.
np.random.seed(42) 


In [2]:
# Load one CSV per campus; paths are relative to the notebook's working directory.
UTSG_DATA = pd.read_csv('DATA/UTSG_ML_data_food.csv')
UTM_DATA = pd.read_csv('DATA/UTM_ML_data_food.csv')
# Tag every row with its campus as the first column (DataFrame.insert mutates in place).
UTSG_DATA.insert(0, "campus", 'UTSG')
UTM_DATA.insert(0, "campus", 'UTM')

# NOTE: the two frames are concatenated further down, after clean_cols() has been applied to each.
# TODO: what is Y?

In [None]:

def standardize_response_col_names(dataset):
    """Prefix each question column with the food it belongs to.

    Columns are scanned left to right. A column whose label contains the
    section marker text is treated as a section header: the food name is
    taken from the text after its second ':' and the header column itself is
    dropped. Every following column whose label contains ':' is renamed to
    "<food> <text after the first ':'>". All other columns keep their label.

    Returns a new DataFrame; the input frame is not modified.
    """
    section_marker = 'Answer the following questions about the food item:'
    renamed = []
    section_headers = []
    food = None

    for column in dataset.columns:
        label = str(column)
        if section_marker in label:
            # Header column: remember the food and schedule the column for removal.
            food = column.split(':')[2].strip()
            section_headers.append(column)
            continue
        if food and ':' in label:
            _, _, question = column.partition(':')
            renamed.append(f"{food} {question.strip()}")
            continue
        renamed.append(column)

    trimmed = dataset.drop(columns=section_headers)
    trimmed.columns = renamed
    return trimmed

def clean_cols(dataset):
    """Drop unwanted columns, then standardise the question column names.

    Removes every column whose label contains '0.' as well as the labels
    '1.0' and '0' (labels that are absent are ignored), and passes the result
    through standardize_response_col_names.
    """
    unwanted = [name for name in dataset.columns if '0.' in str(name)]
    unwanted.extend(['1.0', '0'])
    return standardize_response_col_names(
        dataset.drop(columns=unwanted, errors='ignore')
    )

def reshape_food_data(data_set):
    """
    Reshape dataframe from wide to long format.
    Each row is split into 3 rows (one for Pizza, Shawarma, and Sushi).

    For every input row and every food, the output row holds the row's 'ID',
    the food name in 'Food', the value of each '<Food> <question>' column
    under the bare question label, and the row-level 'n correct',
    'n incorrect' and 'score' values repeated unchanged.

    The returned frame always has the same columns in the same order, even
    when ``data_set`` has no rows.
    """
    foods = ('Pizza', 'Shawarma', 'Sushi')
    base_cols = [
        'From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
        'How many ingredients would you expect this food item to contain?',
        'In what setting would you expect this food to be served? Please check all that apply',
        'How much would you expect to pay for one serving of this food item?',
        'What movie do you think of when thinking of this food item?',
        'What drink would you pair with this food item?',
        'When you think about this food item, who does it remind you of?',
        'How much hot sauce would you add to this food item?'
    ]
    # Columns copied as-is onto each of the three per-food rows.
    shared_cols = ['n correct', 'n incorrect', 'score']

    new_rows = []
    for _, row in data_set.iterrows():
        for food in foods:
            record = {'ID': row['ID'], 'Food': food}
            for col in base_cols:
                record[col] = row[f'{food} {col}']
            for col in shared_cols:
                record[col] = row[col]
            new_rows.append(record)

    # Passing the column list explicitly keeps the schema stable for empty input;
    # for non-empty input it matches the dict insertion order used above.
    return pd.DataFrame(new_rows, columns=['ID', 'Food', *base_cols, *shared_cols])


def parse_ingredient_count(value):
    """Parse a free-text "how many ingredients" answer into ``(min, max, estimate)``.

    The rules below are tried in order and the first one that applies wins:

    1. the whole answer is a number ("5")
    2. a range, "3 to 7" or "6-8"  -> (low, high, midpoint)
    3. "at least X", "about/around/approximately X", or a leading "X:"
    4. a number word from zero to twenty, matched as a whole word
    5. a bullet list (*, - or U+2022)  -> number of bullets
    6. a multi-line answer             -> number of non-intro lines
    7. a comma-separated list          -> number of items
    8. a short phrase without digits   -> 1
    9. the first run of digits anywhere in the text

    For every rule except ranges the three returned values are identical.
    Returns ``(None, None, None)`` for missing or unparseable input.
    """

    def exact(number):
        number = float(number)
        return number, number, number

    def span(first, second):
        low, high = sorted((float(first), float(second)))
        return low, high, (low + high) / 2

    # Deal with missing value
    if pd.isna(value):
        return None, None, None

    value_str = str(value).strip()
    value_lower = value_str.lower()

    # remove unused symbol
    value_cleaned = value_str.replace('~', '').strip()

    # Word to number mapping
    word_to_num = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
        'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15,
        'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19, 'twenty': 20
    }

    # Best case: First just attempt to get the lone value
    try:
        return exact(value_str)
    except ValueError:
        # Not a bare number; fall through to the pattern-based rules.
        pass

    # Harder cases
    # Find "to" ranges like e.g. "3 to 7" (case-insensitive so "3 TO 7" also counts)
    to_range_match = re.search(r'(\d+)\s+to\s+(\d+)', value_cleaned, re.IGNORECASE)
    if to_range_match:
        return span(to_range_match.group(1), to_range_match.group(2))

    # Find ranges with dash e.g. "6-8"
    dash_range_match = re.search(r'(\d+)\s*-\s*(\d+)', value_cleaned)
    if dash_range_match:
        return span(dash_range_match.group(1), dash_range_match.group(2))

    # Find "at least X" pattern
    at_least_match = re.search(r'at least\s+(\d+)', value_lower)
    if at_least_match:
        return exact(at_least_match.group(1))

    # Find "about/around X" pattern (treat as exact)
    about_match = re.search(r'(?:about|around|approximately)\s+(\d+)', value_lower)
    if about_match:
        return exact(about_match.group(1))

    # Find "X:<ingredient list>" pattern
    colon_match = re.search(r'^(\d+)\s*:', value_cleaned)
    if colon_match:
        return exact(colon_match.group(1))

    # Check for word numbers like "about three".
    # Whole-word matching is required: a plain substring test reads "sixteen" as
    # "six" and finds "one" inside ingredient names such as "provolone".
    word_match = re.search(r'\b(' + '|'.join(word_to_num) + r')\b', value_lower)
    if word_match:
        return exact(word_to_num[word_match.group(1)])

    # Count bullet point lists (markdown lists with * or -, or a U+2022 bullet).
    # The bullet is written as an escape so the pattern does not depend on file encoding.
    bullet_matches = re.findall(r'^\s*[\*\-\u2022]\s+.+', value_str, re.MULTILINE)
    if len(bullet_matches) > 0:
        return exact(len(bullet_matches))

    # Edge cases surrounded by junk text
    lines = value_str.split('\n')
    cleaned_lines = [line.strip() for line in lines if line.strip()]

    # Filter out header/intro lines
    skip_patterns = [
        r'^i would expect',
        r'^it (would|should|might|could) (contain|have|include)',
        r'^\d+\s*:?\s*$',  # Just a number
    ]
    ingredient_lines = [
        line for line in cleaned_lines
        if not any(re.match(pattern, line.lower()) for pattern in skip_patterns)
    ]

    # If we have multiple ingredient lines, count them
    if len(ingredient_lines) >= 2:
        return exact(len(ingredient_lines))

    # If we have exactly 1 ingredient line and there was a newline in original
    if len(ingredient_lines) == 1 and '\n' in value_str:
        return 1.0, 1.0, 1.0

    # Count commas in ingredient list
    if ',' in value_str:
        items = [item.strip() for item in value_str.split(',') if item.strip()]
        ingredient_items = [item for item in items if len(item) >= 2]
        if len(ingredient_items) > 1:
            return exact(len(ingredient_items))

    # Single word/phrase ingredient
    if '\n' not in value_str and len(value_str.split()) <= 4 and not any(char.isdigit() for char in value_str):
        return 1.0, 1.0, 1.0

    # Extract any number found in the string as last resort
    numbers = re.findall(r'\d+', value_str)
    if numbers:
        return exact(numbers[0])

    return None, None, None


def apply_ingredient_parsing(df):
    """Add parsed ingredient-count columns to ``df`` and return it.

    Runs parse_ingredient_count on the ingredient question column and stores
    the three parts of each result in '<question> (min)', '<question> (max)'
    and 'ingredient_count'. The frame is modified in place; the returned
    object is the same frame.
    """
    column_name = 'How many ingredients would you expect this food item to contain?'
    parsed = df[column_name].apply(parse_ingredient_count)

    # Position of each value inside the parsed tuple -> destination column.
    targets = (f'{column_name} (min)', f'{column_name} (max)', 'ingredient_count')
    for position, target in enumerate(targets):
        df[target] = parsed.apply(lambda triple, position=position: triple[position])

    return df



In [None]:
# canonicalizing movies 


def clean_text_for_matching(text):
    """Clean text for matching.

    Lower-cases and trims the input, converts curly apostrophes to a straight
    apostrophe, removes double quotation marks (straight and curly), removes
    zero-width characters, collapses whitespace runs (including non-breaking
    spaces) into one space, strips one leading filler phrase such as
    "the movie ", and strips trailing punctuation.

    Returns '' for missing values.
    """
    if pd.isna(text):
        return ''

    text = str(text).lower().strip()

    # Normalize curly apostrophes / single quotes to a straight apostrophe FIRST.
    # Escapes are used so the intent survives tools that rewrite "smart" quotes.
    text = text.replace('\u2018', "'").replace('\u2019', "'")

    # Remove double quotation marks (straight and curly) but NOT apostrophes
    for quote in ('"', '\u201c', '\u201d'):
        text = text.replace(quote, '')

    # Remove zero-width characters. A non-breaking space is deliberately not removed
    # here: \s matches it below, so it becomes a normal space instead of gluing
    # two words together.
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove common filler words at start
    text = re.sub(r'^(i think of |thinking of |reminds me of |the movie |movie |film )', '', text)

    # Remove punctuation at the end
    text = re.sub(r'[.,;!?]+$', '', text)

    return text.strip()


def map_to_canonical_movie(text, movie_mapping):
    """
    Map input text to a canonical movie title.

    ``movie_mapping`` maps each canonical title to an iterable of variation
    strings. The cleaned text is tested against every variation: variations
    of one or two characters must appear as a whole word, longer ones may
    appear anywhere as a substring. Among all hits, the canonical title of the
    longest variation wins (the first one seen on ties).

    Returns np.nan when the input is missing, cleans to an empty string, or
    matches nothing.
    """
    if pd.isna(text):
        return np.nan

    haystack = clean_text_for_matching(text)
    if not haystack:
        return np.nan

    winner, winner_len = None, 0
    for canonical_title, variations in movie_mapping.items():
        for variation in variations:
            size = len(variation)
            if size <= 2:
                # Very short variations are too easy to hit inside other words.
                found = re.search(r'\b' + re.escape(variation) + r'\b', haystack) is not None
            else:
                found = variation in haystack
            if found and size > winner_len:
                winner, winner_len = canonical_title, size

    return winner if winner else np.nan


def apply_movie_mapping(df, column_name, output_column='movie_canonical', mapping_file='movie_mapping.json'):
    """Add a canonical movie title column to ``df`` and return it.

    Reads the title -> variations mapping from the UTF-8 JSON file
    ``mapping_file`` and stores map_to_canonical_movie's result for every
    value of ``column_name`` in ``output_column``. The frame is modified in
    place; the returned object is the same frame.
    """
    with open(mapping_file, 'r', encoding='utf-8') as handle:
        movie_mapping = json.load(handle)

    def to_canonical(answer):
        return map_to_canonical_movie(answer, movie_mapping)

    df[output_column] = df[column_name].apply(to_canonical)
    return df


def create_movie_onehot_columns(df, movie_column='movie_canonical', prefix='movie'):
    """Return ``df`` with one 0/1 indicator column per distinct movie title.

    Indicator columns are named '<prefix>_<title>'. Missing titles get no
    column of their own, so those rows are 0 everywhere. The input frame is
    left untouched; a new frame is returned.
    """
    # get_dummies yields booleans; cast to integer 1/0 before attaching.
    indicators = pd.get_dummies(df[movie_column], prefix=prefix, dummy_na=False).astype(int)
    return pd.concat([df, indicators], axis=1)


In [6]:
# Number of leading columns removed from the combined frame: the inserted `campus`
# column plus the eight that follow it.
# NOTE(review): presumably non-question metadata columns; confirm against the CSV headers.
N_LEADING_COLS_TO_DROP = 9

utsg_data_cleaned = clean_cols(UTSG_DATA)
utm_data_cleaned = clean_cols(UTM_DATA)

frames = [utsg_data_cleaned, utm_data_cleaned]
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it both
# campuses keep their own 0-based labels and a label lookup on `data` hits two rows.
data = pd.concat(frames, ignore_index=True)
data = data.drop(columns=data.columns[:N_LEADING_COLS_TO_DROP])
data.insert(0, 'ID', range(len(data)))  # one sequential ID per row; TODO: drop the columns
data.columns


Index(['ID',
       'Pizza From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
       'Pizza How many ingredients would you expect this food item to contain?',
       'Pizza In what setting would you expect this food to be served? Please check all that apply',
       'Pizza How much would you expect to pay for one serving of this food item?',
       'Pizza What movie do you think of when thinking of this food item?',
       'Pizza What drink would you pair with this food item?',
       'Pizza When you think about this food item, who does it remind you of?',
       'Pizza How much hot sauce would you add to this food item?',
       'Shawarma From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
       'Shawarma How many ingredients would you expect this food item to contain?',
       'Shawarma In what setting would you expect this food to be served? Please check 

In [7]:
# Wide -> long: three rows (Pizza, Shawarma, Sushi) for every row of `data`.
data_reshaped = reshape_food_data(data)

# Render the whole table inside a fixed-height scrollable box so the output stays compact.
display(HTML(f"""
<div style="max-height:400px; overflow-y:auto;">
{data_reshaped.to_html(index=False)}
</div>
"""))

ID,Food,"From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)",How many ingredients would you expect this food item to contain?,In what setting would you expect this food to be served? Please check all that apply,How much would you expect to pay for one serving of this food item?,What movie do you think of when thinking of this food item?,What drink would you pair with this food item?,"When you think about this food item, who does it remind you of?",How much hot sauce would you add to this food item?,n correct,n incorrect,score
0,Pizza,,,,,,,,,15,12,1.0
0,Shawarma,,,,,,,,,15,12,1.0
0,Sushi,,,,,,,,,15,12,1.0
1,Pizza,3.0,5,"Week day lunch,Week day dinner",10dollar,Cloudy with a chance of meatballs,Coca Cola,Friends,,18,9,1.0
1,Shawarma,3.0,3,"Week day lunch,Week day dinner",4 dollars,Dangal,Cola,Friends,,18,9,1.0
1,Sushi,3.0,4,Week day dinner,15,none,water,Friends,,18,9,1.0
2,Pizza,3.0,6,"Week day lunch,At a party,Late night snack",5,Cloudy with a Chance of Meatballs,Coke,Friends,A little (mild),15,12,1.0
2,Shawarma,4.0,8,"Week day lunch,Week day dinner,At a party,Late night snack",9,The Avengers,Hot water,Teachers,A lot (hot),15,12,1.0
2,Sushi,2.0,5,"Week day dinner,Weekend lunch,Weekend dinner,At a party,Late night snack",15,Spirited Away,Miso soup,Friends,A little (mild),15,12,1.0
3,Pizza,4.0,"bread, meet","Week day lunch,At a party,Late night snack",5$ for a large piece,All sort of american young boy movies,Coke,"Friends,Teachers,Strangers",,18,9,1.0


In [8]:
# NOTE: apply_ingredient_parsing adds its columns in place and returns the same object,
# so `df` and `data_reshaped` are two names for one frame from here on.
df = apply_ingredient_parsing(data_reshaped)
# Side-by-side view of the raw answer and the parsed min / max / estimate for eyeballing.
parse_check = df[[
    'How many ingredients would you expect this food item to contain?',
    'How many ingredients would you expect this food item to contain? (min)',
    'How many ingredients would you expect this food item to contain? (max)',
    'ingredient_count'
]]

display(HTML(f"""
<div style="max-height:400px; overflow-y:auto;">
{parse_check.to_html(index=False)}
</div>
"""))


How many ingredients would you expect this food item to contain?,How many ingredients would you expect this food item to contain? (min),How many ingredients would you expect this food item to contain? (max),ingredient_count
,,,
,,,
,,,
5,5.0,5.0,5.0
3,3.0,3.0,3.0
4,4.0,4.0,4.0
6,6.0,6.0,6.0
8,8.0,8.0,8.0
5,5.0,5.0,5.0
"bread, meet",2.0,2.0,2.0


In [None]:

# Map each free-text movie answer to a canonical title. The mapping is read from the
# function's default file, 'movie_mapping.json', relative to the working directory.
df = apply_movie_mapping(
    df, 
    'What movie do you think of when thinking of this food item?',
    output_column='movie_canonical'
)



In [10]:
# Sanity check: the parsed ingredient columns and `movie_canonical` should now be present.
df.columns

Index(['ID', 'Food',
       'From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
       'How many ingredients would you expect this food item to contain?',
       'In what setting would you expect this food to be served? Please check all that apply',
       'How much would you expect to pay for one serving of this food item?',
       'What movie do you think of when thinking of this food item?',
       'What drink would you pair with this food item?',
       'When you think about this food item, who does it remind you of?',
       'How much hot sauce would you add to this food item?', 'n correct',
       'n incorrect', 'score',
       'How many ingredients would you expect this food item to contain? (min)',
       'How many ingredients would you expect this food item to contain? (max)',
       'ingredient_count', 'movie_canonical'],
      dtype='str')

In [11]:

# Compare each raw movie answer with the canonical title it was mapped to (index shown
# so individual rows can be addressed by label later).
parse_check = df[['What movie do you think of when thinking of this food item?', 'movie_canonical']]

display(HTML(f"""
<div style="max-height:400px; overflow-y:auto;">
{parse_check.to_html()}
</div>
"""))


Unnamed: 0,What movie do you think of when thinking of this food item?,movie_canonical
0,,
1,,
2,,
3,Cloudy with a chance of meatballs,cloudy with a chance of meatballs
4,Dangal,dangal
5,none,
6,Cloudy with a Chance of Meatballs,cloudy with a chance of meatballs
7,The Avengers,avengers
8,Spirited Away,spirited away
9,All sort of american young boy movies,american


In [12]:
# List the rows that received no canonical title (missing answers as well as
# answers that matched nothing in the mapping).
parse_check = df.loc[
    df['movie_canonical'].isna(),
    ['What movie do you think of when thinking of this food item?', 'movie_canonical']
]

display(HTML(f"""
<div style="max-height:400px; overflow-y:auto;">
{parse_check.to_html()}
</div>
"""))


Unnamed: 0,What movie do you think of when thinking of this food item?,movie_canonical
0,,
1,,
2,,
5,none,
10,"None...\n\nOk tbf i think of my life at uoft, if life is a movie",
21,,
22,,
23,,
28,Nothing,
32,,


In [13]:
# Hard code problematic entries
# Keys are row labels of `df`, so they are only valid while the input CSVs and the row
# order produced by the cells above stay unchanged.
# Titles are written in lowercase with straight apostrophes, the same form as the
# automatically mapped titles shown above, so an override does not end up in a
# separate one-hot column that differs only by casing, quote style or spelling.
# NOTE(review): confirm each title against the keys of movie_mapping.json.
MANUAL_MOVIE_OVERRIDES = {
    85: 'shawarma legend',
    570: "five nights at freddy's",
    782: 'who am i?',
    783: 'us',
    786: 'documentary',
    989: 'billions',
    1179: 'ratatouille',
    1351: "ferris bueller's day off",
    1520: "kiki's delivery service",
}

for row_label, title in MANUAL_MOVIE_OVERRIDES.items():
    # Assigning through .loc with an unknown label would silently append a new row.
    assert row_label in df.index, f"row {row_label} not found; overrides are out of date"
    df.loc[row_label, 'movie_canonical'] = title




In [14]:
# Re-run the unmapped-rows listing after the manual overrides to see what is still missing.
parse_check = df.loc[
    df['movie_canonical'].isna(),
    ['What movie do you think of when thinking of this food item?', 'movie_canonical']
]

display(HTML(f"""
<div style="max-height:400px; overflow-y:auto;">
{parse_check.to_html()}
</div>
"""))


Unnamed: 0,What movie do you think of when thinking of this food item?,movie_canonical
0,,
1,,
2,,
5,none,
10,"None...\n\nOk tbf i think of my life at uoft, if life is a movie",
21,,
22,,
23,,
28,Nothing,
32,,


In [15]:
# Column list before one-hot encoding (the overrides above change values, not columns).
df.columns

Index(['ID', 'Food',
       'From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
       'How many ingredients would you expect this food item to contain?',
       'In what setting would you expect this food to be served? Please check all that apply',
       'How much would you expect to pay for one serving of this food item?',
       'What movie do you think of when thinking of this food item?',
       'What drink would you pair with this food item?',
       'When you think about this food item, who does it remind you of?',
       'How much hot sauce would you add to this food item?', 'n correct',
       'n incorrect', 'score',
       'How many ingredients would you expect this food item to contain? (min)',
       'How many ingredients would you expect this food item to contain? (max)',
       'ingredient_count', 'movie_canonical'],
      dtype='str')

In [16]:
# Append one 0/1 `movie_<title>` column per canonical title.
# NOTE: `df` is rebuilt from itself, so re-running only this cell appends a second copy
# of every movie_* column; re-run from the reshape cell instead.
df = create_movie_onehot_columns(df, movie_column='movie_canonical', prefix='movie')


In [17]:
# Column list after one-hot encoding; the movie_* indicator columns are appended at the end.
df.columns

Index(['ID', 'Food',
       'From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)',
       'How many ingredients would you expect this food item to contain?',
       'In what setting would you expect this food to be served? Please check all that apply',
       'How much would you expect to pay for one serving of this food item?',
       'What movie do you think of when thinking of this food item?',
       'What drink would you pair with this food item?',
       'When you think about this food item, who does it remind you of?',
       'How much hot sauce would you add to this food item?',
       ...
       'movie_whale', 'movie_wicked', 'movie_wizards of waverly place',
       'movie_wolf of wall street', 'movie_wolverine', 'movie_yakuza',
       'movie_you don't mess with the zohan', 'movie_your name',
       'movie_youtube', 'movie_zootopia'],
      dtype='str', length=350)

In [18]:
# Spot-check a single indicator column: 1 where the canonical title is 'avengers', else 0.
df['movie_avengers']

0       0
1       0
2       0
3       0
4       0
       ..
1531    1
1532    0
1533    0
1534    1
1535    0
Name: movie_avengers, Length: 1536, dtype: int64

In [19]:
# Expand comma-separated answers into one 0/1 column per distinct option.
# Rows with a missing answer come out as all zeros (see the first rows of `remind` / `setting` below).
hot_sauce = df['How much hot sauce would you add to this food item?'].str.get_dummies(sep=',')
remind = df['When you think about this food item, who does it remind you of?'].str.get_dummies(sep=',')
setting = df['In what setting would you expect this food to be served? Please check all that apply'].str.get_dummies(sep=',')
scale = df['From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)'] # already dtype('float64'), so no conversion is needed


In [20]:
# Inspect the indicator columns built from the "who does it remind you of" answers.
remind

Unnamed: 0,Friends,Parents,Siblings,Strangers,Teachers
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
1531,1,0,0,0,0
1532,1,0,1,1,0
1533,1,1,1,0,1
1534,1,0,0,1,0


In [21]:
# Inspect the indicator columns built from the "in what setting" answers.
setting

Unnamed: 0,At a party,Late night snack,Week day dinner,Week day lunch,Weekend dinner,Weekend lunch
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,1,1,0,0
4,0,0,1,1,0,0
...,...,...,...,...,...,...
1531,0,1,1,1,1,1
1532,0,1,0,1,0,1
1533,1,0,1,1,0,1
1534,0,0,0,1,0,1
