In [8]:
import string
from typing import Iterable, Optional

import pandas as pd
from Levenshtein import distance

In [9]:
restaurant_info = pd.read_csv('restaurant_info.csv')

# Distinct, non-missing values of each preference column in our dataset;
# these are the keywords user utterances get matched against.
food_types, areas, prices = (
    restaurant_info[column].dropna().unique()
    for column in ('food', 'area', 'pricerange')
)

In [10]:
def extract_preference(utterance: str, category_list: Iterable[str], threshold_distance: int) -> Optional[str]:
    """
    Find the keyword from ``category_list`` that best matches part of ``utterance``.

    The utterance is lower-cased, split on whitespace, and every token is
    stripped of leading/trailing punctuation (so ``'cheep,'`` is compared as
    ``'cheep'``). Each keyword is lower-cased for the comparison and matched,
    by Levenshtein distance, against

    * every single token, and
    * for keywords made of several words (e.g. ``'north american'``), every
      run of that many consecutive tokens joined by a single space.

    Args:
        utterance: Raw user utterance.
        category_list: Candidate keywords for one preference category. It is
            consumed exactly once, so any iterable of strings is accepted.
        threshold_distance: Largest Levenshtein distance that still counts as
            a match.

    Returns:
        The best-matching keyword exactly as it appears in ``category_list``,
        or ``None`` when no candidate is within ``threshold_distance``. When
        several candidates share the smallest distance, the one examined last
        wins, i.e. matches later in the utterance take precedence.
    """
    # Drop punctuation glued to words; tokens that were pure punctuation vanish.
    tokens = [
        token
        for token in (raw.strip(string.punctuation) for raw in utterance.lower().split())
        if token
    ]

    # Normalise every keyword once instead of once per utterance token.
    candidates = []
    for keyword in category_list:
        keyword_words = keyword.lower().split()
        if not keyword_words:
            # An empty keyword would "match" any sufficiently short token.
            continue
        # Always try single tokens (catches e.g. a missing space), plus a
        # window as long as the keyword itself when it spans several words.
        window_sizes = sorted({1, len(keyword_words)})
        candidates.append((keyword, ' '.join(keyword_words), window_sizes))

    best_keyword = None
    # Running minimum; kept separate so the caller's argument is not rebound.
    best_distance = threshold_distance
    for start in range(len(tokens)):
        for keyword, normalised, window_sizes in candidates:
            for size in window_sizes:
                if start + size > len(tokens):
                    continue
                candidate_distance = distance(' '.join(tokens[start:start + size]), normalised)
                if candidate_distance <= best_distance:
                    best_distance = candidate_distance
                    best_keyword = keyword

    return best_keyword

In [11]:
def extract_all_preferences(utterance: str, food_types: list, areas: list, prices: list,
                            threshold_distance: int = 2) -> dict[str, Optional[str]]:
    """
    Extracts all preferences from a single 'inform' utterance at once.

    Each category is resolved independently by ``extract_preference`` using
    the same maximal Levenshtein distance.

    Args:
        utterance: The user utterance to analyse.
        food_types: Known food-type keywords.
        areas: Known area keywords.
        prices: Known price-range keywords.
        threshold_distance: Maximal Levenshtein distance accepted as a match.

    Returns:
        A dict with the keys ``'food_type'``, ``'area'`` and ``'price'``; each
        value is whatever ``extract_preference`` returned for that category
        (``None`` when nothing matched).

    TO DISCUSS:
    The exercise description gives 3 as the maximal Levenshtein distance, but
    that looks too permissive, so the default here is 2. Examples where a
    distance of 3 still finds a preference (word in utterance -> preference):
        food -> seafood
        english -> polish
        care -> centre
    """
    categories = {'food_type': food_types, 'area': areas, 'price': prices}
    return {
        slot: extract_preference(utterance, keywords, threshold_distance)
        for slot, keywords in categories.items()
    }

In [12]:
# NOTE(review): leftover idea, presumably markers for "I don't care" answers — not used yet.
# idc = ['t care', 'any', 't matter']

# Smoke test: misspelled price ('cheep') and area ('sothu') should still be recognised.
sample_utterance = 'cheep, chinese food in amazing sothu of Utrecht'
preferences = extract_all_preferences(
    utterance=sample_utterance,
    food_types=food_types,
    areas=areas,
    prices=prices,
)
preferences

{'food_type': 'chinese', 'area': 'south', 'price': 'cheap'}