In [1]:
import polars as pl
import re

In [2]:
# Location of the scraped instant-noodle CSV.
# NOTE(review): absolute, machine-specific path — left unchanged here; consider
# making it relative to a configurable data directory.
path_to_data = 'C:/myDocuments/projects/instant-noodles/0_webscrape_data/data/instant_noodles.csv'

# Load the scraped rows; the file is decoded as latin1.
raw_data = pl.read_csv(
    path_to_data,
    encoding='latin1',
)

In [3]:
# Parse the scraped "price" and "weight" text columns into numeric columns.
#
# The patterns are anchored on the number itself instead of a greedy ".*":
#   * price    - first integer or decimal number in the string. The previous
#                pattern swallowed any text after the decimal point (which
#                breaks the float cast) and skipped whole-number prices.
#   * weight   - first integer or decimal number directly followed by "g". The
#                previous pattern kept only the fractional digits of a decimal
#                weight (e.g. "85.5g" -> 5).
#   * servings - first number after the first comma that follows the gram
#                weight. The previous pattern's greedy ".*" left only the LAST
#                digit of the string for the group (e.g. "10" -> 0, which then
#                caused a division by zero below).
# NOTE(review): weights expressed in other units (e.g. "kg") are still not
# matched and come out as null — confirm against the scraped data.
extracted_data = raw_data.with_columns(
        pl.col("price").str.extract(r"(\d+(?:\.\d+)?)", 1).cast(pl.Float32).alias("price"), # price in GBP
        pl.col("weight").str.extract(r"(\d+(?:\.\d+)?)g", 1).cast(pl.Float32).alias("weight"), # weight in grams
        # Rows without a serving count are treated as a single serving
        pl.col("weight").str.extract(r"\d+g.*?,\D*(\d+)", 1).cast(pl.Int32).fill_null(1).alias("servings"), # Servings if multipack
    )

# Convert price and weight to per-serving values
processed_data = extracted_data.with_columns(
    (pl.col("price") / pl.col("servings")).round(2).alias("price"),
    (pl.col("weight") / pl.col("servings")).round(0).alias("weight"),
)


In [4]:
# Known makers grouped by country; turned into hint sentences for the model
mapping = {
    'Japan': [
        'Sanyo', 'Nissin Foods', 'Itsuki', 'Hikari Miso', 'Yamadai',
        'Higashi Foods', 'Higashimaru', 'Yamamoto Seifun', 'Otafuku',
        'Maruchan', 'Daikoku Foods',
    ],
    'Singapore': ['Myojo'],
    'South Korea': ['Nong Shim', 'Samyang Foods', 'GARAK', 'Young Poong Foods'],
    'Vietnam': ['Acecook'],
}

# Opening sentence of the hint text
assistant_prompt = 'Here are some useful information that might aid you in getting the region correct. '

# One sentence per country, listing its makers separated by ", " and closed with ". "
assistant_prompt += ''.join(
    'The following maker(s) are from ' + country_name + ': ' + ', '.join(maker_names) + '. '
    for country_name, maker_names in mapping.items()
)

# Extra hints about the region.
# NOTE(review): the prompt wording below (including its typos) is reproduced
# exactly so the text sent to the model is unchanged.
assistant_prompt += 'Another useful tip, when product name has ramen in it, its region is usually Japan. Do note that when you see brackets in the product name, e.g. "(USA)", this means the noodle is distributed in that country, not necessarily meaning the instant noodles is from that region. Besides that, if region is Korea, please update these to South Korea. '

# Extra hints about the spicy / soupy / base-ingredient answers
assistant_prompt += 'Besides, udon and ramen noodles usually means the noodles originate from Japan. When deciding whether the noodles is spicy, soupy, or what type of base ingredietn it has. Please refer to the product name and description of the product. It should help you make a good decision. If undecided, when ramen is seen in the product name or description, it is usually one of these three: Vegetable, Pork or Chicken. Sesame oil flavoured instant noodles are considered as having Vegetable as the base ingredient. '


In [5]:
import openai
import os

# Ask the chat model to label every product (region, spicy, soupy, base
# ingredient) and collect the raw reply text, one entry per row, in
# `all_outputs`.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Number of rows of data
nRow = processed_data.shape[0]

# Create the API client once and reuse it for every request
client = openai.OpenAI()

# The answer-format instructions and the developer message are identical for
# every product, so they are built once outside the loop.
instructions_prompt = 'please give your answer in the following format: [*Which region is the noodle inspired from*, *Spicy - Yes or No*, *Soupy - Yes or No*, *Which describes the ingredient base best, please select one of the options - Seafood/Chicken/Beef/Lamb/Pork/Vegetable/Vegan*]. For example, an output could look like [Japan, No, Yes, Chicken].'
developer_prompt = "You are a helping to identify some characteristics of different kinds of instant noodles. Please try to answer the questions as correctly as possible based on the name, description, and maker of the instant noodles provided to you. For each product, "+ instructions_prompt

# Materialise each needed column once instead of re-converting the whole
# column to a list on every iteration.
products = processed_data['product'].to_list()
makers = processed_data['maker'].to_list()
descriptions = processed_data['description'].to_list()

all_outputs = []

for product, maker, description in zip(products, makers, descriptions):

    # Coerce to text so missing values do not break the concatenation below
    product = str(product)
    maker = str(maker)
    description = str(description)

    # Create user prompt
    user_prompt = 'Name of instant noodles: ' + product + '. Description: ' + description + '. Maker: ' + maker + '.'

    # Request a chat completion for this product.
    # NOTE(review): the hint text is sent with the "assistant" role after the
    # user message; order and roles are kept as-is so the model input is
    # unchanged — confirm this placement is intended.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "developer", "content": developer_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_prompt}
        ]
    )

    # Log responses from gpt model
    all_outputs.append(response.choices[0].message.content)


In [19]:
# Attach the raw model replies to the processed rows
gpt_output = processed_data.with_columns(
    pl.Series('gpt_outputs', all_outputs),
)

# Each pattern captures one of the four ", "-separated fields found between
# square brackets in the reply; the key is the name of the resulting column.
field_patterns = {
    "region": r"\[(.*), .*, .*, .*\]",
    "spicy": r"\[.*, (.*), .*, .*\]",
    "soupy": r"\[.*, .*, (.*), .*\]",
    "base": r"\[.*, .*, .*, (.*)\]",
}

# Captured fields are normalised the same way: spaces removed, lowercased
parsed_fields = [
    pl.col('gpt_outputs')
    .str.extract(pattern, 1)
    .str.replace_all(" ", "")
    .str.to_lowercase()
    .alias(field_name)
    for field_name, pattern in field_patterns.items()
]

output_data = gpt_output.with_columns(
    # Maker gets the same normalisation, replacing the original column
    pl.col('maker').str.replace_all(" ", "").str.to_lowercase(),
    *parsed_fields,
)

# Keep only the columns of interest for the final dataset
columns_of_interest = ['product', 'price', 'weight', 'maker', 'region', 'spicy', 'soupy', 'base']
final_data = output_data.select(columns_of_interest)

# Persist the trimmed dataset and the complete one (with raw replies)
final_data.write_csv('data/processed_data.csv')
output_data.write_csv('data/all_output_data.csv')