In [245]:
import pandas as pd
import numpy as np

import openai
import tiktoken

import backoff
import time

from tqdm import tqdm


In [246]:
model_openai = "gpt-3.5-turbo"
# "gpt-4"

In [247]:
def get_completion(prompt='', model="gpt-3.5-turbo", temperature=0.8, max_tokens=20):
    messages = [{"role": "user", "content": prompt}]

    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens
        )
        response = response.choices[0].message["content"]
    except openai.error.APIError as e:
        print(f"OpenAI API Error: {e}")
        response = "OpenAI API Error"
    except openai.error.APIConnectionError as e:
        print(f"OpenAI API Connection Error: {e}")
        response = "OpenAI API Connection Error"
    except Exception as e:
        # print(f"Exception: {e}")
        response = "Other Error"
    return response

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def get_completion_with_backoff(**kwargs):
    time.sleep(0.1)
    return get_completion(**kwargs)

In [248]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [249]:
openai.api_key  = "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"

In [250]:
# get unique target

In [251]:
def create_prompt(name, components_name, material_descriptions):
    
    materials_description_sentences = " ".join([f"'{material}' means {desc.lower()}" for material, desc in material_descriptions.items()])
    
    if pd.isna(name):
        context = f"The component's name is not given, but its CAD assembly name is '{components_name}'."
    elif pd.isna(components_name):
        context = f"The component's name is '{name}', but its CAD assembly name is not given."
    else:
        context = f"The component's name is '{name}' and its CAD assembly name is '{components_name}'."
    
    prompt = f"""
    You're asked to determine the suitable material category for a component based on its name and CAD assembly name.
    {context}
    {materials_description_sentences}
    Which material is most appropriate? Only respond with one of the categories: 'Aluminum', 'Ferrous Metal', 'Steel', 'Non-Ferrous Metal', 'Other', 'Plastic', or 'Wood'. If unsure, reply "NaN".
    """

    return prompt


In [252]:
# Load your data
df_main = pd.read_pickle('df_main.pkl').head(3000)

In [253]:
material_categories = df_main["material_category"].unique().tolist()

In [254]:
# Adjust the material_descriptions dictionary to be more concise:
material_descriptions = {
    'Aluminum': 'Aluminum-based metal (e.g., Aluminum alloy).',
    'Ferrous Metal': 'Ferrous metal excluding carbon steel (e.g., Cast iron).',
    'Steel': 'Carbon steel (e.g., Stainless steel).',
    'Non-Ferrous Metal': 'Non-Ferrous metals like Platinum, silver.',
    'Other': 'Materials like glass, fabric, ceramic.',
    'Plastic': 'Plastic.',
    'Wood': 'Natural or engineered wood.'
}

In [255]:
valid_rows = df_main.dropna(subset=['name', 'components_name'], how='all')
print(valid_rows.shape)


(2277, 33)


In [256]:
# Find the first row where at least one of 'name' and 'components_name' is not NaN
sample_row = df_main.dropna(subset=['name', 'components_name'], how='all').iloc[0]

sample_prompt = create_prompt(sample_row['name'], sample_row['components_name'], material_descriptions)
print(sample_prompt)



    You're asked to determine the suitable material category for a component based on its name and CAD assembly name.
    The component's name is not given, but its CAD assembly name is 'Hinge'.
    'Aluminum' means aluminum-based metal (e.g., aluminum alloy). 'Ferrous Metal' means ferrous metal excluding carbon steel (e.g., cast iron). 'Steel' means carbon steel (e.g., stainless steel). 'Non-Ferrous Metal' means non-ferrous metals like platinum, silver. 'Other' means materials like glass, fabric, ceramic. 'Plastic' means plastic. 'Wood' means natural or engineered wood.
    Which material is most appropriate? Only respond with one of the categories: 'Aluminum', 'Ferrous Metal', 'Steel', 'Non-Ferrous Metal', 'Other', 'Plastic', or 'Wood'. If unsure, reply "NaN".
    


In [257]:
encoding = tiktoken.encoding_for_model(model_openai)
num_tokens_from_string(sample_prompt, "cl100k_base")

191

In [258]:
# Step 1: Drop duplicates based on 'name' and 'components_name'
unique_df = df_main.drop_duplicates(subset=['name', 'components_name'])
print(f"Unique rows based on 'name' and 'components_name': {unique_df.shape[0]}")

predictions = []

Unique rows based on 'name' and 'components_name': 924


In [259]:
# Step 2: Create predictions for this deduplicated dataframe
for index, row in tqdm(unique_df.iterrows(), total=unique_df.shape[0], desc="Predicting"):
    if pd.isna(row['name']) and pd.isna(row['components_name']):
        predictions.append(np.nan)
        continue
    
    prompt = create_prompt(row['name'], row['components_name'], material_descriptions)
    predicted_material = get_completion_with_backoff(prompt=prompt)
    predictions.append(predicted_material.strip("Material Choice: "))

Predicting: 100%|██████████| 924/924 [02:44<00:00,  5.63it/s]


In [260]:
unique_df.loc[:, 'predicted_material'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df.loc[:, 'predicted_material'] = predictions


In [261]:
# Step 3: Merge the predictions back to the original dataframe
df_main = df_main.merge(unique_df[['name', 'components_name', 'predicted_material']], on=['name', 'components_name'], how='left')

In [262]:
# Map predicted materials back to original category names (only if you need them in the original format)
material_map = {
    'Aluminum': 'Metal_Aluminum',
    'Ferrous Metal': 'Metal_Ferrous',
    'Steel': 'Metal_Ferrous_Steel',
    'Non-Ferrous Metal': 'Metal_Non-Ferrous',
    'Other': 'Other',
    'Plastic': 'Plastic',
    'Wood': 'Wood'
}

df_main['original_category_prediction'] = df_main['predicted_material'].map(material_map)

In [263]:
df_main["predicted_material"].unique().tolist()

['NaN', 'Other E', nan]

In [264]:
unique_counts = df_main["predicted_material"].value_counts()

print(unique_counts)


predicted_material
Other E    2276
NaN           1
Name: count, dtype: int64


In [265]:
# Evaluate accuracy
accuracy = (df_main['original_category_prediction'] == df_main['material_category']).mean()
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.00
