In [1]:
%pip install -qU pandas python-dotenv openai numpy scikit-learn langgraph langchain-openai langchain-core pydantic

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Set up OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
import pandas as pd

df = pd.read_csv('./utils/AppStore_Data.csv')

# Get first 500 rows
df = df.head(500)

df.head()

Unnamed: 0,id,name,size,currency,price,rating_count_tot,user_rating,ver,prime_genre,app_desc
0,1134867821,NOT ALONE Story of a bird,116.121094,USD,2.99,1,3.0,1.1,Games,! Now on X'mas special sales (~2017 Jan. 3rd) ...
1,1145500015,Drifty Chase,180.987305,USD,0.0,1631,4.5,1.7,Games,!! 2016 Very Big Indie Pitch finalist at PGCon...
2,823804745,Multiplayer Terraria edition,15.058594,USD,3.99,6981,4.0,1.5,Games,!!! First and the only app which allows to pla...
3,949876643,Lumyer augmented reality camera effects,116.251953,USD,0.0,3896,4.5,4.0.1,Photo & Video,!!! NEW !!! TAP EFFECTS\nTry the new Tap Effe...
4,1086929344,Dancing with the Stars The Official Game,334.543945,USD,0.0,1098,4.0,2.7,Games,!!! Please note this app does not currently su...


In [4]:
def get_embeddings(texts: list[str], model="text-embedding-3-large"):
    embeddings = openai.embeddings.create(input=texts, model=model)
    return [embedding.embedding for embedding in embeddings.data]

In [5]:
# list of app descriptions
app_descriptions = df['app_desc'].tolist()
embeddings = get_embeddings(app_descriptions)

embedded_df = df.copy()
embedded_df['embeddings'] = embeddings
embedded_df.head()

Unnamed: 0,id,name,size,currency,price,rating_count_tot,user_rating,ver,prime_genre,app_desc,embeddings
0,1134867821,NOT ALONE Story of a bird,116.121094,USD,2.99,1,3.0,1.1,Games,! Now on X'mas special sales (~2017 Jan. 3rd) ...,"[-0.029529482126235962, -0.042813315987586975,..."
1,1145500015,Drifty Chase,180.987305,USD,0.0,1631,4.5,1.7,Games,!! 2016 Very Big Indie Pitch finalist at PGCon...,"[0.010901196859776974, -0.041957493871450424, ..."
2,823804745,Multiplayer Terraria edition,15.058594,USD,3.99,6981,4.0,1.5,Games,!!! First and the only app which allows to pla...,"[-0.04006137698888779, -0.024302134290337563, ..."
3,949876643,Lumyer augmented reality camera effects,116.251953,USD,0.0,3896,4.5,4.0.1,Photo & Video,!!! NEW !!! TAP EFFECTS\nTry the new Tap Effe...,"[-0.013417327776551247, -0.009647566825151443,..."
4,1086929344,Dancing with the Stars The Official Game,334.543945,USD,0.0,1098,4.0,2.7,Games,!!! Please note this app does not currently su...,"[-0.018154075369238853, -0.010654659010469913,..."


In [12]:
from pydantic.v1 import BaseModel, Field
from typing import List, Tuple
from langchain_core.tools import tool


class Document(BaseModel):
    id: int
    name: str
    size: float
    price: float
    currency: str
    rating_count_tot: int
    user_rating: float
    ver: str
    prime_genre: str
    app_desc: str

# Function to perform semantic search


def semantic_sort(query, df=embedded_df):
    """Sort the dataframe based on the cosine similarity of the query and the embeddings"""
    query_embedding = get_embeddings(query)[0]
    new_df = df.copy()
    new_df['similarity'] = new_df['embeddings'].apply(
        lambda x: cosine_similarity([x], [query_embedding])[0][0])
    results = new_df.sort_values('similarity', ascending=False)
    return results


def row_to_string(row):
    """Convert a row to a string"""
    app_string = f"""App Name: {row.name}
Size: {round(row.size, 2)} MB
Price: {row.price} {row.currency}
Rating Count: {row.rating_count_tot}
User Rating: {row.user_rating}
Version: {row.ver}
Genre: {row.prime_genre}
Description: {row.app_desc}"""
    return app_string


def df_to_string(df):
    """Convert a dataframe to a string"""
    return "\n----\n".join([row_to_string(row) for row in df.itertuples()])


def df_to_documents(df):
    """Convert a dataframe to a list of documents"""
    return [Document(**row._asdict()) for row in df.itertuples()]


top_k = 5


class SemanticSearch(BaseModel):
    query: str = Field(
        description="The query to search for in the App Store Data")

# @tool("semantic_search", args_schema=SemanticSearch, parse_docstring=True)


@tool
def semantic_search(query: str):
    """Perform a semantic search on the App Store Data"""
    top_df = semantic_sort(query).head(top_k)
    results_str = df_to_string(top_df)
    documents = df_to_documents(top_df)
    return {"results": results_str, "documents": documents}

In [7]:
results = semantic_search("What is the best app for learning Spanish?")
print(results["results"])

  results = semantic_search("What is the best app for learning Spanish?")


App Name: FreeSpeech  Build Language and Learn Grammar
Size: 188.13 MB
Price: 9.99 USD
Rating Count: 15
User Rating: 4.0
Version: 1.4
Genre: Education
Description: *** FEATURED BY APPLE - #1 BEST NEW APP! ***

FreeSpeech is an app to learn the English language! Drag and rearrange picture tiles, and FreeSpeech will convert them into a perfectly grammatical English sentence. 
Explore how tenses, sentence forms, parts of speech, and other grammatical properties change the structure of English -- and it's all visual! 

FreeSpeech is the perfect app for anyone who is learning the English language - whether you are 
- Learning English from a second language
- introducing someone to English grammar
- Teaching children English

Developed by an interdisciplinary team including SLPs, educators, and researchers with over 100 years of collective experience, FreeSpeech is the most advanced language learning technology currently available. The technology behind FreeSpeech was the subject of a TED ta

## Graph

In [13]:
from typing import TypedDict, Annotated, List

from langchain_core.messages import HumanMessage, BaseMessage, AIMessage, SystemMessage, ToolMessage
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import END, START, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode, ToolExecutor, ToolInvocation
from langgraph.graph.message import AnyMessage, add_messages



class AgentState(TypedDict):
    messages: Annotated[list[AnyMessage], add_messages]
    queries: List[str]
    documents: List[Document]
    answer: str


tools = [semantic_search]

tool_executor = ToolExecutor(tools)

model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
llm = model.bind_tools(tools, strict=True)


  tool_executor = ToolExecutor(tools)


In [14]:
import json

# Define the function that determines whether to continue or not


def should_continue(state: AgentState):
    messages = state['messages']
    last_message = messages[-1]
    # If the LLM makes a tool call, then we route to the "tool_node"
    if isinstance(last_message, AIMessage) and last_message.additional_kwargs.get("tool_calls", []):
        return "tool_node"
    # Otherwise, we stop
    return END

# Define the function that calls the model


def call_model(state: AgentState):
    messages = state['messages']
    response = llm.invoke(messages)

    # If the response has content, then we return it with the output field
    if response.content:
        return {"messages": [response], "answer": response.content}

    return {"messages": [response]}


def search_node(state: AgentState):
    tool_calls = state['messages'][-1].tool_calls
    tool_messages = []
    all_documents = []
    all_queries = []
    
    for tool_call in tool_calls:
        tool_invocation = ToolInvocation(
            tool=tool_call["name"], tool_input=tool_call["args"])
        search_results = tool_executor.invoke(tool_invocation)
        all_queries.append(tool_call["args"]["query"])
        tool_message = ToolMessage(
            content=search_results["results"], name=tool_call["name"], tool_call_id=tool_call["id"])
        tool_messages.append(tool_message)
        all_documents.extend(search_results["documents"])

    return {"messages": tool_messages, "documents": all_documents, "queries": all_queries}


# Define a new graph
workflow = StateGraph(AgentState)

# Define the two nodes we will cycle between
workflow.add_node("call_model", call_model)
workflow.add_node("tool_node", search_node)

# Set the entrypoint as `call_model`
workflow.set_entry_point("call_model")

# We now add a conditional edge
workflow.add_conditional_edges(
    # First, we define the start node. We use `call_model`.
    "call_model",
    # Next, we pass in the function that will determine which node is called next.
    should_continue,
    {
        "tool_node": "tool_node",
        END: END
    }
)

# We now add a normal edge from `tool_node` to `call_model`.
workflow.add_edge("tool_node", "call_model")


# Compile the graph
app = workflow.compile()

# Example usage
messages = [HumanMessage(content="What is the best app for learning Spanish?")]
result = app.invoke({"messages": messages})
result

  tool_invocation = ToolInvocation(


{'messages': [HumanMessage(content='What is the best app for learning Spanish?', id='ae525025-f129-4e4b-9476-654d6ea15543'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_Qn1QpUZWY6wx5Q84D3DiP98b', 'function': {'arguments': '{"query":"best app for learning Spanish"}', 'name': 'semantic_search'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 56, 'total_tokens': 74}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_f33667828e', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-b7805346-c4c4-449d-ad55-9c139a08d390-0', tool_calls=[{'name': 'semantic_search', 'args': {'query': 'best app for learning Spanish'}, 'id': 'call_Qn1QpUZWY6wx5Q84D3DiP98b', 'type': 'tool_call'}], usage_metadata={'input_tokens': 56, 'output_tokens': 18, 'total_tokens': 74}),
  ToolMessage(content='App Name: FreeSpeech  Build Language and Learn Grammar\nSize: 188.13 MB\nPrice: 9.99 USD\nRating Count

In [10]:
tool_message = result['messages'][2]