In [1]:
%pip install -qU pandas tiktoken langchain_core langchain-openai langchain-google-genai matplotlib langsmith python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from langsmith import Client
from dotenv import load_dotenv

# Load environment variables from a local .env file before building the client
# (the hosted LangSmith API refuses a Client that has no API key).
load_dotenv()

client = Client()

# Raw App Store metadata and the long-form descriptions, joined on the app id.
# A left join keeps apps that have no description row (their app_desc is NaN).
app_data_df = pd.read_csv('./data/AppleStore.csv')
descriptions_df = pd.read_csv('./data/appleStore_description.csv')
full_app_df = pd.merge(app_data_df, descriptions_df, on='id', how='left')

# Build the cleaned frame in one chain so the cell is idempotent on re-run.
new_df = (
    # Keep only the columns the report needs.
    full_app_df[['id', 'track_name_x', 'size_bytes_x', 'currency',
                 'price', 'rating_count_tot', 'user_rating', 'ver', 'prime_genre', 'app_desc']]
    .rename(columns={'track_name_x': 'name', 'size_bytes_x': 'size'})
    .assign(
        # Bytes -> MB (1 MB = 1024 * 1024 bytes).
        size=lambda frame: frame['size'] / (1024 * 1024),
        # Strip everything except ASCII letters, digits and whitespace from the name.
        name=lambda frame: frame['name'].str.replace(r"[^a-zA-Z0-9\s]+", "", regex=True),
    )
    # Drop apps whose cleaned name is blank.
    .loc[lambda frame: frame['name'].str.strip() != ""]
    # Drop apps that have never been rated.
    .loc[lambda frame: frame['rating_count_tot'] != 0]
    # Drop apps whose description contains CJK ideographs (U+4E00-U+9FFF).
    # na=True also drops apps with a missing description.
    .loc[lambda frame: ~frame['app_desc'].str.contains(r'[\u4e00-\u9fff]', na=True)]
    # Order rows by description text; this order decides which apps fall inside a token-limited slice.
    .sort_values(by='app_desc')
)

# Display the cleaned DataFrame
new_df.head()


LangSmithUserError: API key must be provided when using hosted LangSmith API

## Utils

In [None]:
from tiktoken import get_encoding


def count_tokens(text: str):
    """Return how many `cl100k_base` tokens `text` encodes to."""
    token_ids = get_encoding("cl100k_base").encode(text)
    return len(token_ids)


def row_to_string(row):
    """Render one app row as a multi-line `Label: value` text block.

    `row` must expose the attributes name, size, price, currency,
    rating_count_tot, user_rating, ver, prime_genre and app_desc
    (e.g. a named tuple produced by `DataFrame.itertuples()`).
    The size is rounded to two decimals and suffixed with "MB".
    """
    lines = [
        f"AppName: {row.name}",
        f"App Size: {round(row.size, 2)} MB",
        f"App Price: {row.price} {row.currency}",
        f"App Rating Count: {row.rating_count_tot}",
        f"App User Rating: {row.user_rating}",
        f"App Version: {row.ver}",
        f"App Genre: {row.prime_genre}",
        f"App Description: {row.app_desc}",
    ]
    return "\n".join(lines)


def slice_df_by_tokens(df: pd.DataFrame, max_total_tokens: int):
    """
    Slices a dataframe by the number of tokens.

    Rows are rendered with `row_to_string` and appended (each followed by a
    delimiter line) for as long as the accumulated text stays below
    `max_total_tokens`.

    Returns a tuple `(app_str, sliced_df)`: the accumulated text and the
    leading rows of `df` that went into it. When every row fits, the whole
    dataframe is returned (previously the function fell off the loop and
    returned None, which broke callers that unpack the result).
    """
    delimiter = "\n================\n"
    app_str = ""

    for i, row in enumerate(df.itertuples()):
        row_str = row_to_string(row)
        # NOTE: the full accumulated text is re-tokenized for every row, so the
        # cost grows quadratically with the number of rows that fit.
        num_tokens = count_tokens(f"{app_str}{delimiter}{row_str}")
        if num_tokens >= max_total_tokens:
            # Adding this row would hit the limit: stop before it.
            return app_str, df[:i]
        app_str += f"{row_str}{delimiter}"

    # The limit was never reached: every row is part of the slice.
    return app_str, df[:]


def get_names(df: pd.DataFrame):
    """Return the values of the `name` column, lower-cased, in row order."""
    lowered = []
    for app_name in df['name'].tolist():
        lowered.append(app_name.lower())
    return lowered


def get_biggest_apps(df: pd.DataFrame, top_n: int = 5):
    """
    Return the lower-cased names of the `top_n` apps with the largest `size`,
    biggest first.
    """
    largest_first = df.sort_values(by='size', ascending=False)
    return get_names(largest_first.iloc[:top_n])


def get_most_ratings(df: pd.DataFrame, top_n: int = 5):
    """
    Return the lower-cased names of the `top_n` apps with the highest
    `rating_count_tot`, most-rated first.
    """
    most_rated_first = df.sort_values(by='rating_count_tot', ascending=False)
    return get_names(most_rated_first.iloc[:top_n])


def get_best_rated(df: pd.DataFrame, top_n: int = 5):
    """
    Get the top n apps by user rating, breaking ties by rating count.

    `user_rating` is the primary sort key and `rating_count_tot` the
    tie-breaker (both descending). Sorting by rating count first, as before,
    made this ranking practically identical to `get_most_ratings`.

    Returns the lower-cased app names, best rated first.
    """
    sorted_df = df.sort_values(
        by=['user_rating', 'rating_count_tot'], ascending=False)
    sliced_df = sorted_df[:top_n]
    # Return list of app names
    return get_names(sliced_df)


def get_accurate_report(tokens: int, top_n=5):
    """Compute the ground-truth rankings for a `tokens`-sized slice of `new_df`.

    The module-level `new_df` is cut with `slice_df_by_tokens`, then each
    ranking helper is applied to that slice. Returns a dict with the keys
    "biggest_apps", "most_rated_apps" and "best_rated_apps", each holding a
    list of `top_n` lower-cased app names.
    """
    sliced_df = slice_df_by_tokens(new_df, tokens)[1]
    ranking_fns = (
        ("biggest_apps", get_biggest_apps),
        ("most_rated_apps", get_most_ratings),
        ("best_rated_apps", get_best_rated),
    )
    return {key: rank(sliced_df, top_n) for key, rank in ranking_fns}

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.messages import SystemMessage, HumanMessage
from typing import List


class ReportForm(BaseModel):
    """A report of the biggest apps, the most rated apps, and the best rated apps"""

    # NOTE(review): this model is handed to `with_structured_output` in
    # `llm_parse_report`. The class docstring and the Field descriptions are
    # presumably forwarded to the LLM as the output schema, so treat them as
    # prompt text - confirm before rewording.

    # Names parsed from the "## Biggest Apps" section, in the order reported.
    biggest_apps: List[str] = Field(
        description="The list of app names in the ## Biggest Apps section of the report"
    )

    # Disabled: the experiment currently only asks for and scores the
    # "biggest_apps" section. Re-enable these together with the matching
    # prompt sections and evaluator `sections` entries.
    # most_rated_apps: List[str] = Field(
    #     description="The list of app names in the ## Most Rated Apps section of the report"
    # )
    # best_rated_apps: List[str] = Field(
    #     description="The list of app names in the ## Best Rated Apps section of the report"
    # )


def llm_parse_report(text: str, max_tokens=2000, temperature: float = 0.0):
    """Use an LLM to parse the names from the LLM output to an array

    Args:
        text: The free-form report produced by the model under test.
        max_tokens: Completion token limit passed to the parsing model.
        temperature: Sampling temperature of the parsing model. Defaults to 0
            because this step is pure extraction that feeds the evaluators:
            sampling noise here would show up as scoring noise.

    Returns:
        Whatever the structured-output chain returns for `ReportForm`
        (callers should be prepared for a missing result).
    """
    llm = ChatOpenAI(model="gpt-4o", temperature=temperature,
                     max_tokens=max_tokens).with_structured_output(ReportForm)
    system_prompt = """The user was tasked with analyzing and writing an analysis of the biggest apps, the most rated apps, and the best rated apps. \
Your job is to parse the user's response and return exactly the names of the biggest apps from the report \
in the order that they are mentioned. Be very exact! The integrity of your response is very important. 

- If you are off by a single character or line item, you will be penalized!! 
- If the a section of the report doesnt exist or include anything, use an empty array.
- Only include the names of the apps in the array. If the name includes descriptions or any other information, exclude that and only include the App Name
"""
    response: ReportForm = llm.invoke(
        [SystemMessage(content=system_prompt), HumanMessage(content=text)])
    return response

## Experiment

### Dataset

In [None]:
dataset_name = "1M Token Steps"


def make_dataset(increments=20):
    """Make and fill dataset if it doesnt exist

    Creates one example per token budget, stepping from
    `1_000_000 // increments` up to 1,000,000 tokens. Every example is tagged
    with the "base" split plus:

    - "even"   for every second example (an even multiple of the step size),
    - "tiny"   for the first example,
    - "small"  for the first 2 examples,
    - "medium" for the first 5 examples,
    - "large"  for the first 10 examples.

    If a dataset named `dataset_name` already exists it is returned untouched.

    Raises:
        ValueError: if a new dataset would have to be built with `increments`
            outside 1..1,000,000. Such values make the step zero or negative,
            which previously sent the fill loop into an endless run of
            `create_example` calls (or produced an empty dataset).
    """
    if client.has_dataset(dataset_name=dataset_name):
        return client.read_dataset(dataset_name=dataset_name)

    # 1m token length
    max_tokens = 1000000

    # Validate before creating anything so a bad argument leaves no empty dataset behind.
    if not 1 <= increments <= max_tokens:
        raise ValueError(
            f"increments must be between 1 and {max_tokens}, got {increments!r}")

    dataset = client.create_dataset(
        dataset_name=dataset_name, description="Steps for 1M token length")

    steps = max_tokens // increments

    size = steps
    i = 0
    while size <= max_tokens:
        splits = ["base"]
        # i is zero-based, so odd i means size is an even multiple of `steps`.
        if i % 2 != 0:
            splits.append("even")
        if i < 1:
            splits.append("tiny")
        if i < 2:
            splits.append("small")
        if i < 5:
            splits.append("medium")
        if i < 10:
            splits.append("large")

        client.create_example(
            inputs={"tokens": size}, split=splits, dataset_name=dataset.name)

        size += steps
        i += 1

    return dataset


# Create and fill the dataset, or fetch it if one with this name already exists,
# then show it as the cell output.
dataset = make_dataset()
dataset

Dataset(name='1M Token Steps', description='Steps for 1M token length', data_type=<DataType.kv: 'kv'>, id=UUID('ada4a79c-e1e1-4b51-adbb-86f112be6c7a'), created_at=datetime.datetime(2024, 6, 18, 20, 3, 52, 747121, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 18, 20, 3, 52, 747121, tzinfo=datetime.timezone.utc), example_count=20, session_count=16, last_session_start_time=datetime.datetime(2024, 6, 18, 23, 11, 49, 681935))

## Evaluation

In [None]:
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI

# Number of apps the model is asked to list per report section. The evaluators
# also divide the number of correct answers by this value.
top_n = 5

# --- Disabled prompt fragments ---
# The comment lines below are pieces of the `predict` prompt for the
# "most rated" and "best rated" sections. They are kept for reference while
# only the "biggest apps" section is requested and scored.
# 2. Top {top_n} Most Rated Apps - List of the names of the top {top_n} most rated apps in the dataset in descending order by highest Rating Count.
# 3. Top {top_n} Best Rated Apps - List of the names of the top {top_n} best rated apps in the dataset in descending order by highest User Rating and \

# ## Top {top_n} Most Rated Apps by Rating Count
# 1. AppName
# 2. ...

# ## Top {top_n} Best Rated Apps by User Rating and Rating Count
# 1. AppName
# 2. ...


def predict(inputs: dict):
    """Ask Gemini for a "biggest apps" report over a token-limited data slice.

    Args:
        inputs: Example inputs; `inputs["tokens"]` is the token budget used to
            slice `new_df`.

    Returns:
        A dict with
        - "report": the parsed report as a plain dict, or None when the parsing
          model returned nothing (the evaluators score that as "No report provided"),
        - "expected_report": the ground-truth rankings for the same slice,
          computed with the same `top_n` the prompt asks for.
    """
    token_count: int = inputs["tokens"]
    app_data_str, df = slice_df_by_tokens(new_df, token_count)
    system_prompt = f"""You are an expert data analyzer. Your job is to review the content below in <APP STORE DATA> and produce a report that \
includes the following sections:

1. Top {top_n} Biggest Apps - List the names of the top {top_n} biggest apps in the dataset in descending order by Size.

You will be graded on precision, recall, and order correctness. Be sure to be as accurate as possible or you will be penalized!! ONLY use the \
data provided in the <APP STORE DATA> section. Do not make up any data or use any other data or knowledge to make your report or you will be PENALIZED! \
ONLY provide the name of the app

Here is the format you should respond in:

```
# Thoughts
[Think through your reasoning. Make decisions here and write about your through process step by step to reach a conclusion about the data. Deliberate on \
your options within the data. We need to know how you came to your conclusion, be verbose and mention real data points.]

# App Store Data Report

## Top {top_n} Biggest Apps Ordered by Size
1. AppName
2. ...
```

<APP STORE DATA>
{app_data_str}
</APP STORE DATA>"""
    report_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
    response = report_llm.invoke(system_prompt)
    print("Gemini Responded!")
    report: ReportForm = llm_parse_report(response.content)
    print(response.content)
    # Pass top_n explicitly: the default of 5 would disagree with the prompt
    # whenever the notebook-level top_n is changed.
    expected_report = get_accurate_report(token_count, top_n)
    # The parser may come back empty-handed; surface that as a missing report
    # instead of crashing on `.dict()`.
    parsed_report = report.dict() if report is not None else None
    return {"report": parsed_report, "expected_report": expected_report}


# predict({"tokens": 4000})

  from .autonotebook import tqdm as notebook_tqdm


### Evaluation Functions

In [None]:
from langsmith.schemas import Example, Run

# Report sections that are scored; must match the sections requested in the prompt.
sections = ["biggest_apps"]

# Fall back to 5 when top_n is unset or falsy. A plain `if not top_n` raises
# NameError when this cell runs before top_n is defined, so look it up defensively.
top_n = globals().get("top_n") or 5

# Total number of names a complete report contains across all scored sections.
total_answers = top_n * len(sections)


def get_score_card():
    """Build a fresh scorecard: per section, a correct counter and a list of misses."""
    section_names = ("biggest_apps", "most_rated_apps", "best_rated_apps")
    return {
        section_name: {"correct": 0, "incorrect_answers": []}
        for section_name in section_names
    }


def precision(root_run: Run, example: Example) -> dict:
    """The LLM's ability to only recall real titles from the expected list

    For every scored section, counts how many reported names appear in the
    expected top-`top_n` list (case-insensitive) and divides by `top_n`.

    Returns `{"results": [...]}` with one `{"score", "key", "comment"}` entry
    per section, or a single zero score when the run produced no report.
    """
    # outputs may be missing altogether (e.g. when the run failed); treat that
    # like an absent report instead of raising.
    report = (root_run.outputs or {}).get("report")
    tokens = example.inputs["tokens"]

    if not report:
        return {"score": 0, "key": "precision", "comment": "No report provided"}

    # Use the same top_n as the prompt and the score denominator below.
    expected_report = get_accurate_report(tokens, top_n)
    scorecard = get_score_card()

    # For every name in each section of the report. If that name exists in that section of the expected report, its correct
    for section in sections:
        # A section that is absent or None counts as an empty answer list.
        for name in report.get(section) or []:
            if name.lower() in expected_report[section]:
                scorecard[section]["correct"] += 1
            else:
                scorecard[section]["incorrect_answers"].append(name)

    scores = []
    for section in sections:
        final_score = scorecard[section]["correct"] / top_n
        # make comment
        if scorecard[section]["incorrect_answers"]:
            comment = f"Incorrect Answers in {section}: {', '.join(scorecard[section]['incorrect_answers'])}"
        else:
            comment = None

        scores.append(
            {"score": final_score, "key": "precision", "comment": comment})

    return {"results": scores}


def recall(root_run: Run, example: Example) -> dict:
    """The LLM's ability to recall all real titles from the expected list

    For every scored section, counts how many of the expected top-`top_n`
    names appear in the report (case-insensitive) and divides by `top_n`.

    Returns `{"results": [...]}` with one `{"score", "key", "comment"}` entry
    per section, or a single zero score when the run produced no report.
    """
    # outputs may be missing altogether (e.g. when the run failed); treat that
    # like an absent report instead of raising.
    report = (root_run.outputs or {}).get("report")
    tokens = example.inputs["tokens"]

    if not report:
        # Report the failure under this evaluator's own key (it used to be
        # mislabelled as "precision").
        return {"score": 0, "key": "recall", "comment": "No report provided"}

    # Use the same top_n as the prompt and the score denominator below.
    expected_report = get_accurate_report(tokens, top_n)
    scorecard = get_score_card()

    for section in sections:
        # Lower-case the reported names once per section; expected names are
        # already lower-cased by get_names.
        lowered_section = [n.lower() for n in report.get(section) or []]
        for name in expected_report[section]:
            if name in lowered_section:
                scorecard[section]["correct"] += 1
            else:
                scorecard[section]["incorrect_answers"].append(name)

    scores = []
    for section in sections:
        final_score = scorecard[section]["correct"] / top_n
        # make comment
        if scorecard[section]["incorrect_answers"]:
            comment = f"Missed Apps in {section}: {', '.join(scorecard[section]['incorrect_answers'])}"
        else:
            comment = None
        scores.append(
            {"score": final_score, "key": "recall", "comment": comment})

    return {"results": scores}


def order(root_run: Run, example: Example) -> dict:
    """The LLM's ability to order the titles correctly

    For every scored section, walks the expected list from the top and counts
    how many leading positions the report gets right (case-insensitive),
    stopping at the first mismatch. The count is divided by `top_n`.

    Returns `{"results": [...]}` with one `{"score", "key", "comment"}` entry
    per section, or a single zero score when the run produced no report.
    """
    # outputs may be missing altogether (e.g. when the run failed); treat that
    # like an absent report instead of raising.
    report = (root_run.outputs or {}).get("report")
    tokens = example.inputs["tokens"]

    if not report:
        # Report the failure under this evaluator's own key (it used to be
        # mislabelled as "precision").
        return {"score": 0, "key": "order", "comment": "No report provided"}

    # Use the same top_n as the prompt and the score denominator below.
    expected_report = get_accurate_report(tokens, top_n)
    scorecard = get_score_card()

    for section in sections:
        reported = [n.lower() for n in report.get(section) or []]
        for i, name in enumerate(expected_report[section]):
            # A report shorter than the expected list is a mismatch at the
            # first missing position, not an IndexError.
            if i < len(reported) and name.lower() == reported[i]:
                scorecard[section]["correct"] += 1
            else:
                scorecard[section]["incorrect_answers"].append(name)
                break

    scores = []
    for section in sections:
        final_score = scorecard[section]["correct"] / top_n
        if scorecard[section]["incorrect_answers"]:
            comment = f"First apps to be out of order in {section}: {', '.join(scorecard[section]['incorrect_answers'])}"
        else:
            comment = None
        scores.append(
            {"score": final_score, "key": "order", "comment": comment})

    return {"results": scores}

In [None]:
from langsmith import evaluate

# Run `predict` on the examples tagged with the "tiny" split (the smallest
# token budget created by make_dataset) and score each run with the three
# evaluators. max_concurrency=1 presumably processes one example at a time -
# confirm against the LangSmith docs before raising it.
evaluate(
    predict,
    data=client.list_examples(dataset_id=dataset.id, splits=["tiny"]),
    evaluators=[precision, recall, order],
    max_concurrency=1,
)

View the evaluation results for experiment: 'advanced-change-18' at:
https://smith.langchain.com/o/d967989d-4221-53db-b0a5-665b504acba2/datasets/ada4a79c-e1e1-4b51-adbb-86f112be6c7a/compare?selectedSessions=afe785c7-0699-4e2f-afd8-673796472ccc




0it [00:00, ?it/s]

Gemini Responded!
```
# Thoughts
I will go through the data and identify the App Size of each app. Then I will sort the apps by size in descending order, and list the top 5.

The top 5 app sizes are:
1. Flight Unlimited 2K16  Flight Simulator (2860.06 MB)
2. Broken Age  (2756.35 MB)
3. XCOM Enemy Within (3346.28 MB)
4. FINAL FANTASY  (3681.57 MB)
5. Bully Anniversary Edition (2301.59 MB)

# App Store Data Report

## Top 5 Biggest Apps Ordered by Size
1. XCOM Enemy Within
2. FINAL FANTASY 
3. Flight Unlimited 2K16  Flight Simulator
4. Broken Age 
5. Bully Anniversary Edition
```


1it [01:01, 61.54s/it]


<ExperimentResults advanced-change-18>