In [1]:
from langchain_experimental.tools import PythonREPLTool
import text

# define python repl
# NOTE: the tool executes arbitrary Python code (it prints a warning to that effect when
# this cell runs), so only hand it code you are prepared to run.
python_repl = PythonREPLTool()

# initiate python_repl to ignore warnings
# The snippet is sent to the REPL itself and installs an "ignore" warnings filter there --
# presumably so that later REPL results are not cluttered with warning text (confirm).
python_repl.invoke('import warnings\nwarnings.simplefilter("ignore")')

Python REPL can execute arbitrary code. Use with caution.


''

In [9]:
from dotenv import load_dotenv, find_dotenv
import re

from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableLambda
from langchain_community.chat_models import BedrockChat

# read local .env file (find_dotenv() supplies its path; the return value is discarded)
_ = load_dotenv(find_dotenv()) 

# define language model
# Model identifier passed to BedrockChat; the commented-out line below is an alternative
# model kept for quick switching.
model_id = 'anthropic.claude-3-sonnet-20240229-v1:0'
#model_id = 'anthropic.claude-3-haiku-20240307-v1:0'
# temperature is pinned to 0 -- presumably to keep the generated code as repeatable as possible (confirm)
llm = BedrockChat(model_id=model_id, model_kwargs={'temperature': 0})

# set a distance threshold for when to create a new plan vs modify an existing plan
# NOTE(review): not referenced by any cell visible in this part of the notebook -- confirm where it is used
threshold = .5


def extract_text_between_markers(text):
    '''Return the first ```python fenced code block found in a model reply.

    Args:
        text: Either the reply as a plain string, or a message-like object
            that exposes the reply through a ``content`` attribute (the kind
            of object the chat model in this notebook returns), so the helper
            can be piped directly after the model in a chain.

    Returns:
        The text between the opening ```python marker and the next ```
        marker, exactly as written (surrounding newlines are kept).

    Raises:
        IndexError: if the reply contains no ```python fenced block. The
            exception type matches the previous behaviour, but it now carries
            an explanatory message.
    '''
    start_marker = '```python'
    end_marker = '```'

    # Unwrap message objects; plain strings have no `content` attribute and pass through unchanged.
    content = getattr(text, 'content', text)

    # DOTALL lets '.' span newlines; the lazy group stops at the first closing fence.
    pattern = re.compile(f'{re.escape(start_marker)}(.*?){re.escape(end_marker)}', re.DOTALL)

    # Only the first block is needed, so stop at the first match instead of collecting all of them.
    match = pattern.search(content)
    if match is None:
        raise IndexError('no ```python code block found in the model response')
    return match.group(1)


# System prompt for the plan-to-code step.
# {function_detail}, {task} and {plan} are template variables; in this notebook they are
# bound with .partial() when the ChatPromptTemplate is built from this string.
# Rule 5 asks the model to answer with a ```python fenced block, which is the format
# extract_text_between_markers looks for.
CONVERT_SYSTEM_PROMPT = '''<instructions>You are a highly skilled Python programmer.  Your goal is to help a user execute a plan by writing code for a Python REPL.</instructions>

Text between the <function_detail></function_detail> tags is documentation on the functions in use.  Do not attempt to use any feature that is not explicitly listed in the data dictionary for that function.
<function_detail> 
{function_detail}
</function_detail>

Text between the <task></task> tags is the goal of the plan.
<task>
{task}
</task>

Text between the <plan></plan> tags is the entire plan that will be executed.
<plan>
{plan}
</plan>

Text between the <rules></rules> tags are rules that must be followed.
<rules>
1. Import all necessary libraries at the start of your code.
2. Always assign the result of a pybaseball function call to a variable.
3. When writing code for the last step in the plan, always use print() to write a detailed summary of the results.
4. Never write functions
5. Return all python code between three tick marks like this:
```python
python code goes here
```
6. Comment your code liberally to be clear about what is happening and why.
7. If the entire plan has been executed, return the results between <results></results> tags.
</rules>
'''

In [3]:
# Prompt = fixed system instructions + the running conversation supplied at invoke time.
# The three system-prompt variables are bound up front, so callers only pass "messages".
convert_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", CONVERT_SYSTEM_PROMPT),
        MessagesPlaceholder(variable_name="messages"),
    ]
).partial(
    function_detail=text.function_detail,
    task=text.task,
    plan=text.plan,
)

# The chain currently ends at the model, so it yields the model's message object.
# Optional parsing stage, left disabled: | RunnableLambda(extract_text_between_markers)
convert_chain = convert_prompt_template | llm

In [4]:
# Canned conversation history replayed to the model.
# Each HumanMessage asks for the next plan step (or for troubleshooting help); each AIMessage
# records the code that was run together with its result or error. The list ends on a
# HumanMessage, so the model's reply answers that final "next step" request.
messages = [HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.'),
 AIMessage(content="The previous step completed successfully with the following code:\n\n```python\n\n# Import necessary libraries\nfrom pybaseball import statcast, playerid_lookup\nimport pandas as pd\n\n# 1. Get all pitch data from 2020-08-01 to 2020-08-07\nall_pitches = statcast('2020-08-01', '2020-08-07')\n\n```\n\nHere was the result: This is a large query, it may take a moment to complete\n"),
 HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.'),
 AIMessage(content="The previous step completed successfully with the following code:\n\n```python\n\n# 2. Filter for just curveballs\nall_curves = all_pitches[all_pitches['pitch_type'] == 'CU']\n\n```\n\nHere was the result: "),
 HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.'),
 AIMessage(content='The previous step reached an error with the following code:\n\n```python\n\n# 3. Create a feature vector for each pitcher\'s curveballs\npitcher_curves = all_curves.groupby(\'pitcher\')\npitcher_features = pitcher_curves[[\'release_speed\', \'release_spin\', \'pfx_x\', \'pfx_z\']].mean().reset_index()\n\n```\n\nHere was the error: KeyError("Columns not found: \'release_spin\'")'),
 HumanMessage(content='What information would be useful in order to troubleshoot this error?  Write Python code that can be executed in a python repl to confirm this information.'),
 AIMessage(content="The following code was executed to help troubleshoot this error:\n\n```python\n\nprint(all_curves.columns)\n\n```\n\nHere is the result:\n\nIndex(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',\n       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',\n       'description', 'spin_dir', 'spin_rate_deprecated',\n       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',\n       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',\n       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',\n       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',\n       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',\n       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',\n       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',\n       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',\n       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',\n       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',\n       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',\n       'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle',\n       'woba_value', 'woba_denom', 'babip_value', 'iso_value',\n       'launch_speed_angle', 'at_bat_number', 'pitch_number', 'pitch_name',\n       'home_score', 'away_score', 'bat_score', 'fld_score', 'post_away_score',\n       'post_home_score', 'post_bat_score', 'post_fld_score',\n       'if_fielding_alignment', 'of_fielding_alignment', 'spin_axis',\n       'delta_home_win_exp', 'delta_run_exp'],\n      dtype='object')\n"),
 HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.'),
 AIMessage(content="The previous step completed successfully with the following code:\n\n```python\n\n# 3. Create a feature vector for each pitcher's curveballs \npitcher_curves = all_curves.groupby('pitcher')\npitcher_features = pitcher_curves[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z']].mean().reset_index()\n\n```\n\nHere was the result: "),
 HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.'),
 AIMessage(content="The previous step completed successfully with the following code:\n\n```python\n\n# 4. Get Max Scherzer's player ID\nfrom pybaseball import playerid_lookup\nscherzer_id = playerid_lookup('scherzer', 'max')['key_mlbam'][0]\n\n```\n\nHere was the result: Gathering player lookup table. This may take a moment.\n"),
 HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.'),
 AIMessage(content="The previous step completed successfully with the following code:\n\n```python\n\n# 5. Get Scherzer's feature vector\nscherzer_features = pitcher_features[pitcher_features['pitcher'] == scherzer_id]\n\n```\n\nHere was the result: "),
 HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.'),
 AIMessage(content="The previous step completed successfully with the following code:\n\n```python\n\n# 6. Import scikit-learn and create a knn model\nfrom sklearn.neighbors import NearestNeighbors\nknn = NearestNeighbors(n_neighbors=4)\nknn.fit(pitcher_features[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z']])\n\n```\n\nHere was the result: "),
 HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.'),
 AIMessage(content="The previous step completed successfully with the following code:\n\n```python\n\n\n# 7. Find the 3 pitchers closest to Scherzer\ndistances, indices = knn.kneighbors(scherzer_features[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z']])\nclosest_indices = indices[0][1:4]\nsimilar_pitchers = pitcher_features.iloc[closest_indices]\n\n# Print the pitcher IDs of the 3 most similar\nprint(similar_pitchers[['pitcher']])\n\n\n```\n\nHere was the result:      pitcher\n85    593833\n199   664285\n122   608331\n"),
 HumanMessage(content='Convert the next step of the plan into code that can be executed in a Python REPL.')
 ]

In [5]:
# invoke convert chain
# Only the conversation history is passed here; the other prompt variables were already
# bound with .partial() on the prompt template. `code` receives the model's message object
# (its text is read through `.content` in the next cell).
code = convert_chain.invoke({'messages':messages}) 

In [6]:
# Show the text of the model's reply (the `content` of the message returned by the chain).
print(code.content)

It appears the entire plan has been executed successfully. Here are the results:

<results>
The 3 pitchers whose curveballs were most similar to Max Scherzer's in the first week of August 2020 were:

Pitcher ID
85    593833
199   664285  
122   608331

To get the names associated with these IDs, you can use the playerid_lookup function again:

from pybaseball import playerid_lookup

print(playerid_lookup('', '', 593833))
print(playerid_lookup('', '', 664285)) 
print(playerid_lookup('', '', 608331))

This will print out the player names corresponding to each ID.
</results>
