In [1]:
from langchain_experimental.tools import PythonREPLTool
import text

# Create the Python REPL tool that will execute the LLM-generated code.
python_repl = PythonREPLTool()

# Prime the REPL: run a snippet through the tool that tells Python's warnings
# module to ignore warnings (original intent: "initiate python_repl to ignore warnings").
python_repl.invoke('import warnings\nwarnings.simplefilter("ignore")')

Python REPL can execute arbitrary code. Use with caution.


''

In [2]:
from dotenv import load_dotenv, find_dotenv
import re

from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableLambda
from langchain_community.chat_models import BedrockChat

# read local .env file
# NOTE(review): presumably this supplies the AWS credentials/region Bedrock needs — confirm.
_ = load_dotenv(find_dotenv()) 

# define language model
# Claude 3 Sonnet is the active model; the Haiku id is kept as a commented-out alternative.
model_id = 'anthropic.claude-3-sonnet-20240229-v1:0'
#model_id = 'anthropic.claude-3-haiku-20240307-v1:0'
# temperature is pinned to 0 through model_kwargs.
llm = BedrockChat(model_id=model_id, model_kwargs={'temperature': 0})

# set a distance threshold for when to create a new plan vs modify an existing plan
# NOTE(review): `threshold` is not referenced anywhere else in the visible notebook.
threshold = .5


def extract_text_between_markers(text):
    '''Return the body of the first ```python fenced block in a model reply.

    Args:
        text: Either a chat message object that exposes the reply as
            ``.content`` (what the LLM step of the chain emits) or the reply
            itself as a plain string.  Note that inside this function the
            parameter name shadows the imported ``text`` module.

    Returns:
        The text between the first ```python marker and the next ``` marker,
        including the newlines that surround the code.

    Raises:
        IndexError: If the reply contains no ```python fenced block.  The
            exception type matches what the bare ``matches[0]`` lookup used
            to raise, but the message now says what actually went wrong.
    '''
    start_marker = '```python'
    end_marker = '```'

    # Accept a bare string as well as a message object so the helper can be
    # used outside the chain (e.g. on saved replies).
    content = text if isinstance(text, str) else text.content

    # DOTALL lets the non-greedy group span multiple lines of code.
    pattern = re.compile(f'{re.escape(start_marker)}(.*?){re.escape(end_marker)}', re.DOTALL)
    match = pattern.search(content)
    if match is None:
        raise IndexError(
            'no ```python fenced code block found in model output: '
            + repr(content[:200])
        )
    return match.group(1)


# System prompt for the plan-to-code conversion chain.
# The {function_detail}, {task} and {plan} placeholders are bound with .partial(...)
# when the prompt template is built.  The fenced example under rule 5 uses the same
# ```python ... ``` markers that extract_text_between_markers searches for.
CONVERT_SYSTEM_PROMPT = '''<instructions>You are a highly skilled Python programmer.  Your goal is to help a user execute a plan by writing code for a Python REPL.</instructions>

Text between the <function_detail></function_detail> tags is documentation on the functions in use.  Do not attempt to use any feature that is not explicitly listed in the data dictionary for that function.
<function_detail> 
{function_detail}
</function_detail>

Text between the <task></task> tags is the goal of the plan.
<task>
{task}
</task>

Text between the <plan></plan> tags is the entire plan that will be executed.
<plan>
{plan}
</plan>

Text between the <rules></rules> tags are rules that must be followed.
<rules>
1. Import all necessary libraries at the start of your code.
2. Always assign the result of a pybaseball function call to a variable.
3. When writing code for the last step in the plan, always use print() to write a detailed summary of the results.
4. Never write functions
5. Return all python code between three tick marks like this:
```python
python code goes here
```
6. Comment your code liberally to be clear about what is happening and why.
</rules>
'''

In [3]:
# Assemble the chat prompt: the system instructions plus a slot for the
# running conversation.
convert_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", CONVERT_SYSTEM_PROMPT),
        MessagesPlaceholder(variable_name="messages"),
    ]
)
# Bind the inputs that stay fixed for the whole run, so only `messages`
# has to be supplied on each call.
convert_prompt_template = convert_prompt_template.partial(
    function_detail=text.function_detail,
    task=text.task,
    plan=text.plan,
)

# prompt -> model -> pull the fenced python code out of the reply
convert_chain = convert_prompt_template | llm | RunnableLambda(extract_text_between_markers)

# The same request text is re-sent every time another step is needed.
convert_message = 'Convert the next step of the plan into code that can be executed in a Python REPL.'

# Open the conversation with the first request.
messages = [HumanMessage(content=convert_message)]

# Ask the chain for the first step's code.
code = convert_chain.invoke({'messages': messages})

In [4]:
# Print the generated code so it can be reviewed before it is executed.
print(code)


# Import necessary libraries
from pybaseball import statcast
import pandas as pd

# Get all pitch data from 2020-08-01 to 2020-08-07
all_pitches = statcast('2020-08-01', '2020-08-07')



In [5]:
# Execute the generated code with the REPL tool and keep the text it returns.
result = python_repl.invoke(code)

100%|██████████| 7/7 [00:04<00:00,  1.62it/s]


In [6]:
# Display the string the REPL tool returned for the run above.
result

'This is a large query, it may take a moment to complete\n'

In [7]:
# No-op cell: the triple-quoted string below is never assigned, so nothing is
# defined here.  It apparently records what step 2 of the plan looks like, for reference.
'''
step = """2. Filter for just curveballs:
all_curves = all_pitches[all_pitches['pitch_type'] == 'CU']"""
'''

'\nstep = """2. Filter for just curveballs:\nall_curves = all_pitches[all_pitches[\'pitch_type\'] == \'CU\']"""\n'

In [8]:
# Log the finished step (its code and the REPL output) as the assistant turn,
# then queue the standard "convert the next step" request as the user turn.
messages.extend([
    AIMessage(content=f'The previous step completed successfully with the following code:\n\n```python\n{code}\n```\n\nHere was the result: {result}'),
    HumanMessage(content=convert_message),
])

# Generate code for the next plan step from the updated conversation.
code = convert_chain.invoke({'messages': messages})

In [9]:
# Review the code generated for the next step.
print(code)


# Filter for just curveballs
all_curves = all_pitches[all_pitches['pitch_type'] == 'CU']



In [10]:
# Execute it with the same python_repl tool object used for the earlier steps.
result = python_repl.invoke(code)

In [11]:
# Print what the REPL returned for this step.
print(result)




In [12]:
# Append the outcome of the step that just ran, followed by the request for
# the next step, to the shared conversation history.
messages.extend([
    AIMessage(content=f'The previous step completed successfully with the following code:\n\n```python\n{code}\n```\n\nHere was the result: {result}'),
    HumanMessage(content=convert_message),
])

# Ask the chain to turn the next plan step into code.
code = convert_chain.invoke({'messages': messages})

In [13]:
# Review the generated code before running it.
print(code)


# Create a feature vector for each pitcher's curveballs
pitcher_curves = all_curves.groupby('pitcher')
pitcher_features = pitcher_curves[['release_speed', 'release_spin', 'pfx_x', 'pfx_z']].mean().reset_index()



In [14]:
# Execute the generated code; the returned text is kept in `result` for inspection.
result = python_repl.invoke(code)

In [15]:
# Inspect the REPL output: a failure surfaces here as returned error text,
# not as an exception raised in the notebook.
result

'KeyError("Columns not found: \'release_spin\'")'

In [16]:
# The REPL tool reported the failure as a returned string (nothing was raised
# in the notebook), so feed that text back to the model together with the
# code that produced it.
messages.append(AIMessage(content=f'The previous step reached an error with the following code:\n\n```python\n{code}\n```\n\nHere was the error: {result}'))
# Plain literal: this request has no placeholders, so it needs no f-prefix.
messages.append(HumanMessage(content='What information would be useful in order to troubleshoot this error?  Write Python code that can be executed in a python repl to confirm this information.'))

# Ask the chain for diagnostic code instead of the next plan step.
code = convert_chain.invoke({'messages': messages})

In [17]:
# Review the troubleshooting code the model proposed.
print(code)


print(all_curves.columns)



In [18]:
# Run the troubleshooting code with the REPL tool.
result = python_repl.invoke(code)

In [19]:
# Print the diagnostic output so it can be read before it is passed back to the model.
print(result)

Index(['pitch_type', 'game_date', 'release_speed', 'release_pos_x',
       'release_pos_z', 'player_name', 'batter', 'pitcher', 'events',
       'description', 'spin_dir', 'spin_rate_deprecated',
       'break_angle_deprecated', 'break_length_deprecated', 'zone', 'des',
       'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type',
       'hit_location', 'bb_type', 'balls', 'strikes', 'game_year', 'pfx_x',
       'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
       'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y',
       'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2', 'umpire', 'sv_id',
       'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot',
       'hit_distance_sc', 'launch_speed', 'launch_angle', 'effective_speed',
       'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1',
       'fielder_2.1', 'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6',
       'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
       'estima

In [20]:
# Hand the diagnostic code and its output back to the model as the assistant
# turn, then repeat the standard "convert the next step" request.
messages.extend([
    AIMessage(content=f'The following code was executed to help troubleshoot this error:\n\n```python\n{code}\n```\n\nHere is the result:\n\n{result}'),
    HumanMessage(content=convert_message),
])

# Regenerate code now that the conversation includes the diagnostic output.
code = convert_chain.invoke({'messages': messages})

In [21]:
# Review the code generated after the troubleshooting round.
print(code)


# Create a feature vector for each pitcher's curveballs 
pitcher_curves = all_curves.groupby('pitcher')
pitcher_features = pitcher_curves[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z']].mean().reset_index()



In [22]:
# Execute the newly generated code with the REPL tool.
result = python_repl.invoke(code)

In [23]:
# Print whatever the REPL returned for this run.
print(result)




In [24]:
# Record the successful step (code plus REPL output) and request the next one.
messages.extend([
    AIMessage(content=f'The previous step completed successfully with the following code:\n\n```python\n{code}\n```\n\nHere was the result: {result}'),
    HumanMessage(content=convert_message),
])

# Convert the next plan step into code.
code = convert_chain.invoke({'messages': messages})

In [25]:
# Review the generated code for this step.
print(code)


# Import necessary libraries
from pybaseball import playerid_lookup

# Get Max Scherzer's player ID
scherzer_id = playerid_lookup('scherzer', 'max')['key_mlbam'][0]



In [26]:
# Run the generated code with the REPL tool and capture its returned text.
result = python_repl.invoke(code)

In [27]:
# Print the text the REPL returned for this step.
print(result)

Gathering player lookup table. This may take a moment.



In [28]:
# Add the completed step and the follow-up request to the conversation.
messages.extend([
    AIMessage(content=f'The previous step completed successfully with the following code:\n\n```python\n{code}\n```\n\nHere was the result: {result}'),
    HumanMessage(content=convert_message),
])

# Have the chain produce code for the next plan step.
code = convert_chain.invoke({'messages': messages})

In [29]:
# Review the generated code before executing it.
print(code)



# Get Scherzer's feature vector
scherzer_features = pitcher_features[pitcher_features['pitcher'] == scherzer_id]




In [30]:
# Execute the generated code with the REPL tool.
result = python_repl.invoke(code)

In [31]:
# Print the REPL's returned text for this step.
print(result)




In [32]:
# Report the last step's code and output back to the model, then ask again
# for the next step.
messages.extend([
    AIMessage(content=f'The previous step completed successfully with the following code:\n\n```python\n{code}\n```\n\nHere was the result: {result}'),
    HumanMessage(content=convert_message),
])

# Generate the next step's code.
code = convert_chain.invoke({'messages': messages})

In [33]:
# Review the code the chain returned.
print(code)


# Import scikit-learn and create a knn model
from sklearn.neighbors import NearestNeighbors
knn = NearestNeighbors(n_neighbors=4)
knn.fit(pitcher_features[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z']])



In [34]:
# Execute the generated code with the REPL tool and store the returned text.
result = python_repl.invoke(code)

In [35]:
# Print the returned text from the run above.
print(result)




In [36]:
# Extend the history with the finished step and a fresh "next step" request.
messages.extend([
    AIMessage(content=f'The previous step completed successfully with the following code:\n\n```python\n{code}\n```\n\nHere was the result: {result}'),
    HumanMessage(content=convert_message),
])

# Ask for the code of the following plan step.
code = convert_chain.invoke({'messages': messages})

In [37]:
# Review the generated code before it is run.
print(code)



# Find the 3 pitchers closest to Scherzer
distances, indices = knn.kneighbors(scherzer_features[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z']])
closest_indices = indices[0][1:4]
similar_pitchers = pitcher_features.iloc[closest_indices]




In [38]:
# Execute the generated code with the REPL tool.
result = python_repl.invoke(code)

In [40]:
# Print the REPL's returned text for this step.
print(result)




In [42]:
# The previously generated code assigned `similar_pitchers` but did not print it,
# so print the variable by hand through the REPL tool to see the answer.
result = python_repl.invoke('print(similar_pitchers)')

In [43]:
# Show the text of the `similar_pitchers` table that the REPL printed.
print(result)

     pitcher  release_speed  release_spin_rate     pfx_x     pfx_z
85    593833          74.55             2866.0     1.105     -1.01
199   664285      80.103333        2865.466667 -1.099333 -1.139333
122   608331      75.194118        2853.058824 -0.944118 -1.347647

