In [1]:
# general
from datetime import datetime
from dotenv import load_dotenv, find_dotenv
import os

# custom local libs
from function import baseball_lambda

Python REPL can execute arbitrary code. Use with caution.


In [2]:
# Locate a .env file with find_dotenv() and load it via load_dotenv();
# the return value is bound to `_` so the cell displays nothing.
_ = load_dotenv(dotenv_path=find_dotenv())

In [3]:
# Set the Langsmith project name via the LANGCHAIN_PROJECT environment variable.
# The name carries a YYYYMMDD stamp of the current local date, so runs on
# different days end up under different project names.
today = f"{datetime.now():%Y%m%d}"
os.environ["LANGCHAIN_PROJECT"] = "Baseball Curveballs - " + today

In [4]:
# Identifier passed to every execute_workflow call in this notebook.
# NOTE(review): presumably this is what lets later calls continue the same
# conversation - confirm against baseball_lambda.
session_id = "4911"

In [5]:
# Opening request for this session: ask the workflow for the three pitchers whose
# curveballs were most similar to Max Scherzer's in the first week of August 2020.
# The prompt text is sent to the workflow as-is, so its wording is kept exactly.
task = (
    "Consider the first week of August 2020 - "
    "find 3 pitchers who's curveballs were most similar to Max Scherzer's."
)
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Plan'}}
----
Distance to neareast plan: 0.5457633137702942
Formulating a new plan based on User input
Collecting metadata for functions playerid_lookup,statcast_pitcher,statcast
Modifying plan with function metadata
{'Plan': {'messages': [HumanMessage(content="Here is the updated plan to find the 3 pitchers whose curveballs were most similar to Max Scherzer's during the first week of August 2020:\n\n1. Use the `playerid_lookup` function to get the 'key_mlbam' ID for Max Scherzer by passing in his last name 'scherzer' and first name 'max'.\n\n2. Use the `statcast` function to get pitch-level data for all games between '2020-08-01' and '2020-08-07'. \n\n3. From the output, filter to only include rows where:\n   - The 'pitch_type' is 'CU' (curveball)\n   - The 'pitcher' value is NOT equal to Scherzer's 'key_mlbam' ID\n\n4. Calculate summary statistics like average 'release_speed', 'release_spin', 'pfx_x' (horizontal movement), and 'pfx_z' (vertical movement) for 

In [6]:
# Follow-up message in the same session: steer the plan towards
# per-pitcher average vectors -> kNN model -> 3 nearest pitchers.
# The message starts and ends with a newline, exactly like the original
# triple-quoted literal.
task = (
    "\n"
    "make sure the plan follows this general flow:\n"
    "1) create an average vector for each pitcher's curveball.\n"
    "2) Train a knn model on this data\n"
    "3) Use this model to find the 3 pitchers with the most similar curveball (not including Max Scherzer himself)\n"
)
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Revise'}}
----
{'Revise': {'messages': [HumanMessage(content="Thank you for the additional guidance. Here is an updated plan that follows the general flow you outlined:\n\n1. Use the `playerid_lookup` function to get the 'key_mlbam' ID for Max Scherzer by passing in his last name 'scherzer' and first name 'max'.\n\n2. Use the `statcast` function to get pitch-level data for all games between '2020-08-01' and '2020-08-07'.\n\n3. From the output, filter to only include rows where the 'pitch_type' is 'CU' (curveball).\n\n4. Create a feature matrix X containing the following columns for each curveball:\n   - 'release_speed'\n   - 'release_spin'  \n   - 'pfx_x' (horizontal movement)\n   - 'pfx_z' (vertical movement)\n\n5. Create a target vector y containing the 'pitcher' value for each row.\n\n6. Remove the row for Max Scherzer's curveballs from X and y.\n\n7. Fit a k-Nearest Neighbors model on X and y to learn a representation of each pitcher's curveball.\n\n8. Cre

In [7]:
# Approve the revised plan. In the recorded run, the orchestrator responded to
# this message by routing to its 'Convert' step.
task = "approved"
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Convert'}}
----
{'Convert': {'code': '\n# Import necessary libraries\nimport pandas as pd\nfrom pybaseball import playerid_lookup, statcast\nfrom sklearn.neighbors import NearestNeighbors\n\n# 1. Get Max Scherzer\'s \'key_mlbam\' ID\nscherzer_id = playerid_lookup(\'scherzer\', \'max\')[\'key_mlbam\'].values[0]\n\n# 2. Get pitch-level data for 2020-08-01 to 2020-08-07\ndata = statcast(\'2020-08-01\', \'2020-08-07\')\n\n# 3. Filter for curveballs only\ncurveballs = data[data[\'pitch_type\'] == \'CU\']\n\n# 4. Create feature matrix X\nX = curveballs[[\'release_speed\', \'release_spin_rate\', \'pfx_x\', \'pfx_z\']]\n\n# 5. Create target vector y\ny = curveballs[\'pitcher\']\n\n# 6. Remove Scherzer\'s curveballs from X and y\nX = X[y != scherzer_id]\ny = y[y != scherzer_id]\n\n# 7. Fit kNN model\nknn = NearestNeighbors(n_neighbors=3)\nknn.fit(X, y)\n\n# 8. Create average feature vector for Scherzer\'s curveballs\nscherzer_curveballs = curveballs[curveballs[\'pitche

  0%|          | 0/7 [00:00<?, ?it/s]

Result: Gathering player lookup table. This may take a moment.

Executing: # 2. Get pitch-level data for 2020-08-01 to 2020-08-07
data = statcast('2020-08-01', '2020-08-07')


100%|██████████| 7/7 [00:16<00:00,  2.33s/it]


Result: This is a large query, it may take a moment to complete

Executing: # 3. Filter for curveballs only
curveballs = data[data['pitch_type'] == 'CU']
Executing: # 4. Create feature matrix X
X = curveballs[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z']]
Executing: # 5. Create target vector y
y = curveballs['pitcher']
Executing: # 6. Remove Scherzer's curveballs from X and y
X = X[y != scherzer_id]
y = y[y != scherzer_id]
Executing: # 7. Fit kNN model
knn = NearestNeighbors(n_neighbors=3)
knn.fit(X, y)
Result: ValueError('Input X contains NaN.\nNearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/im

In [8]:
# Feedback on the previous result: ask for pitcher names rather than the
# index values that were returned.
task = "I want the names of the pitchers, you have returned their index"
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Revise'}}
----
{'Revise': {'messages': [HumanMessage(content="Thank you for the feedback. Here is an updated plan that returns the pitcher names instead of just their IDs:\n\n1. Use the `playerid_lookup` function to get the 'key_mlbam' ID for Max Scherzer by passing in his last name 'scherzer' and first name 'max'.\n\n2. Use the `statcast` function to get pitch-level data for all games between '2020-08-01' and '2020-08-07'. \n\n3. From the output, filter to only include rows where the 'pitch_type' is 'CU' (curveball).\n\n4. Create a feature matrix X containing the following columns for each curveball:\n   - 'release_speed'\n   - 'release_spin'\n   - 'pfx_x' (horizontal movement)  \n   - 'pfx_z' (vertical movement)\n\n5. Create a target vector y containing the 'player_name' value for each row.\n\n6. Remove the rows for Max Scherzer's curveballs from X and y.\n\n7. Fit a k-Nearest Neighbors model on X and y to learn a representation of each pitcher's curveball. 

In [9]:
# Confirm the updated plan. In the recorded run, the orchestrator responded to
# this message by routing to its 'Convert' step.
task = "yes"
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Convert'}}
----


  0%|          | 0/7 [00:00<?, ?it/s]

{'Convert': {'code': '\n# Import necessary libraries\nimport pandas as pd\nfrom pybaseball import playerid_lookup, statcast\nfrom sklearn.neighbors import NearestNeighbors\n\n# 1. Get Max Scherzer\'s \'key_mlbam\' ID\nscherzer_id = playerid_lookup(\'scherzer\', \'max\')[\'key_mlbam\'].values[0]\n\n# 2. Get pitch-level data for 2020-08-01 to 2020-08-07\npitches = statcast(\'2020-08-01\', \'2020-08-07\')\n\n# 3. Filter for curveballs only\ncurveballs = pitches[pitches[\'pitch_type\'] == \'CU\']\n\n# 4. Create feature matrix X\nX = curveballs[[\'release_speed\', \'release_spin_rate\', \'pfx_x\', \'pfx_z\']]\n\n# 5. Create target vector y with pitcher names\ny = curveballs[\'player_name\']\n\n# 6. Remove Scherzer\'s curveballs from X and y\nscherzer_mask = curveballs[\'player_name\'] != \'Max Scherzer\'\nX_other = X[scherzer_mask]\ny_other = y[scherzer_mask]\n\n# 7. Fit kNN model on other pitchers\' curveballs\nknn = NearestNeighbors(n_neighbors=3)\nknn.fit(X_other, y_other)\n\n# 8. Create

100%|██████████| 7/7 [00:05<00:00,  1.33it/s]


Result: This is a large query, it may take a moment to complete

Executing: # 3. Filter for curveballs only
curveballs = pitches[pitches['pitch_type'] == 'CU']
Executing: # 4. Create feature matrix X
X = curveballs[['release_speed', 'release_spin_rate', 'pfx_x', 'pfx_z']]
Executing: # 5. Create target vector y with pitcher names
y = curveballs['player_name']
Executing: # 6. Remove Scherzer's curveballs from X and y
scherzer_mask = curveballs['player_name'] != 'Max Scherzer'
X_other = X[scherzer_mask]
y_other = y[scherzer_mask]
Executing: # 7. Fit kNN model on other pitchers' curveballs
knn = NearestNeighbors(n_neighbors=3)
knn.fit(X_other, y_other)
Result: ValueError('Input X contains NaN.\nNearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for i

In [10]:
# Accept the outcome. In the recorded run, the orchestrator responded to this
# message by routing to its 'Memorize' step.
task = "yes, thank you"
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Memorize'}}
----
{'Memorize': {'messages': [AIMessage(content='Thank you, task has been commited to memory')], 'previous_node': 'Memorize'}}
----
Thank you, task has been commited to memory
