In [1]:
# general
from datetime import datetime
from dotenv import load_dotenv, find_dotenv
import os

# custom local libs
from function import baseball_lambda

Python REPL can execute arbitrary code. Use with caution.


In [2]:
# Load variables from the .env file located by find_dotenv(); the return value
# is assigned to `_` so the cell displays nothing.
_ = load_dotenv(dotenv_path=find_dotenv())

In [3]:
# Name the LangSmith project for this run, suffixed with today's date (YYYYMMDD).
today = f"{datetime.now():%Y%m%d}"
os.environ["LANGCHAIN_PROJECT"] = "Baseball Curveballs - " + today

In [4]:
# Session key passed unchanged to every execute_workflow call in this notebook.
session_id = "4911"

In [5]:
# Turn 1: submit the initial request. In the recorded run the orchestrator
# routed this message to the 'Plan' step.
task = "Consider the first week of August 2020 - find 3 pitchers whose curveballs were most similar to Max Scherzer's."
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Plan'}}
----
Distance to neareast plan: 0.7576422095298767
Formulating a new plan based on User input
Collecting metadata for functions playerid_lookup,statcast_pitcher,statcast
Modifying plan with function metadata
{'Plan': {'messages': [HumanMessage(content='Updated Plan:\n\n1. Use the `statcast_pitcher` function from the pybaseball library.\n2. Pass in the following attributes:\n   - start_dt: "2020-08-01"\n   - end_dt: "2020-08-07"\n   - player_id: MLBAM player ID for Max Scherzer\n3. Retrieve the output dataframe from the function.\n4. Filter the dataframe to only include curveballs thrown by Max Scherzer.\n5. Extract the relevant fields from the dataframe:\n   - pitch_type\n   - release_speed\n   - release_spin\n   - release_extension\n6. Calculate a similarity metric between the curveballs thrown by Max Scherzer and other pitchers\' curveballs. This can be done using a combination of release_speed, release_spin, and release_extension.\n7. Rank the pitch

In [6]:
# Turn 2: feedback on the proposed plan. In the recorded run the orchestrator
# routed this message to the 'Revise' step.
task = (
    "\n"
    "make sure the plan follows this general flow:\n"
    "1) create an average vector for each pitcher's curveball.\n"
    "2) Train a knn model on this data\n"
    "3) Use this model to find the 3 pitchers with the most similar curveball (not including Max Scherzer himself)\n"
)
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Revise'}}
----
{'Revise': {'messages': [HumanMessage(content='Based on the feedback provided, here is the revised plan that incorporates the suggested steps of creating an average vector for each pitcher\'s curveball, training a KNN model, and using this model to find the 3 pitchers with the most similar curveball to Max Scherzer\'s, excluding Scherzer himself:\n\n1. Use the `playerid_lookup` function from the pybaseball library to find Max Scherzer\'s MLBAM player ID.\n   - Pass in "Scherzer" as the last name and "Max" as the first name.\n   \n2. Use the `statcast_pitcher` function to retrieve Max Scherzer\'s pitching data.\n   - Pass in the appropriate date range and Max Scherzer\'s MLBAM player ID.\n   \n3. Filter the retrieved data to only include curveballs thrown by Max Scherzer.\n   - Use the `pitch_type` field to filter for curveballs.\n   \n4. Calculate the average vector for Max Scherzer\'s curveballs using relevant fields such as `release_speed`, `r

In [7]:
# Turn 3: approve the revised plan. In the recorded run this routed to the
# 'Convert' step, which generated code and executed it statement by statement.
# NOTE(review): the generated code queried 2021-04-01..2021-10-01 rather than
# the first week of August 2020 named in the initial task, and the run stopped
# at knn.fit(...) with ValueError('Input X contains NaN...'). The per-pitcher
# average vectors likely need NaN rows dropped before fitting - confirm and
# re-run before relying on this output.
task = "approved"
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Convert'}}
----
{'Convert': {'code': '\n# Importing necessary libraries\nfrom pybaseball import playerid_lookup, statcast_pitcher, statcast\nfrom sklearn.neighbors import NearestNeighbors\nimport pandas as pd\n\n# Step 1: Find Max Scherzer\'s MLBAM player ID\nplayer_id_df = playerid_lookup(\'Scherzer\', \'Max\')\nplayer_id = player_id_df[\'key_mlbam\'].values[0]\n\n# Step 2: Retrieve Max Scherzer\'s pitching data\nsch_pitch_data = statcast_pitcher(\'2021-04-01\', \'2021-10-01\', player_id)\n\n# Step 3: Filter data to only include curveballs thrown by Max Scherzer\nsch_curveballs = sch_pitch_data[sch_pitch_data[\'pitch_type\'] == \'CU\']\n\n# Step 4: Calculate the average vector for Max Scherzer\'s curveballs\nsch_avg_vector = sch_curveballs[[\'release_speed\', \'release_spin_rate\', \'release_extension\']].mean()\n\n# Step 5: Retrieve a comprehensive dataset of pitchers and their pitches\nall_pitch_data = statcast(\'2021-04-01\', \'2021-10-01\')\n\n# Step 6: F

  0%|          | 0/184 [00:00<?, ?it/s]

Result: Gathering Player Data

Executing: # Step 3: Filter data to only include curveballs thrown by Max Scherzer
sch_curveballs = sch_pitch_data[sch_pitch_data['pitch_type'] == 'CU']
Executing: # Step 4: Calculate the average vector for Max Scherzer's curveballs
sch_avg_vector = sch_curveballs[['release_speed', 'release_spin_rate', 'release_extension']].mean()
Executing: # Step 5: Retrieve a comprehensive dataset of pitchers and their pitches
all_pitch_data = statcast('2021-04-01', '2021-10-01')


100%|██████████| 184/184 [02:29<00:00,  1.23it/s]


Result: This is a large query, it may take a moment to complete

Executing: # Step 6: Filter this dataset to only include curveballs
all_curveballs = all_pitch_data[all_pitch_data['pitch_type'] == 'CU']
Executing: # Step 7: Group the data by pitcher and calculate the average vector for each pitcher's curveballs
pitcher_avg_vectors = all_curveballs.groupby('player_name')[['release_speed', 'release_spin_rate', 'release_extension']].mean()
Executing: # Exclude Max Scherzer's data from the training set
pitcher_avg_vectors = pitcher_avg_vectors[pitcher_avg_vectors.index != 'Max Scherzer']
Executing: # Step 8: Train a KNN model on this dataset of average vectors for each pitcher
knn = NearestNeighbors(n_neighbors=3)
knn.fit(pitcher_avg_vectors)
Result: ValueError('Input X contains NaN.\nNearestNeighbors does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing 

In [8]:
# Turn 4: answer 'yes'. In the recorded run the orchestrator routed this to the
# 'Memorize' step, which reported that the task was committed to memory.
# NOTE(review): the preceding turn ended in a ValueError, so this appears to
# memorize a plan whose code did not run to completion - confirm this is intended.
task = "yes"
result = baseball_lambda.execute_workflow(task, session_id)
print(result)

{'Orchestrate': {'next': 'Memorize'}}
----
{'Memorize': {'messages': [AIMessage(content='Thank you, task has been commited to memory')], 'previous_node': 'Memorize'}}
----
Thank you, task has been commited to memory
