In [1]:
import streamlit as st
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import altair as alt
import requests
from io import StringIO
import re


In [2]:
def load_data(filepath):
    """Read the CSV at ``filepath`` with pandas and return the resulting DataFrame."""
    return pd.read_csv(filepath)

In [12]:
# Pitcher box scores: ordered by game_date (ascending), then reduced to the
# rows whose isStarter column is truthy.
pitcher_boxscores = (
    load_data('datasets/complex_pitchers.csv')
    .sort_values(by='game_date', ascending=True)
    .pipe(lambda boxscores: boxscores[boxscores['isStarter']])
)

# Remaining raw inputs, loaded unchanged.
simple_basic_gamelogs = load_data('datasets/basicGameLogs.csv')
r_per_inning = load_data('datasets/r_per_inning.csv')



In [13]:

# Sum each inning column ('1' through '9') within every (game_id, homeOrAway)
# group, then turn the group keys back into ordinary columns.
grouped_runs_df = (
    r_per_inning
    .groupby(['game_id', 'homeOrAway'])
    .agg({str(inning): 'sum' for inning in range(1, 10)})
    .reset_index()
)

In [14]:

# Distinct Team values among the rows whose seasonNumber equals 2024.
teams = pitcher_boxscores.loc[pitcher_boxscores['seasonNumber'] == 2024, 'Team'].unique()

# Per-team lookup tables; both start out empty.
game_id_dict, game_date_dict = {}, {}

In [15]:
# Record every game_id and game_date found in pitcher_boxscores for each team.
for team in teams:
    # Select the team's rows once and reuse them for both lookups.
    team_rows = pitcher_boxscores[pitcher_boxscores['Team'] == team]
    game_id_dict[team] = team_rows['game_id'].to_list()
    game_date_dict[team] = team_rows['game_date'].to_list()


In [26]:
# Inning to collect; str(inning_selection) is used below as a column name.
inning_selection = 1

# team name -> list of the selected inning's values, ordered by game_date descending.
nrfi_dict = {}

number_of_games = 5  # not referenced anywhere in this cell

# The merged frame does not depend on the team, so it is built once here
# instead of being recomputed on every loop iteration.
# NOTE(review): grouped_runs_df is grouped by (game_id, homeOrAway) but joined
# on game_id only, so each pitcher_boxscores row is paired with every
# homeOrAway row of its game -- confirm that each team's list is meant to
# contain the values from both of those rows.
merged_df = pd.merge(grouped_runs_df, pitcher_boxscores, on='game_id', how='inner').sort_values(by='game_date')
merged_df = merged_df[::-1]

for team_name in sorted(teams):
    game_id_list = game_id_dict[team_name]

    # Every merged row belonging to a game this team appears in.
    full_df = merged_df[merged_df['game_id'].isin(game_id_list)].copy()

    # Only the rows attributed to the team itself, newest game_date first.
    team_df = full_df[full_df['Team'] == team_name].sort_values(by='game_date', ascending=False)

    stat_list = team_df[str(inning_selection)].to_list()

    nrfi_dict[team_name] = stat_list


In [52]:
def calculate_percentage(row, df, window=10, threshold=0.5):
    """Percentage of the ``window`` rows before ``row`` whose run_values exceed ``threshold``.

    Intended for ``df.apply(..., axis=1)``: ``row.name`` is used as a *position*
    into ``df['run_values']`` (via ``iloc``), so ``df`` must have a default
    0..n-1 integer index for the result to refer to the right rows.
    "Before" means earlier positions in ``df``, whatever order the caller
    sorted it in.

    Parameters
    ----------
    row : pandas.Series
        Row currently being processed; only ``row.name`` is read.
    df : pandas.DataFrame
        Frame with a ``'run_values'`` column.
    window : int, default 10
        Number of preceding rows to inspect.
    threshold : float, default 0.5
        A preceding value counts when it is strictly greater than this.

    Returns
    -------
    float or None
        ``count / window * 100``, or ``None`` for the first ``window`` rows,
        which do not have enough preceding rows.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    current_index = row.name
    if current_index < window:
        return None  # Not enough preceding rows to fill the window

    previous_values = df['run_values'].iloc[current_index - window:current_index]
    # Comparison against NaN is False, so missing values are never counted.
    count_above_threshold = (previous_values > threshold).sum()
    return (count_above_threshold / window) * 100

In [65]:
# For every team: compute, per row, the value returned by calculate_percentage,
# bucket the rows by that value, and print the mean of isYRFI and of isNRFI
# within each bucket.

# Bucket edges and labels are the same for every team, so define them once.
bins = [0, 15, 35, 55, 75, 95, 100]
labels = ['0-15%', '15-35%', '35-55%', '55-75%', '75-95%', '95-100%']

for team_name in teams:
    df = pd.DataFrame(nrfi_dict[team_name], columns=['run_values'])

    df['percentage_greater_than_1'] = df.apply(lambda row: calculate_percentage(row, df), axis=1)

    # Flags for run_values below / above 0.5.
    df['isNRFI'] = (df['run_values'] < 0.5).astype(int)
    df['isYRFI'] = (df['run_values'] > 0.5).astype(int)

    # Drop the rows for which calculate_percentage returned no value.
    # .copy() makes the result an independent frame, so adding a column below
    # cannot trigger pandas' SettingWithCopyWarning.
    df = df.dropna(subset=['percentage_greater_than_1']).copy()

    df['percentage_category'] = pd.cut(df['percentage_greater_than_1'], bins=bins, labels=labels, include_lowest=True)

    # Same report for both outcomes; `result` ends up holding the NRFI series.
    for outcome_label, outcome_column in (('YRFI', 'isYRFI'), ('NRFI', 'isNRFI')):
        result = df.groupby('percentage_category', observed=False)[outcome_column].mean()
        print(team_name)
        print(outcome_label)
        print(result)


San Diego Padres
YRFI
percentage_category
0-15%      0.287379
15-35%     0.281273
35-55%     0.285714
55-75%     0.391608
75-95%     0.333333
95-100%         NaN
Name: isYRFI, dtype: float64
San Diego Padres
NRFI
percentage_category
0-15%      0.712621
15-35%     0.718727
35-55%     0.714286
55-75%     0.608392
75-95%     0.666667
95-100%         NaN
Name: isNRFI, dtype: float64
Los Angeles Dodgers
YRFI
percentage_category
0-15%      0.314433
15-35%     0.296123
35-55%     0.290727
55-75%     0.333333
75-95%     0.250000
95-100%         NaN
Name: isYRFI, dtype: float64
Los Angeles Dodgers
NRFI
percentage_category
0-15%      0.685567
15-35%     0.703877
35-55%     0.709273
55-75%     0.666667
75-95%     0.750000
95-100%         NaN
Name: isNRFI, dtype: float64
New York Yankees
YRFI
percentage_category
0-15%      0.271100
15-35%     0.297336
35-55%     0.318357
55-75%     0.232558
75-95%          NaN
95-100%         NaN
Name: isYRFI, dtype: float64
New York Yankees
NRFI
percentage_catego

In [59]:
# NOTE(review): the filtered frame on the next line is neither assigned nor the
# cell's last expression, so it is discarded; only the bare `df` below is shown.
df[df['percentage_greater_than_1'] == 90]
# NOTE(review): `df` is not defined in this cell -- presumably it is the frame
# left in that name by an earlier cell; confirm which one before relying on it.
df

Unnamed: 0,run_values,percentage_greater_than_1,isNRFI,isYRFI,percentage_category
10,0,30.0,1,0,15-35%
11,0,30.0,1,0,15-35%
12,0,30.0,1,0,15-35%
13,2,30.0,0,1,15-35%
14,1,30.0,0,1,15-35%
...,...,...,...,...,...
2809,1,20.0,0,1,15-35%
2810,2,10.0,0,1,0-15%
2811,0,0.0,1,0,0-15%
2812,0,0.0,1,0,0-15%
