In [33]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid
import os
import csv 

TEAMS = ['hou', 'laa', 'oak', 'sea', 'tex', 'bal', 'bos', 'nyy', 'tbr', 'tor', 
         'chw', 'cle', 'det', 'kcr', 'min', 'atl', 'mia', 'nym', 'phi', 'wsn', 
         'chc', 'cin', 'mil', 'pit', 'stl', 'ari', 'col', 'lad', 'sdp', 'sfg']

def make_csv_paths(teams, data_type):
    base_path = f"C:/data/2022/{data_type}"
    return [f"{base_path}/{team}-{data_type}.csv" for team in teams]

def read_csv_paths(paths):
    dataframes = [pd.read_csv(path) for path in paths]
    return pd.concat(dataframes)

game_data_files = make_csv_paths(TEAMS, 'games')
batting_data_files = make_csv_paths(TEAMS, 'batting')
pitching_data_files = make_csv_paths(TEAMS, 'pitching')

In [53]:
def clean_data(df, percentage_cols, numeric_cols):
    # Replace any string-based missing values with numpy NaN
    df.replace('?', np.nan, inplace=True)

    # Convert percentage columns to float
    for col in percentage_cols:
        df[col] = df[col].str.rstrip('%').astype('float') / 100.0

    # Convert all other numeric columns to float
    for col in numeric_cols:
        df[col] = df[col].astype(float)

    # Handle missing values
    for col in df.columns:
        if df[col].dtype == np.float64:
            df[col].fillna(df[col].mean(), inplace=True)

    return df

def clean_and_combine_data(clean_function, csv_files):
    # Create an empty DataFrame to store all the data
    all_data = pd.DataFrame()

    for file_path in csv_files:
        # Clean the data
        df = clean_function(file_path)

        # Add a new column to indicate the team
        team_name = os.path.basename(file_path).split('-')[0]
        df['Team'] = team_name

        # Append the data to the main DataFrame
        all_data = all_data.append(df)

    # Reset the index of the combined DataFrame
    all_data.reset_index(drop=True, inplace=True)

    return all_data

def clean_batting_data(filename):
    df = pd.read_csv(filename)
    percentage_cols = ['BB%', 'K%']
    numeric_cols = ["G","PA","HR","R","RBI","SB","ISO","BABIP","AVG","OBP","SLG","wOBA","xwOBA","wRC+","BsR","Off","Def","WAR","playerid"]
    return clean_data(df, percentage_cols, numeric_cols)

def clean_pitching_data(filename):
    df = pd.read_csv(filename)
    percentage_cols = ['LOB%', 'GB%', 'HR/FB']
    numeric_cols = ["W","L","SV","G","GS","IP","K/9","BB/9","HR/9","BABIP","vFA (pi)","ERA","xERA","FIP","xFIP","WAR","playerid"]
    return clean_data(df, percentage_cols, numeric_cols)

def clean_game_data(filename):
    # Load the data
    df = pd.read_csv(filename)

    # Replace any string-based missing values with numpy NaN
    df.replace('?', np.nan, inplace=True)

    # Convert the W/L column to 1 for a win and 0 for a loss
    df['W/L'] = df['W/L'].apply(lambda x: 1 if 'W' in x else 0)

    # Convert GB to a numeric value
    df['GB'] = df['GB'].apply(gb_to_float)

    # Handle missing values for numeric columns
    numeric_cols = ["Gm#", "R", "RA", "Inn", "Rank", "Attendance", "cLI"]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Handle missing values for non-numeric columns
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col].fillna('Unknown', inplace=True)

    return df

pitching_data = clean_and_combine_data(clean_pitching_data, pitching_data_files)
batting_data = clean_and_combine_data(clean_batting_data, batting_data_files)
game_data = clean_and_combine_data(clean_game_data, game_data_files)

#spread season totals out across each game 
# Get the number of games each team played
# Get the number of games each team played
num_games = game_data['Team'].value_counts()

# List of numeric columns in the pitching and batting data
# dividing certain columns (like 'W', 'L', 'SV', 'HR', etc.) is bad change this
pitching_numeric_cols = ['W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9', 'BB/9', 'HR/9',
       'BABIP', 'LOB%', 'GB%', 'HR/FB', 'vFA (pi)', 'ERA', 'xERA', 'FIP',
       'xFIP', 'WAR', 'playerid']

batting_numeric_cols = ['G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'ISO',
       'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'xwOBA', 'wRC+', 'BsR', 'Off',
       'Def', 'WAR', 'playerid']

# Divide the seasonal statistics by the number of games
pitching_data_per_game = pitching_data.copy()
batting_data_per_game = batting_data.copy()

for team in num_games.index:
    pitching_data_per_game.loc[pitching_data_per_game['Team'] == team, pitching_numeric_cols] /= num_games[team]
    batting_data_per_game.loc[batting_data_per_game['Team'] == team, batting_numeric_cols] /= num_games[team]
    pitching_data_per_game['Gm#'] = pitching_data_per_game.groupby('Team').cumcount() + 1
    batting_data_per_game['Gm#'] = batting_data_per_game.groupby('Team').cumcount() + 1





  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data

In [54]:
#normalize and preprocess data 
print("Pitching data columns: ", pitching_data_per_game.columns)
print("Batting data columns: ", batting_data_per_game.columns)


# Merge the three dataframes on 'Team' and 'Gm#'
combined_data = pd.merge(pitching_data_per_game, batting_data_per_game, on=['Team', 'Gm#'])
combined_data = pd.merge(combined_data, game_data, on=['Team', 'Gm#'])


# Sort by 'Team' and 'Gm#' to get temporal data for each team
combined_data.sort_values(by=['Team', 'Gm#'], inplace=True)

# Group by 'Team' to get sequences of data for each team
combined_data_grouped = combined_data.groupby('Team').apply(lambda x: x.values)

# Now, `combined_data_grouped` is a Series where each element is a 2D numpy array representing the sequence of data for a team.
# You can convert this to a 3D numpy array for input to your model:
combined_data_array = np.array([x for x in combined_data_grouped])


import sklearn
from sklearn.preprocessing import MinMaxScaler

pitching_data_grouped = pitching_data.groupby('Team').mean()
batting_data_grouped = batting_data.groupby('Team').mean()
game_data_grouped = game_data.groupby('Team').mean()

scaler = MinMaxScaler()

pitching_data_normalized = pd.DataFrame(scaler.fit_transform(pitching_data_grouped), columns=pitching_data_grouped.columns, index=pitching_data_grouped.index)
batting_data_normalized = pd.DataFrame(scaler.fit_transform(batting_data_grouped), columns=batting_data_grouped.columns, index=batting_data_grouped.index)
game_data_normalized = pd.DataFrame(scaler.fit_transform(game_data_grouped), columns=game_data_grouped.columns, index=game_data_grouped.index)


Pitching data columns:  Index(['Name', 'Team', 'W', 'L', 'SV', 'G', 'GS', 'IP', 'K/9', 'BB/9', 'HR/9',
       'BABIP', 'LOB%', 'GB%', 'HR/FB', 'vFA (pi)', 'ERA', 'xERA', 'FIP',
       'xFIP', 'WAR', 'playerid', 'Gm#'],
      dtype='object')
Batting data columns:  Index(['Name', 'Team', 'G', 'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'ISO',
       'BABIP', 'AVG', 'OBP', 'SLG', 'wOBA', 'xwOBA', 'wRC+', 'BsR', 'Off',
       'Def', 'WAR', 'playerid', 'Gm#'],
      dtype='object')


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (30,) + inhomogeneous part.

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense

# Assume input_shape is (steps, input_dim)
input_shape = (100, 1)  # You need to determine the right shape

model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())  # This is needed before the fully connected layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# Assume X_train, y_train are your training data and labels
# And X_test, y_test are your testing data and labels

history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 98, 64)            256       
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 49, 64)           0         
 1D)                                                             
                                                                 
 conv1d_3 (Conv1D)           (None, 47, 128)           24704     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 23, 128)          0         
 1D)                                                             
                                                                 
 dropout_2 (Dropout)         (None, 23, 128)           0         
                                                                 
 flatten_1 (Flatten)         (None, 2944)             

ValueError: in user code:

    File "C:\Users\jeb\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\jeb\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\jeb\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\jeb\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\jeb\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\jeb\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 100, 1), found shape=(None, 4)
