In [20]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid
import os
import csv 

prefixesUpper = ['HOU', 'LAA', 'OAK', 'SEA', 'TEX', 'BAL', 'BOS', 'NYY', 'TBR', 'TOR', 'CHW', 'CLE', 'DET', 'KCR', 'MIN', 'ATL', 'MIA', 'NYM', 'PHI', 'WSN', 'CHC', 'CIN', 'MIL', 'PIT', 'STL', 'ARI', 'COL', 'LAD', 'SDP', 'SFG']
prefixesLower = ['hou', 'laa', 'oak', 'sea', 'tex', 'bal', 'bos', 'nyy', 'tbr', 'tor', 'chw', 'cle', 'det', 'kcr', 'min', 'atl', 'mia', 'nym', 'phi', 'wsn', 'chc', 'cin', 'mil', 'pit', 'stl', 'ari', 'col', 'lad', 'sdp', 'sfg']

def makeGameCsvPaths(list):
    csvPaths = []
    for prefix in list:
        currentPath = f"C:/data/2022/game/{prefix}-games.csv"
        csvPaths.append(currentPath)
    return csvPaths

def makeBattingCsvPaths(list):
    csvPaths = []
    for prefix in list:
        currentPath = f"C:/data/2022/batting/{prefix}-batting.csv"
        csvPaths.append(currentPath)
    return csvPaths

def makePitchingCsvPaths(list):
    csvPaths = []
    for prefix in list:
        currentPath = f"C:/data/2022/pitching/{prefix}-pitching.csv"
        csvPaths.append(currentPath)
    return csvPaths

        
def readCsvPaths(paths):
    dataFromCsvList = []
    # can add columnsToDrop = ['Date', 'Time', 'Attendance'] and then in append data.drop(columnsToDrop, axis=1)
    for path in paths:
        currentCsvData = pd.read_csv(path)
        dataFromCsvList.append(currentCsvData)
    return pd.concat(dataFromCsvList)

gameDataFiles = makeGameCsvPaths(prefixesLower)
battingDataFiles = makeBattingCsvPaths(prefixesLower)
pitchingDataFiles = makePitchingCsvPaths(prefixesLower)
data = readCsvPaths(gameDataFiles)

#iterate_csv_data(gameData)


In [27]:
#clean data 

def clean_batting_data(filename):
    df = pd.read_csv(filename)

    # Replace any string-based missing values with numpy NaN
    df.replace('?', np.nan, inplace=True)

    # Convert percentage columns to float
    for col in ['BB%', 'K%']:
        df[col] = df[col].str.rstrip('%').astype('float') / 100.0

    # Convert all other numeric columns to float
    numeric_cols = ["G","PA","HR","R","RBI","SB","ISO","BABIP","AVG","OBP","SLG","wOBA","xwOBA","wRC+","BsR","Off","Def","WAR","playerid"]
    for col in numeric_cols:
        df[col] = df[col].astype(float)

    # Handle missing values - this is just one approach
    for col in df.columns:
        if df[col].dtype == np.float64:
            df[col].fillna(df[col].mean(), inplace=True)

    # Return the cleaned data
    return df

def clean_pitching_data(filename):
    df = pd.read_csv(filename)

    # Replace any string-based missing values with numpy NaN
    df.replace('?', np.nan, inplace=True)

    # Convert percentage columns to float
    for col in ['LOB%', 'GB%', 'HR/FB']:
        df[col] = df[col].str.rstrip('%').astype('float') / 100.0

    # Convert all other numeric columns to float
    numeric_cols = ["W","L","SV","G","GS","IP","K/9","BB/9","HR/9","BABIP","vFA (pi)","ERA","xERA","FIP","xFIP","WAR","playerid"]
    for col in numeric_cols:
        df[col] = df[col].astype(float)

    # Handle missing values
    for col in df.columns:
        if df[col].dtype == np.float64:
            df[col].fillna(df[col].mean(), inplace=True)

    return df

def clean_and_combine_batting_data(csv_files):
    # Create an empty DataFrame to store all the data
    all_data = pd.DataFrame()

    for file_path in csv_files:
        # Clean the data
        df = clean_batting_data(file_path)

        # Add a new column to indicate the team
        team_name = os.path.basename(file_path).split('-')[0]
        df['Team'] = team_name

        # Append the data to the main DataFrame
        all_data = all_data.append(df)

    # Reset the index of the combined DataFrame
    all_data.reset_index(drop=True, inplace=True)

    return all_data

def clean_and_combine_pitching_data(csv_files):
    # Create an empty DataFrame to store all the data
    all_data = pd.DataFrame()

    # Iterate over every file in the list
    for file_path in csv_files:
        # Clean the data
        df = clean_pitching_data(file_path)

        # Add a new column to indicate the team
        team_name = os.path.basename(file_path).split('-')[0]
        df['Team'] = team_name

        # Append the data to the main DataFrame
        all_data = all_data.append(df)

    # Reset the index of the combined DataFrame
    all_data.reset_index(drop=True, inplace=True)

    return all_data

def gb_to_float(gb):
    if isinstance(gb, float):
        return gb  # Return the float value as it is
    gb_split = gb.split()
    if 'up' in gb:
        if len(gb_split) > 1:
            return -float(gb_split[1])
        else:
            return 0.0  # or whatever value makes sense in this context
    elif 'Tied' in gb:
        return 0.0
    else:
        if len(gb_split) > 0:
            return float(gb_split[0])
        else:
            return 0.0  # or whatever value makes sense in this context



def clean_game_data(filename):
    # Load the data
    df = pd.read_csv(filename)

    # Replace any string-based missing values with numpy NaN
    df.replace('?', np.nan, inplace=True)

    # Convert the W/L column to 1 for a win and 0 for a loss
    df['W/L'] = df['W/L'].apply(lambda x: 1 if 'W' in x else 0)

    # Convert GB to a numeric value
    df['GB'] = df['GB'].apply(gb_to_float)

    # Convert numeric columns to float
    numeric_cols = ["Gm#", "R", "RA", "Inn", "Rank", "Time", "Attendance", "cLI"]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Handle missing values - this is just one approach
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col].fillna('Unknown', inplace=True)
        else:
            df[col].fillna(0, inplace=True)

    return df

def clean_and_combine_game_data(csv_files):
    # Create an empty DataFrame to store all the data
    all_data = pd.DataFrame()

    # Iterate over every file in the list
    for file_path in csv_files:
        # Clean the data
        df = clean_game_data(file_path)

        # Add a new column to indicate the team
        team_name = os.path.basename(file_path).split('-')[0]
        df['Team'] = team_name

        # Append the data to the main DataFrame
        all_data = all_data.append(df)

    # Reset the index of the combined DataFrame
    all_data.reset_index(drop=True, inplace=True)

    return all_data

pitching_data = clean_and_combine_pitching_data(pitchingDataFiles)
batting_data = clean_and_combine_batting_data(battingDataFiles)
game_data = clean_and_combine_game_data(gameDataFiles)
print(pitching_data.groupby('Team').size())

  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data

Team
ari    18
atl    18
bal    22
bos    19
chc    19
chw    19
cin    17
cle    16
col    20
det    18
hou    19
kcr    22
laa    18
lad    17
mia    17
mil    22
min    21
nym    18
nyy    18
oak    15
phi    23
pit    17
sdp    18
sea    18
sfg    17
stl    21
tbr    19
tex    19
tor    20
wsn    18
dtype: int64


  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)
  all_data = all_data.append(df)


In [2]:
# Replace 'W' with 1 and 'L' with 0
print(data)
data = data.replace({'W': 1, 'W-wo':1, 'L': 0, 'L-wo': 0})
# Preprocess the data
X = data[['R', 'RA', 'Inn', 'Tm']] # replace with the names of the relevant features
y = data['W/L'] # replace with the name of the label column
X = (X - X.mean()) / X.std() # standardize the features
y = y.values.reshape(-1, 1) # reshape the label to a column vector
X = X.astype('float32') # convert X to float32
y = tf.convert_to_tensor(y, dtype=tf.float32)

# Split the data into training and testing sets
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Build the neural network model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

     Gm#             Date Unnamed: 2   Tm Unnamed: 4  Opp   W/L   R  RA   Inn  \
0      1   Thursday Apr 7   boxscore  HOU          @  LAA     W   3   1   NaN   
1      2     Friday Apr 8   boxscore  HOU          @  LAA     W  13   6   NaN   
2      3   Saturday Apr 9   boxscore  HOU          @  LAA     L   0   2   NaN   
3      4    Sunday Apr 10   boxscore  HOU          @  LAA     W   4   1   NaN   
4      5   Tuesday Apr 12   boxscore  HOU          @  ARI     W   2   1   NaN   
..   ...              ...        ...  ...        ...  ...   ...  ..  ..   ...   
157  158   Saturday Oct 1   boxscore  SFG        NaN  ARI     L   4   8   NaN   
158  159     Sunday Oct 2   boxscore  SFG        NaN  ARI  W-wo   4   3  10.0   
159  160     Monday Oct 3   boxscore  SFG          @  SDP     L   4   7   NaN   
160  161    Tuesday Oct 4   boxscore  SFG          @  SDP     L   2   6   NaN   
161  162  Wednesday Oct 5   boxscore  SFG          @  SDP     W   8   1   NaN   

     ...      GB          W

  X = (X - X.mean()) / X.std() # standardize the features
  X = (X - X.mean()) / X.std() # standardize the features


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2130e5e2a40>