# CS545: Machine Learning
## Fall 2019 - Final Project

Brent Staab and Seth Hughes

## Import code needed in this notebook

In [1]:
import numpy as np
import pandas
import torch
import copy

import os
import sys
import time

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

from sys import platform

## Definition of `NeuralNetwork_Convolutional` (from A4)

In [2]:
class NeuralNetwork_Convolutional():
    # Initilization function (a.k.a. constructor) for object
    def __init__(self, 
                 n_channels_in_image,         # (int) number of values per pixel 
                 image_size,                  # (int) number of rows in image, same as number of columns
                 n_units_in_conv_layers,      # (list of ints) number of units in each convolutional layer
                 kernels_size_and_stride,     # (list of lists) each list is [kernel_size, kernel_stride] for each convolutional layer
                 n_units_in_fc_hidden_layers, # (list of ints) number of units in fully-connected layers
                 classes,                     # (list of ints) labels for each class
                 use_gpu=False):              # (boolean) flag indicating GPU use or not (default = no GPU)

        if not isinstance(n_units_in_conv_layers, list):
            raise Exception('n_units_in_conv_layers must be a list')

        if not isinstance(n_units_in_fc_hidden_layers, list):
            raise Exception('n_units_in_fc_hidden_layers must be a list')
        
        if use_gpu and not torch.cuda.is_available():
            print('\nGPU is not available. Running on CPU.\n')
            use_gpu = False

        self.n_channels_in_image = n_channels_in_image
        self.image_size = image_size 
        self.n_units_in_conv_layers = n_units_in_conv_layers
        self.n_units_in_fc_hidden_layers = n_units_in_fc_hidden_layers
        self.kernels_size_and_stride = kernels_size_and_stride
        self.n_outputs = len(classes)
        self.classes = np.array(classes)
        self.use_gpu = use_gpu
        
        self.n_conv_layers = len(self.n_units_in_conv_layers)
        self.n_fc_hidden_layers = len(self.n_units_in_fc_hidden_layers)
      
        # Build the net layers
        self.nnet = torch.nn.Sequential()

        # Add convolutional layers
        n_units_previous = self.n_channels_in_image
        output_size_previous = self.image_size
        n_layers = 0
        if self.n_conv_layers > 0:

            for (n_units, kernel) in zip(self.n_units_in_conv_layers, self.kernels_size_and_stride):
                n_units_previous, output_size_previous = self._add_conv2d_tanh(n_layers,
                                        n_units_previous, output_size_previous, n_units, kernel)
                n_layers += 1 # for text label in layer
                
        self.nnet.add_module('flatten', torch.nn.Flatten())  # prepare for fc layers

        n_inputs = output_size_previous ** 2 * n_units_previous
        if self.n_fc_hidden_layers > 0:
            for n_units in self.n_units_in_fc_hidden_layers:
                n_inputs = self._add_fc_tanh(n_layers, n_inputs, n_units)
                n_layers += 1

        self.nnet.add_module(f'output_{n_layers}', torch.nn.Linear(n_inputs, self.n_outputs))
        
        # Define loss and optimizer functions
        self.loss_F = torch.nn.CrossEntropyLoss()
        
        # Member variables for standardization
        self.Xmeans = None
        self.Xstds = None

        if self.use_gpu:
            self.nnet.cuda()

        self.n_epochs = 0
        self.error_trace = []

    def _add_conv2d_tanh(self, n_layers, n_units_previous, output_size_previous,
                   n_units, kernel_size_and_stride):
        kernel_size, kernel_stride = kernel_size_and_stride
        self.nnet.add_module(f'conv_{n_layers}', torch.nn.Conv2d(n_units_previous, n_units,
                                                                 kernel_size, kernel_stride))
        self.nnet.add_module(f'output_{n_layers}', torch.nn.Tanh())
        output_size_previous = (output_size_previous - kernel_size) // kernel_stride + 1
        n_units_previous = n_units                
        return n_units_previous, output_size_previous
    
    def _add_fc_tanh(self, n_layers, n_inputs, n_units):
        self.nnet.add_module(f'linear_{n_layers}', torch.nn.Linear(n_inputs, n_units))
        self.nnet.add_module(f'output_{n_layers}', torch.nn.Tanh())
        n_inputs = n_units
        return n_inputs

    def __repr__(self):
        str = f'''{type(self).__name__}(
                            n_channels_in_image={self.n_channels_in_image},
                            image_size={self.image_size},
                            n_units_in_conv_layers={self.n_units_in_conv_layers},
                            kernels_size_and_stride={self.kernels_size_and_stride},
                            n_units_in_fc_hidden_layers={self.n_units_in_fc_hidden_layers},
                            classes={self.classes},
                            use_gpu={self.use_gpu})'''

        str += self.nnet
        if self.n_epochs > 0:
            str += f'\n   Network was trained for {self.n_epochs} epochs that took {self.training_time:.4f} seconds.'
            str += f'\n   Final objective value is {self.error_trace[-1]:.3f}'
        else:
            str += '  Network is not trained.'
        return str
        
    def _standardizeX(self, X):
        result = (X - self.Xmeans) / self.XstdsFixed
        result[:, self.Xconstant] = 0.0
        return result

    def _unstandardizeX(self, Xs):
        return self.Xstds * Xs + self.Xmeans

    def _setup_standardize(self, X, T):
        if self.Xmeans is None:
            self.Xmeans = X.mean(axis=0)
            self.Xstds = X.std(axis=0)
            self.Xconstant = self.Xstds == 0
            self.XstdsFixed = copy.copy(self.Xstds)
            self.XstdsFixed[self.Xconstant] = 1

    def train(self, X, T, n_epochs, learning_rate=0.01):

        start_time = time.time()
        
        self.learning_rate = learning_rate

        if T.ndim == 1:
            T = T.reshape((-1, 1))

        _, T = np.where(T == self.classes)  # convert to labels from 0

        self._setup_standardize(X, T)
        X = self._standardizeX(X)

        X = torch.tensor(X)
        T = torch.tensor(T.reshape(-1))
        if self.use_gpu:
            X = X.cuda()
            T = T.cuda()

        optimizer = torch.optim.Adam(self.nnet.parameters(), lr=self.learning_rate)
        
        # You fill in the rest of the train function, following lecture notes example.
        for epoch in range(n_epochs):
            optimizer.zero_grad()

            Y = self.nnet(X)

            error = self.loss_F(Y, T)
            self.error_trace.append(error)
            if epoch % 5 == 0:
                print(f'Epoch {epoch} error {error:.5f}')

            error.backward()

            optimizer.step()
        
        self.training_time = time.time() - start_time
        
    def get_error_trace(self):
        return self.error_trace
    
    def _softmax(self, Y):
        mx = Y.max()
        expY = np.exp(Y - mx)
        denom = expY.sum(axis=1).reshape((-1, 1)) + sys.float_info.epsilon
        return expY / denom
    
    def use(self, X):
        self.nnet.eval()  # turn off gradients and other aspects of training
        X = self._standardizeX(X)
        X = torch.tensor(X)
        if self.use_gpu:
            X = X.cuda()

        Y = self.nnet(X)

        if self.use_gpu:
            Y = Y.cpu()
        Y = Y.detach().numpy()
        Yclasses = self.classes[Y.argmax(axis=1)].reshape((-1, 1))

        return Yclasses, self._softmax(Y)

## DATA

##### Download the data

In [3]:
# I'm not sure why this doesn't work.  When I run the following two commands, I get the
# output at the bottom of this cell.  I had to manually download from the browser and
# unzip for this notebook to work.

#!curl -O https://github.com/chadwickbureau/baseballdatabank/archive/v2019.2.zip
#!unzip -o v2019.2.zip

#---------------------------------------------------------------------------------
#   % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
#                                  Dload  Upload   Total   Spent    Left  Speed
# 100   137    0   137    0     0    193      0 --:--:-- --:--:-- --:--:--   193
# Archive:  v2019.2.zip
#   End-of-central-directory signature not found.  Either this file is not
#   a zipfile, or it constitutes one disk of a multi-part archive.  In the
#   latter case the central directory and zipfile comment will be found on
#   the last disk(s) of this archive.
# unzip:  cannot find zipfile directory in one of v2019.2.zip or
#         v2019.2.zip.zip, and cannot find v2019.2.zip.ZIP, period.

#### Import data from files

In [4]:
if platform == 'linux':
    people_file = 'baseballdatabank-2019.2/baseballdatabank-2019.2/core/People.csv'
    batting_file = 'baseballdatabank-2019.2/baseballdatabank-2019.2/core/Batting.csv'
    fielding_file = 'baseballdatabank-2019.2/baseballdatabank-2019.2/core/Fielding.csv'
    pitchcing_file = 'baseballdatabank-2019.2/baseballdatabank-2019.2/core/Pitching.csv'
    halloffame_file = 'baseballdatabank-2019.2/baseballdatabank-2019.2/core/HallOfFame.csv'
    allstarfull_filie = 'baseballdatabank-2019.2/baseballdatabank-2019.2/core/AllstarFull.csv'
else:
    print("windows")

    people_file = 'C:\\Users\\user\\Notebooks\\CS545\\Project\\baseballdatabank-2019.2\\core\\People.csv'
    batting_file = 'C:\\Users\\user\\Notebooks\\CS545\\Project\\baseballdatabank-2019.2\\core\\Batting.csv'
    fielding_file = 'C:\\Users\\user\\Notebooks\\CS545\\Project\\baseballdatabank-2019.2\\core\\Fielding.csv'
    pitchcing_file = 'C:\\Users\\user\\Notebooks\\CS545\\Project\\baseballdatabank-2019.2\\core\\Pitching.csv'
    halloffame_file = 'C:\\Users\\user\\Notebooks\\CS545\\Project\\baseballdatabank-2019.2\\core\\HallofFame.csv'
    allstarfull_filie = 'C:\\Users\\user\\Notebooks\\CS545\\Project\\baseballdatabank-2019.2\\core\\AllstarFull.csv'

# Players
df_players = pandas.read_csv(people_file)
df_players = df_players['playerID']

# Batting information for players
df_batting = pandas.read_csv(batting_file)
df_batting.fillna(0, inplace=True)

# Fielding information
df_fielding = pandas.read_csv(fielding_file)
df_fielding.fillna(0, inplace=True)

# Pitching information
df_pitching = pandas.read_csv(pitchcing_file)
df_pitching.fillna(0, inplace=True)

# HoF
df_hof = pandas.read_csv(halloffame_file)
df_hof.fillna(0, inplace=True)
df_hof = df_hof.loc[(df_hof['inducted'] == 'Y') & (df_hof['category'] == 'Player')]
df_hof = df_hof['playerID']
hof_list = list(df_hof.unique())

#All-Star
df_allstar = pandas.read_csv(allstarfull_filie)
df_allstar.fillna(0, inplace=True)
# allstar_list = list(df_allstar.unique()) # list of all star players
df_allstar = df_allstar['playerID']
allstar_list = list(df_allstar.unique())

### Generate career statistics

###### Pitching Statistics

In [5]:
# Create new dataframe to hold career stats for pitchers
# A pitcher is anyone showing up in the 'df_pitching' dataframe, which was imported above
df_career_pitching = pandas.DataFrame()

if os.path.isfile('df_career_pitching.csv'):
    df_career_pitching = pandas.read_csv('df_career_pitching.csv')
    df_career_pitching.drop(['Unnamed: 0'], axis=1, errors='ignore', inplace=True)
    df_career_pitching.drop(['lgID'], axis=1, errors='ignore', inplace=True)
else:
    start_time = time.time()
    
    # Build data set for each pitcher in the pitching database
    count = 0
    for player in df_pitching['playerID'].unique():
        # Set variable to '1' if player is in the HOF, else set to '0'
        in_hof = 1 if player in hof_list else 0
    
        # Get all pitching statistics for this player and sum
        ds_pitch = df_pitching.loc[df_batting['playerID'] == player].sum()
        
        # Get the fielding statistics, only when they were a pitcher, for this player and sum
        ds_field = df_fielding.loc[(df_fielding['playerID'] == player) & (df_fielding['POS'] == 'P')].sum()
        
        # Determine the number of years played (should be the same in both sets, but who knows)
        pitch_yrs = ds_pitch['stint']
        field_yrs = ds_field['stint']
        max_yrs = max(pitch_yrs, field_yrs)
        
        # Get rid of the fields we don't want in both data sets
        ds_pitch.drop(['yearID', 'teamID', 'lgID'], inplace=True, errors='ignore')
        ds_field.drop(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS', 'WP'], inplace=True, errors='ignore')
        
        # Overwrite specific fields
        ds_pitch['playerID'] = player
        ds_pitch['stint'] = max_yrs
        ds_pitch['ERA'] = ds_pitch['ERA'] / pitch_yrs
        
        # Add 'HOF' field to data set
        ds_field['HOF'] = in_hof
        
        # Append fielding data to batting data and append to the dataframe
        df_career_pitching = df_career_pitching.append(ds_pitch.append(ds_field), ignore_index=True)
        count += 1
    
    # The order of the columns gets messed up, so fix at the end
    # Move 'stint' to the front, then 'playerID' so we'll get 'playerID' 'stint' ...
    # Move 'HOF' to the end of the list so we'll get ... 'HOF'
    df_career_pitching.drop(['Unnamed: 0'], axis=1, errors='ignore')
    col_name_lst = list(df_career_pitching.columns)
    col_name_lst.insert(0, col_name_lst.pop(col_name_lst.index('stint')))
    col_name_lst.insert(0, col_name_lst.pop(col_name_lst.index('playerID')))
    col_name_lst.append(col_name_lst.pop(col_name_lst.index('HOF')))
    df_career_pitching = df_career_pitching[col_name_lst]

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'It took {elapsed_time} seconds to process the pitching data. count({count})')
    
    # Save to file so we only need to generate once    
    df_career_pitching.to_csv('df_career_pitching.csv')

###### Fielding Statistics

In [6]:
# Create new dataframe to hold career stats for fielders
# A fielder is anyone showing up in the 'df_fielding' dataframe 
# AND NOT in the 'df_pitching' dataframe, both were imported above
df_career_fielder = pandas.DataFrame()

if os.path.isfile('df_career_fielder.csv'):
    df_career_fielder = pandas.read_csv('df_career_fielder.csv')
else:
    start_time = time.time()
    
    # Get a list of pitchers, these will be excluded from processing
    pitchers = list(df_pitching['playerID'].unique())
    
    # Build data set for each player in the fielding database
    count = 0
    for player in df_fielding['playerID'].unique():
        
        # If the player is a pitcher, exclude them from this dataset
        if player in pitchers:
            continue
        
        # Set variable to '1' if player is in the HOF, else set to '0'
        in_hof = 1 if player in hof_list else 0
        
        # Get all batting and fielding information for this player and sum
        ds_bat = df_batting.loc[df_batting['playerID'] == player].sum()
        ds_fld = df_fielding.loc[df_fielding['playerID'] == player].sum()
        
        # Determine the number of years played (should be the same in both sets, but who knows)
        bat_yrs = ds_bat['stint']
        fld_yrs = ds_fld['stint']
        max_yrs = max(bat_yrs, fld_yrs)
        
        # Get rid of the fields we don't want
        ds_bat.drop(['yearID', 'teamID', 'lgID', 'POS', 'G'], inplace=True, errors='ignore')
        ds_fld.drop(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS', 'SB', 'CS'], inplace=True, errors='ignore')
        
        # Overwrite specific fields
        ds_bat['playerID'] = player
        ds_bat['stint'] = max_yrs
        
        # Add 'HOF' field to data set
        ds_fld['HOF'] = in_hof
        
        # Append fielding data to batting data and append to the dataframe
        df_career_fielder = df_career_fielder.append(ds_bat.append(ds_fld), ignore_index=True)
        
        count += 1
    
    # The order of the columns gets messed up, so fix at the end
    # Move 'stint' to the front, then 'playerID' so we'll get 'playerID' 'stint' ...
    # Move 'HOF' to the end of the list so we'll get ... 'HOF'
    col_name_lst = list(df_career_fielder.columns)
    col_name_lst.insert(0, col_name_lst.pop(col_name_lst.index('stint')))
    col_name_lst.insert(0, col_name_lst.pop(col_name_lst.index('playerID')))
    col_name_lst.append(col_name_lst.pop(col_name_lst.index('HOF')))
    df_career_fielder = df_career_fielder[col_name_lst]
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'It took {elapsed_time} seconds to process the fielder data. count({count})')

    # Save to file so we only need to generate once    
    df_career_fielder.to_csv('df_career_fielder.csv')

In [7]:
# Create new dataframe to hold season stats for pitchers
# A pitcher is anyone showing up in the 'df_pitching' dataframe, which was imported above
df_season_pitching = pandas.DataFrame()

if os.path.isfile('df_season_pitching.csv'):
    df_season_pitching = pandas.read_csv('df_season_pitching.csv')
else:
    start_time = time.time()
    
    count = 0

    # Process all players in the pitching db
    for player in df_pitching['playerID'].unique():
        # Process each season for this player 
        for year in df_pitching.loc[(df_pitching['playerID'] == player)]['yearID'].unique():
            # Set variable to '1' if player is in the HOF, else set to '0'
            all_star = int(not df_all_star.loc[((df_all_star['playerID'] == player) & 
                                                (df_all_star['yearID'] == year))].empty)
        
            # Get all pitching statistics for this player and season
            df_pitch = df_pitching.loc[(df_pitching['playerID'] == player) & 
                                       (df_pitching['yearID'] == year)]

            # Get the fielding statistics for this player and season
            df_field = df_fielding.loc[(df_fielding['playerID'] == player) & 
                                       (df_fielding['yearID'] == year) &
                                       (df_fielding['POS'] == 'P')]
            
            # There may be more than one entry for a player + year (changed teams?)
            # Average all stats for each column.
            ds_pitch = df_pitch.mean()
            ds_field = df_field.mean()
        
            # Get rid of the fields we don't want in both data sets
            ds_pitch.drop(['stint', 'teamID', 'lgID'], inplace=True, errors='ignore')
            ds_field.drop(['playerID', 'yearID', 'stint', 'teamID', 'lgID', 'POS', 'G', 'GS', 'WP'], inplace=True, errors='ignore')

            # Add 'playerID' to pitch data set (it was lost during 'mean' step 
            # Add all-star' field to data set
            ds_pitch['playerID'] = player
            ds_field['all-star'] = all_star
            
            # Append fielding data to batting data and append to the dataframe
            df_season_pitching = df_season_pitching.append(ds_pitch.append(ds_field), ignore_index=True)
            
            count += 1
    
    # The order of the columns gets messed up, so fix at the end
    # Move 'yearID' to the front, then 'playerID' so we'll get 'playerID' 'yearID' ...
    # Move 'all-star' to the end of the list so we'll get ... 'all-star'
    #df_career_pitching.drop(['Unnamed: 0'], axis=1, errors='ignore')
    col_name_lst = list(df_season_pitching.columns)
    col_name_lst.insert(0, col_name_lst.pop(col_name_lst.index('yearID')))
    col_name_lst.insert(0, col_name_lst.pop(col_name_lst.index('playerID')))
    col_name_lst.append(col_name_lst.pop(col_name_lst.index('all-star')))
    df_season_pitching = df_season_pitching[col_name_lst]

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'It took {elapsed_time} seconds to process the pitching data. count({count})')
    
    # Save to file so we only need to generate once    
    df_season_pitching.to_csv('df_season_pitching.csv')

In [8]:
# Create new dataframe to hold season stats for pitchers
# A pitcher is anyone showing up in the 'df_pitching' dataframe, which was imported above
df_season_fielding = pandas.DataFrame()

if os.path.isfile('df_season_fielding.csv'):
    df_season_fielding = pandas.read_csv('df_season_fielding.csv')
else:
    start_time = time.time()
    
    count = 0

    # Get a list of pitchers, these will be excluded from processing
    pitchers = list(df_pitching['playerID'].unique())
    
    # Process all players in the fielding db
    for player in df_fielding['playerID'].unique():
        
        # If the player is a pitcher, exclude them from this dataset
        if player in pitchers:
            continue
        
        # Process each season for this player 
        for year in df_fielding.loc[(df_fielding['playerID'] == player)]['yearID'].unique():
            # Set variable to '1' if player is in the HOF, else set to '0'
            all_star = int(not df_all_star.loc[((df_all_star['playerID'] == player) & 
                                                (df_all_star['yearID'] == year))].empty)
        
            # Get all batting statistics for this player and season
            df_hit = df_batting.loc[(df_batting['playerID'] == player) & 
                                    (df_batting['yearID'] == year)]

            # Get the fielding statistics for this player and season
            df_field = df_fielding.loc[(df_fielding['playerID'] == player) & 
                                       (df_fielding['yearID'] == year)]
            
            # There may be more than one entry for a player + year (changed teams?)
            # Average all stats for each column.
            ds_hit = df_hit.mean()
            ds_field = df_field.mean()
        
            # Get rid of the fields we don't want in both data sets
            ds_hit.drop(['stint'], inplace=True, errors='ignore')
            ds_field.drop(['yearID', 'stint', 'G', 'CS', 'SB'], inplace=True, errors='ignore')

            # Add 'playerID' to pitch data set (it was lost during 'mean' step 
            # Add all-star' field to data set
            ds_hit['playerID'] = player
            ds_field['all-star'] = all_star
            
            # Append fielding data to batting data and append to the dataframe
            df_season_fielding = df_season_fielding.append(ds_hit.append(ds_field), ignore_index=True)
            
            count += 1
    
    # The order of the columns gets messed up, so fix at the end
    # Move 'yearID' to the front, then 'playerID' so we'll get 'playerID' 'yearID' ...
    # Move 'all-star' to the end of the list so we'll get ... 'all-star'
    col_name_lst = list(df_season_fielding.columns)
    col_name_lst.insert(0, col_name_lst.pop(col_name_lst.index('yearID')))
    col_name_lst.insert(0, col_name_lst.pop(col_name_lst.index('playerID')))
    col_name_lst.append(col_name_lst.pop(col_name_lst.index('all-star')))
    df_season_fielding = df_season_fielding[col_name_lst]

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'It took {elapsed_time} seconds to process the fielding data. count({count})')
    
    # Save to file so we only need to generate once    
    df_season_fielding.to_csv('df_season_fielding.csv')

#### Summary of career statistics

In [9]:
num_fielders = df_career_fielder.count()[0]
num_pitchers = df_career_pitching.count()[0]
num_players = num_fielders + num_pitchers
num_hof = df_hof.count()
pct_hof = (num_hof / num_players) * 100

hofers = set(df_hof.unique())
pitchers = set(df_career_pitching['playerID'].unique())
fielders = set(df_career_fielder['playerID'].unique())
num_hof_pitchers = len(pitchers.intersection(hofers))
pct_hof_pitchers = (num_hof_pitchers / num_pitchers) * 100
num_hof_fielders = len(fielders.intersection(hofers))
pct_hof_fielders = (num_hof_fielders / num_fielders) * 100

print('-----------------Summary--------------------------')
print(f'Total number of pitchers - {num_pitchers}')
print(f'Total number of fielders - {num_fielders}')
print(f'Total number of players - {num_players}')
print('--------------------------------------------------')
print(f'Number of pitchers in HOF ({num_hof_pitchers}) or  ({pct_hof_pitchers:0.3f})%')
print(f'Number of fielders in HOF ({num_hof_fielders}) or  ({pct_hof_fielders:0.3f})%')
print(f'Total in HOF ({num_hof}) or ({pct_hof:0.3f})%')

-----------------Summary--------------------------
Total number of pitchers - 9655
Total number of fielders - 9574
Total number of players - 19229
--------------------------------------------------
Number of pitchers in HOF (96) or  (0.994)%
Number of fielders in HOF (134) or  (1.400)%
Total in HOF (256) or (1.331)%


These statistics show how hard it is for a player to make the hall of fame.  This also means that the data set is exteremely unbalanced where there are more players who do NOT make the HOF than do.

#### Summary of single season statistics

In [14]:
num_seasons_played = df_season_fielding.count()[0]
num_seasons_all_star = df_allstar.count()
pct_seasons_all_star = (num_seasons_all_star / num_seasons_played) * 100
print('-----------------Summary--------------------------')
print(f'Total number of seasons played   - {num_seasons_played}')
print(f'Total number of all-star seasons - {num_seasons_all_star}')
print(f'Percent all-star seeasons        - {pct_seasons_all_star:0.3f}')

-----------------Summary--------------------------
Total number of seasons played   - 49390
Total number of all-star seasons - 5291
Percent all-star seeasons        - 10.713


## Manipulate data

In [15]:
# Slice dataframe.  Get all data except 'playerId' and 'HOF'
# NOTE: use 'fillna(0)' to replace 'nan' with 0
Xdata = df_career_pitching.loc[:,'stint':'ZR'].fillna(0)
print(f'Xdata type({type(Xdata)}) size({Xdata.shape})')

Xdata type(<class 'pandas.core.frame.DataFrame'>) size((9655, 35))


In [16]:
# Convert pandas dataframe to numpy array
Xtrain = Xdata.reset_index().values.astype(int)
print(f'Xtrain type({type(Xtrain)}) size({Xtrain.shape})')

Xtrain type(<class 'numpy.ndarray'>) size((9655, 36))


In [17]:
# Slice dataframe.  Get only 'HOF'
Tdata = df_career_pitching.loc[:,'HOF']
print(f'Tdata type({type(Tdata)}) size({Tdata.shape})')

Tdata type(<class 'pandas.core.series.Series'>) size((9655,))


In [18]:
Ttrain = Tdata.reset_index().values[:,1].astype(int)
print(f'Ttrain type({type(Ttrain)}) size({Ttrain.shape})')

Ttrain type(<class 'numpy.ndarray'>) size((9655,))


In [19]:
# This will be our 'classes'
# 0 = Not a HOF player, 1 = HOF player
np.unique(Ttrain)

array([0, 1])

### Create neural network

In [25]:
nnet = NeuralNetwork_Convolutional(n_channels_in_image=1, #Xtrain.shape[0],
                                   image_size=Xtrain.shape[0],
                                   n_units_in_conv_layers=[5],
                                   n_units_in_fc_hidden_layers=[2],
                                   classes=np.unique(Ttrain),
                                   kernels_size_and_stride=[[5, 2]],
                                   use_gpu=False)

### Train Neural network

In [26]:
nnet.train(Xtrain, Ttrain, 50, learning_rate=0.01)

RuntimeError: Expected 4-dimensional input for 4-dimensional weight 5 1, but got 2-dimensional input of size [9655, 36] instead