# The Economics of Moneyball : The Moneyball thesis is simple: Using statistical analysis, small-market teams can compete by buying assets that are undervalued by other teams and selling ones that are overvalued by other teams.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
batting = pd.read_csv('Batting.csv')

In [None]:
batting.head()

In [None]:
batting.columns

In [None]:
batting.info()

In [None]:
# Performing feature engineering

In [None]:
# Batting average (BA): hits divided by at bats, i.e. how often an at bat
# ends in a hit (walks and hit-by-pitch are not counted here; see OBP).
batting['BA'] = batting['H'].div(batting['AB'])

In [None]:
# On-base percentage (OBP): how frequently a batter reaches base.
# Numerator   = times on base: hits + walks + times hit by pitch.
# Denominator = at bats + walks + times hit by pitch + sacrifice flies.
batting['OBP'] = (
    batting['H'].add(batting['BB']).add(batting['HBP'])
    / batting['AB'].add(batting['BB']).add(batting['HBP']).add(batting['SF'])
)


In [None]:
# Singles (1B) are the hits that were not doubles, triples or home runs:
#   1B = H - 2B - 3B - HR
batting['1B'] = (
    batting['H']
    .sub(batting['2B'])
    .sub(batting['3B'])
    .sub(batting['HR'])
)

In [None]:
# Slugging percentage (SLG): a measure of a hitter's batting productivity,
# computed as total bases divided by at bats.
# Unlike batting average, it weights extra-base hits more heavily than
# singles: a single counts 1 base, a double 2, a triple 3 and a home run 4.
batting['SLG'] = (
    batting['1B']
    + 2 * batting['2B']
    + 3 * batting['3B']
    + 4 * batting['HR']
) / batting['AB']

In [None]:
sal = pd.read_csv('Salaries.csv')

In [None]:
sal.head()

In [None]:
batting = batting[batting['yearID'] >= 1985] # since I have the salary data starting from 1985

In [None]:
combo = batting.merge(sal, on=('playerID','yearID'))

In [None]:
combo.columns

In [None]:
# combo[combo['playerID']=='giambja01']
lost_players = combo[combo['playerID'].isin(['giambja01','damonjo01','saenzol01'])]

# Lost players: First baseman 2000 AL MVP Jason Giambi (giambja01) to the New York Yankees, 
# outfielder Johnny Damon (damonjo01) to the Boston Red Sox and 
# infielder Rainer Gustavo "Ray" Olmedo ('saenzol01').

lost_players.columns

In [None]:
type(lost_players)

In [None]:
# Mean salary of each lost player over all of that player's rows.
# - `axis=0` is omitted: it is the default, and the `axis` keyword of
#   groupby is deprecated in recent pandas.
# - The aggregation is named by the string 'mean': passing np.mean is
#   deprecated and emits a FutureWarning; the resulting column is still
#   called 'mean'.
grouped = lost_players.groupby(['playerID'])
grouped['salary'].agg(['mean'])

In [None]:
lost_players = lost_players[lost_players['yearID'] == 2001] 
# extracting from the year they lost the key players

In [None]:
lost_players = lost_players[['playerID','H','2B','3B','HR','OBP','SLG','BA','AB']]

In [None]:
avail_players = combo[combo['yearID']== 2001] # to match the data with the available ones

In [None]:
avail_players.columns

In [None]:
plt.scatter(avail_players['OBP'],avail_players['salary'],c='r')

In [None]:
avail_players = avail_players[(avail_players['salary']<6500000) & (avail_players['OBP']> 0)]

# The total AB of the lost players is about 1450, 
# meaning should probably cut off avail.players at 480 AB.
# eyeballing some particular value for the lost player salary considering the budget

# removing the outliers for OBP

In [None]:
plt.scatter(avail_players['OBP'],avail_players['salary'],c='r')

In [None]:
avail_players = avail_players[avail_players['AB'] >= 480]

In [None]:
possible_players = avail_players.sort_values(by=['OBP'],ascending=False)

In [None]:
possible_players = possible_players[['playerID','OBP','AB','salary']]

In [None]:
possible_players.head(10)

In [None]:
# Final picks: berkmla01, gonzalu01 and pujolal01.