In [169]:
# Setup
# -*- coding: utf-8 -*-
import numpy as np
import scipy as sp
import pandas as p
from IPython.display import display
from sklearn import datasets, linear_model, preprocessing
from sklearn.metrics import mean_squared_error, r2_score

In [153]:
# STORE ALL REFORMATTED BATTING DATA AND PITCHING DATA W/ SALARIES IN CSV

# Seasons for which 'BattingSalary<year>.csv' / 'PitchingSalary<year>.csv'
# are read. The LAST entry is held out as the test season; every earlier
# entry goes into the training set.
years = ['2002', '2003', '2004', '2005', '2006',
         '2007', '2008', '2009', '2010', '2011',
         '2012', '2013', '2014', '2015', '2016']

# Per-season factor every salary is multiplied by; aligned index-for-index
# with `years`.
luxury_tax_mult = [1.7, 1.66, 1.62, 1.52, 1.43,
                   1.32, 1.26, 1.2, 1.15, 1.1,
                   1.1, 1.1, 1.03, 1.03, 1.03]

# Fail early and loudly instead of with an IndexError half-way through the
# loop (too few multipliers) or by silently ignoring extras (too many).
if len(luxury_tax_mult) != len(years):
    raise ValueError('luxury_tax_mult must have exactly one entry per year')
if len(years) < 2:
    raise ValueError('need at least one training year and one test year')


def _load_salary_frame(prefix, year, multiplier):
    """Read '<prefix><year>.csv' and return it with a scaled float 'Salary'.

    prefix     -- file name stem, e.g. 'BattingSalary' or 'PitchingSalary'
    year       -- season as a string, appended to the stem
    multiplier -- factor applied to every value of the 'Salary' column

    The sort-marker header 'Salary▼' is renamed to 'Salary', the dollar sign
    is stripped from the values, and the column is converted to float before
    scaling. All other columns are returned as read.
    """
    frame = p.read_csv(prefix + year + '.csv', engine='python')
    frame.columns = frame.columns.str.replace('Salary▼', 'Salary')
    # Strip the dollar sign so the column can be parsed as float.
    frame['Salary'] = frame['Salary'].str.replace('$', '').astype(float)
    frame['Salary'] *= multiplier
    return frame


# Collect one frame per season and concatenate once afterwards, rather than
# re-copying the growing training frame on every iteration.
batting_frames = []
pitching_frames = []

for idx, year in enumerate(years):
    lux = luxury_tax_mult[idx]
    batting_df = _load_salary_frame('BattingSalary', year, lux)
    pitching_df = _load_salary_frame('PitchingSalary', year, lux)
    batting_frames.append(batting_df)
    pitching_frames.append(pitching_df)

# Every season but the last is training data (original row indices are
# kept, exactly as repeated pairwise concatenation would leave them).
batting_train = p.concat(batting_frames[:-1])
pitching_train = p.concat(pitching_frames[:-1])

# The final season is the held-out test set.
batting_test = batting_frames[-1]
pitching_test = pitching_frames[-1]


# Remove columns by POSITION, in place: the first two columns of every
# frame, plus the second-to-last column of the batting frames and the last
# column of the pitching frames.
for frame in (batting_train, batting_test):
    frame.drop(frame.columns[[0, 1, -2]], axis=1, inplace=True)
for frame in (pitching_train, pitching_test):
    frame.drop(frame.columns[[0, 1, -1]], axis=1, inplace=True)

# Replace every missing value with 0 (fillna returns new frames, so the
# names are rebound).
batting_train, batting_test = batting_train.fillna(0), batting_test.fillna(0)
pitching_train, pitching_test = pitching_train.fillna(0), pitching_test.fillna(0)

# Persist the cleaned frames without the row index.
for frame, csv_name in ((batting_train, 'AllBattingData.csv'),
                        (batting_test, 'BattingTest.csv'),
                        (pitching_train, 'AllPitchingData.csv'),
                        (pitching_test, 'PitchingTest.csv')):
    frame.to_csv(csv_name, index=False)

# Drop the column at position 1 from all four frames, in place. Per the
# original note this is the team column; it is removed only after the CSV
# export, so the files written earlier still contain it.
for frame in (batting_train, batting_test, pitching_train, pitching_test):
    frame.drop(frame.columns[[1]], axis=1, inplace=True)

# Show the held-out batting and pitching frames.
display(batting_test, pitching_test)

Unnamed: 0,Age,G,PA,Rbat,Rbaser,Rdp,Rfield,Rpos,RAA,WAA,Rrep,RAR,WAR,waaWL%,162WL%,oWAR,dWAR,oRAR,Salary,Pos Summary
0,28,23,50.0,-4,-1,0,0,7,3,0.3,0,3,0.3,0.514,0.502,0.3,0.0,3,35608571.87,1
1,32,27,60.0,-6,0,1,0,9,3,0.3,0,3,0.3,0.511,0.502,0.3,0.0,3,35020000.00,1
2,30,35,11.0,-3,0,0,0,2,-1,-0.1,0,-1,-0.1,0.498,0.499,-0.1,0.0,-1,30900000.00,1
3,33,158,679.0,44,-1,-5,-6,-9,23,2.5,24,47,4.9,0.515,0.515,5.5,-1.5,53,28840000.00,*3/D5
4,33,34,5.0,-1,0,0,0,1,0,0.0,0,0,0.0,0.501,0.500,0.0,0.0,0,28840000.00,1
5,30,132,543.0,21,-1,-2,-3,-2,13,1.2,18,31,2.9,0.510,0.508,3.2,-0.6,34,28325000.00,*78/D
6,30,25,8.0,-2,0,0,0,1,-1,-0.1,0,-1,-0.1,0.496,0.499,-0.1,0.0,-1,26632710.00,1
7,36,112,362.0,-8,-2,0,-9,-5,-25,-2.6,12,-13,-1.5,0.477,0.484,-0.6,-1.6,-4,25750000.00,3/D
8,32,33,75.0,-10,0,-1,0,11,-1,-0.1,0,-1,-0.1,0.499,0.500,-0.1,0.0,-1,25750000.00,1
9,36,152,650.0,11,-1,-3,-4,-13,-10,-0.8,23,14,1.4,0.494,0.494,1.8,-1.8,18,25750000.00,*D3


Unnamed: 0,Age,IP,G,GS,R,RA9,RA9opp,RA9def,RA9role,PPFp,RA9avg,RAA,WAA,gmLI,WAAadj,WAR,RAR,waaWL%,162WL%,Salary
0,28,149.0,21,21,31,1.87,4.42,0.19,0.19,92.5,4.09,38,4.4,0.00,-0.1,5.6,50,0.709,0.527,35608571.87
1,32,158.2,26,26,80,4.54,4.50,-0.12,0.19,104.3,5.02,9,1.0,0.00,-0.1,2.3,23,0.538,0.506,35020000.00
2,30,230.0,35,35,106,4.15,4.40,0.32,0.18,107.9,4.60,9,1.0,0.00,-0.1,3.1,34,0.527,0.506,30900000.00
3,33,227.2,34,34,81,3.20,4.49,-0.30,0.18,98.3,4.89,40,4.5,0.00,-0.1,6.6,65,0.633,0.528,28840000.00
4,30,153.1,25,25,76,4.46,4.67,-0.09,0.19,94.0,4.66,2,0.2,0.00,-0.1,1.6,19,0.507,0.501,26632710.00
5,32,202.2,32,32,57,2.53,4.51,0.51,0.19,94.0,3.94,33,3.7,0.00,-0.2,5.3,50,0.615,0.523,25750000.00
6,35,179.2,30,30,83,4.16,4.53,0.00,0.18,104.2,4.91,13,1.4,0.00,-0.1,3.0,33,0.546,0.509,25750000.00
7,32,200.2,32,32,83,3.72,4.64,0.01,0.19,106.0,5.10,28,3.1,0.00,-0.1,5.0,51,0.598,0.519,24205000.00
8,31,228.1,34,34,77,3.04,4.35,-0.04,0.18,98.7,4.51,39,4.4,0.00,-0.2,6.2,58,0.628,0.527,22807290.00
9,27,199.2,31,31,75,3.38,4.61,0.00,0.19,102.9,4.94,32,3.6,0.00,-0.1,5.4,54,0.617,0.522,22660000.00


In [154]:
# SPLIT EACH FRAME INTO FEATURES (x) AND TARGET (y)
# The target is always the 'Salary' column, selected with a list so it stays
# a single-column DataFrame. Batting features additionally exclude
# 'Pos Summary'; pitching features exclude only 'Salary'.

# Batting: training and testing
batting_train_y = batting_train[['Salary']]
batting_train_x = batting_train.drop(['Salary', 'Pos Summary'], axis=1)
batting_test_y = batting_test[['Salary']]
batting_test_x = batting_test.drop(['Salary', 'Pos Summary'], axis=1)

# Pitching: training and testing
pitching_train_y = pitching_train[['Salary']]
pitching_train_x = pitching_train.drop(['Salary'], axis=1)
pitching_test_y = pitching_test[['Salary']]
pitching_test_x = pitching_test.drop(['Salary'], axis=1)

In [173]:
# PREDICT SALARIES BASED OFF OF STATISTICS
# Fit one ordinary least-squares model per player type on the training
# features/targets and print the fitted coefficients. The targets are
# single-column frames, so coef_ prints as a one-row 2-D array.
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the statement form `print x` is a syntax error on Python 3.
batting_regr = linear_model.LinearRegression()
batting_regr.fit(batting_train_x.values, batting_train_y.values)
print(batting_regr.coef_)

pitching_regr = linear_model.LinearRegression()
pitching_regr.fit(pitching_train_x.values, pitching_train_y.values)
print(pitching_regr.coef_)

[[  4.91234214e+05  -3.16364553e+04   2.45166292e+03  -6.58029171e+04
   -1.32215294e+05  -3.74794315e+05   1.85134646e+05   3.26134324e+05
    4.06793599e+04   7.54945109e+04   6.50919001e+04   2.18587497e+04
    3.07561898e+06   1.99481888e+07  -1.28252813e+08  -3.46655899e+06
   -5.27304212e+06   2.08496681e+05]]
[[  485303.63441748    34932.22529745   -30515.61838551  -116100.79849665
    -15512.18875736    11365.99348654    47237.0200478   -570881.7105203
   5763625.77851057    12749.83681698  -406878.06989382  -252141.29316355
   1696508.24624635   191047.53425102  1281581.27654777   896748.5840009
    -11995.4106465   4698101.29581374 -3285479.81867339]]
