In [293]:
# Setup
# -*- coding: utf-8 -*-
import numpy as np
import scipy as sp
import pandas as p
from IPython.display import display
from sklearn import datasets, linear_model, preprocessing
from sklearn.metrics import mean_squared_error, r2_score

In [306]:
# STORE ALL REFORMATTED BATTING DATA AND PITHING DATA W/ SALARIES IN CSV

years = ['2002', '2003', '2004', '2005', '2006', 
         '2007', '2008', '2009', '2010', '2011', 
         '2012', '2013', '2014', '2015', '2016', '2017']

teams = ['ARI','ATL','BAL','BOS','CHC','CHW','CIN','CLE','COL','DET','HOU','KCR','LAA','LAD','MIA','MIL','MIN','NYM','NYY','OAK','PHI','PIT','SDP','SEA','SFG','STL','TBR','TEX','TOR','WSN']

luxury_tax_mult = [1.7, 1.66, 1.62, 1.52, 1.43, 
                   1.32,1.26, 1.2, 1.15, 1.1, 
                   1.1, 1.1, 1.03,1.03, 1.03, 1]

batting_train = p.DataFrame()
pitching_train = p.DataFrame() 
batting_test = p.DataFrame()
pitching_test = p.DataFrame()

for idx, year in enumerate(years):
    # Batting salary data for a year -->
    batting_df = p.read_csv('BattingSalary'+year+'.csv', 
                             engine='python')
    pitching_df = p.read_csv('PitchingSalary'+year+'.csv', 
                             engine='python') 
    
    batting_df['Tm'] = batting_df['Tm'].str.replace('ANA', 'LAA')
    batting_df['Tm'] = batting_df['Tm'].str.replace('MON', 'WSN')
    batting_df['Tm'] = batting_df['Tm'].str.replace('TBD', 'TBR')
    batting_df['Tm'] = batting_df['Tm'].str.replace('FLA', 'MIA')

    pitching_df['Tm'] = pitching_df['Tm'].str.replace('ANA', 'LAA')
    pitching_df['Tm'] = pitching_df['Tm'].str.replace('MON', 'WSN')
    pitching_df['Tm'] = pitching_df['Tm'].str.replace('TBD', 'TBR')
    pitching_df['Tm'] = pitching_df['Tm'].str.replace('FLA', 'MIA')    
    
    batting_df.columns = batting_df.columns.str.replace('Salary▼', 'Salary')
    pitching_df.columns = pitching_df.columns.str.replace('Salary▼', 'Salary')
    
    batting_df['Salary'] = batting_df['Salary'].str.replace('$','')
    pitching_df['Salary'] = pitching_df['Salary'].str.replace('$','')
 
    batting_df['Salary'] = batting_df['Salary'].astype(float)
    pitching_df['Salary'] = pitching_df['Salary'].astype(float)
    
    lux = luxury_tax_mult[idx]
    batting_df['Salary'] *= lux
    pitching_df['Salary'] *= lux
    
    if year == years[0]:
        batting_train = batting_df
        pitching_train = pitching_df
    elif year == years[-1]:
        batting_test = batting_df
        pitching_test = pitching_df
    else:
        batting_train = p.concat([batting_train, batting_df])
        pitching_train = p.concat([pitching_train, pitching_df])

    batting_df.to_csv('BattingStats'+year+'.csv', index=False)
    pitching_df.to_csv('PitchingStats'+year+'.csv', index=False)
        

batting_train.drop(batting_train.columns[[0, 1, -2]], axis=1, inplace=True)
batting_train = batting_train.fillna(0)
batting_test.drop(batting_test.columns[[0, 1, -2]], axis=1, inplace=True)
batting_test = batting_test.fillna(0)

pitching_train.drop(pitching_train.columns[[0,1,-1]], axis=1, inplace=True)
pitching_train = pitching_train.fillna(0)
pitching_test.drop(pitching_test.columns[[0,1,-1]], axis=1, inplace=True)
pitching_test = pitching_test.fillna(0)

batting_train.to_csv('AllBattingData.csv', index=False)
batting_test.to_csv('BattingTest.csv', index=False)

pitching_train.to_csv('AllPitchingData.csv', index=False)
pitching_test.to_csv('PitchingTest.csv', index=False)

# Now remove team for the training data
batting_train.drop(batting_train.columns[[1]], axis=1, inplace=True)
batting_test.drop(batting_test.columns[[1]], axis=1, inplace=True)

pitching_train.drop(pitching_train.columns[[1]], axis=1, inplace=True)
pitching_test.drop(pitching_test.columns[[1]], axis=1, inplace=True)


In [307]:
# FORMAT TRAINING DATA FOR BATTING AND PITCHING
batting_train_y = batting_train[['Salary']]
batting_train_x = batting_train.drop(['Salary'], axis=1)
batting_train_x = batting_train_x.drop(['Pos Summary'], axis=1)

pitching_train_y = pitching_train[['Salary']]
pitching_train_x = pitching_train.drop(['Salary'], axis=1) 

# FORMAT TESTING DATA FOR BATTING AND PITCHING
batting_test_y = batting_test[['Salary']]
batting_test_x = batting_test.drop(['Salary'], axis=1)
batting_test_x = batting_test_x.drop(['Pos Summary'], axis=1)

pitching_test_y = pitching_test[['Salary']]
pitching_test_x = pitching_test.drop(['Salary'], axis=1) 

In [308]:
# PREDICT SALARIES BASED OFF OF STATISTICS
batting_regr = linear_model.LinearRegression()
batting_regr.fit(batting_train_x.values, batting_train_y.values)
print batting_regr.coef_

pitching_regr = linear_model.LinearRegression()
pitching_regr.fit(pitching_train_x.values, pitching_train_y.values)
print pitching_regr.coef_

[[  5.01575479e+05  -3.14817084e+04   2.32672328e+03  -6.78893802e+04
   -1.27950676e+05  -4.01916135e+05   2.58492211e+05   3.32286666e+05
    4.39021951e+03  -3.71932228e+05  -4.51857203e+03   7.77971096e+03
    3.48110457e+06   1.90999031e+07  -1.24224317e+08  -3.05492121e+06
   -5.44114494e+06   2.17654606e+05]]
[[  4.93022796e+05   3.41048708e+04  -2.82310191e+04  -1.08359015e+05
   -1.84470501e+04   1.27855710e+04  -1.42158231e+05  -2.60738937e+05
    5.66345547e+06   5.49548846e+03  -2.47686746e+05  -2.93756548e+05
    1.97579936e+06   1.52169166e+05   1.38297749e+06   8.32809213e+05
    4.82804644e+03   4.40810966e+06  -1.23872588e+06]]


In [336]:
full_twenty = np.zeros((1, 15))

for year in years: 
    year_batting_df = p.read_csv('BattingStats'+year+'.csv', engine='python')
    year_pitching_df = p.read_csv('PitchingStats'+year+'.csv', engine='python')
    year_batting_df.drop(year_batting_df.columns[[0, 1, -2]], axis=1, inplace=True)
    year_batting_df = year_batting_df.fillna(0)
    year_pitching_df.drop(year_pitching_df.columns[[0,1,-1]], axis=1, inplace=True)
    year_pitching_df = year_pitching_df.fillna(0)

    if year == '2017':
        test_success_x = np.zeros((1, 16))
    
    for team in teams:
        print(team, year)
        teamB = year_batting_df[(year_batting_df['Tm'].str.startswith(team))]
        teamB = teamB.sort_values(by='WAR', ascending=0)
        teamB = teamB.head(n=10)
        teamB.drop(teamB.columns[[1,-2, -1]], axis=1, inplace=True)
        B_predicted = batting_regr.predict(teamB.values)   
        B_predicted = p.DataFrame(B_predicted) 
    
        teamP = year_pitching_df[(year_pitching_df['Tm'].str.startswith(team))]
        teamP = teamP.sort_values(by='G', ascending=0)
        teamP = teamP.head(n=6)
        teamP.drop(teamP.columns[[1, -1]], axis=1, inplace=True)
        P_predicted = pitching_regr.predict(teamP.values)
        P_predicted = p.DataFrame(P_predicted)
         
        B_predicted = p.concat([B_predicted, P_predicted])
        team_Twenty_Predicted = B_predicted.T
        to_append = team_Twenty_Predicted.values
        
        if year == '2017':
            test_success_x = np.vstack([test_success_x, to_append])             
        else:
            full_twenty = np.vstack([full_twenty, to_append]) 
full_twenty = full_twenty[1:]
test_success_x = test_success_x[1:]

TypeError: cannot concatenate 'str' and 'list' objects

In [334]:
teamWins = p.read_csv('teamWins.csv')
teamWins.drop(teamWins.columns[0], axis=1, inplace=True)
years = teamWins.values.tolist()
success_y_orig = [item for sublist in years for item in sublist]
success_y = success_y_orig[30:]
print(success_y)

[69, 68, 89, 93, 103, 78, 68, 94, 75, 86, 84, 81, 74, 91, 79, 73, 59, 87, 84, 69, 71, 78, 68, 87, 86, 86, 68, 95, 89, 95, 79, 67, 81, 78, 97, 76, 64, 81, 68, 74, 86, 95, 85, 92, 71, 68, 83, 90, 87, 68, 63, 98, 74, 84, 76, 100, 80, 88, 93, 83, 64, 79, 96, 71, 73, 73, 76, 85, 66, 90, 70, 89, 98, 94, 77, 82, 70, 79, 84, 88, 73, 88, 77, 88, 87, 90, 77, 67, 83, 96, 81, 96, 85, 97, 66, 63, 90, 92, 74, 93, 51, 86, 78, 92, 62, 74, 66, 74, 85, 96, 73, 94, 76, 76, 71, 97, 92, 91, 74, 86, 81, 94, 93, 69, 61, 85, 97, 68, 64, 88, 55, 72, 89, 86, 69, 83, 66, 74, 95, 94, 81, 79, 76, 94, 75, 88, 90, 93, 73, 98, 94, 89, 69, 90, 71, 79, 79, 80, 73, 95, 56, 71, 86, 82, 72, 96, 63, 77, 97, 74, 102, 72, 71, 86, 67, 90, 91, 96, 81, 80, 65, 91, 66, 89, 75, 88, 91, 69, 83, 81, 76, 67, 80, 80, 80, 77, 94, 79, 95, 81, 97, 57, 90, 92, 61, 86, 96, 90, 85, 69, 70, 86, 64, 95, 83, 79, 78, 65, 92, 86, 74, 65, 97, 95, 87, 80, 87, 70, 103, 75, 93, 62, 75, 88, 85, 91, 84, 87, 75, 59, 82, 72, 68, 95, 97, 89, 74, 81, 74,

In [335]:
# PREDICT WINS
wins_regr = linear_model.LinearRegression()
success_y = np.asarray(success_y)
wins_regr.fit(full_twenty, success_y)
print wins_regr.coef_

test_success_y = np.asarray(success_y_orig[:30])

actual = test_success_y.T
predicted = wins_regr.predict(test_success_x).T

print(len(actual))
print(len(predicted))

sidebyside = np.vstack((actual, predicted))

display(p.DataFrame(sidebyside))
print sidebyside



[  2.75407175e-07  -1.04452242e-07   1.93624398e-07   7.17763126e-07
   3.06021444e-07   1.20746708e-07   7.33584216e-08   2.40036921e-08
  -1.06792625e-07  -6.09257581e-08   2.93120180e-07  -3.47337839e-07
   7.59771595e-08  -1.04645504e-07   2.92632049e-08]
30
30


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,93.0,72.0,75.0,93.0,92.0,67.0,68.0,102.0,87.0,64.0,...,66.0,75.0,71.0,64.0,78.0,83.0,80.0,78.0,76.0,97.0
1,79.324281,82.794651,80.121291,77.095299,73.786841,75.615988,79.237007,86.968063,79.094717,79.868357,...,79.587518,78.100546,75.323501,82.164653,79.275897,78.677533,81.783138,79.653466,81.203338,85.19942


[[  93.           72.           75.           93.           92.           67.
    68.          102.           87.           64.          101.           80.
    80.          104.           77.           86.           85.           70.
    91.           75.           66.           75.           71.           64.
    78.           83.           80.           78.           76.           97.        ]
 [  79.32428051   82.79465062   80.12129068   77.09529874   73.78684061
    75.61598764   79.23700742   86.96806272   79.09471714   79.86835696
    85.62821421   76.28872371   77.75502497   78.78117782   78.21496958
    79.71685577   76.92121889   79.57624254   81.45891085   79.4862336
    79.58751815   78.1005461    75.32350059   82.16465296   79.27589693
    78.67753299   81.78313788   79.65346599   81.20333827   85.19942012]]
