In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [4]:
df = pd.read_csv('tennis_stats.csv')
print(df.head())

                 Player  Year  FirstServe  FirstServePointsWon  \
0           Pedro Sousa  2016        0.88                 0.50   
1       Roman Safiullin  2017        0.84                 0.62   
2           Pedro Sousa  2017        0.83                 0.60   
3   Rogerio Dutra Silva  2010        0.83                 0.64   
4  Daniel Gimeno-Traver  2017        0.81                 0.54   

   FirstServeReturnPointsWon  SecondServePointsWon  \
0                       0.38                  0.50   
1                       0.26                  0.33   
2                       0.28                  0.53   
3                       0.34                  0.59   
4                       0.00                  0.33   

   SecondServeReturnPointsWon  Aces  BreakPointsConverted  BreakPointsFaced  \
0                        0.39     0                  0.14                 7   
1                        0.07     7                  0.00                 7   
2                        0.44     2      

In [5]:
print(df.columns)

Index(['Player', 'Year', 'FirstServe', 'FirstServePointsWon',
       'FirstServeReturnPointsWon', 'SecondServePointsWon',
       'SecondServeReturnPointsWon', 'Aces', 'BreakPointsConverted',
       'BreakPointsFaced', 'BreakPointsOpportunities', 'BreakPointsSaved',
       'DoubleFaults', 'ReturnGamesPlayed', 'ReturnGamesWon',
       'ReturnPointsWon', 'ServiceGamesPlayed', 'ServiceGamesWon',
       'TotalPointsWon', 'TotalServicePointsWon', 'Wins', 'Losses', 'Winnings',
       'Ranking'],
      dtype='object')


In [6]:
x = df[['FirstServe', 'FirstServePointsWon',
       'FirstServeReturnPointsWon', 'SecondServePointsWon',
       'SecondServeReturnPointsWon', 'Aces', 'BreakPointsConverted',
       'BreakPointsFaced', 'BreakPointsOpportunities', 'BreakPointsSaved',
       'DoubleFaults', 'ReturnGamesPlayed', 'ReturnGamesWon',
       'ReturnPointsWon', 'ServiceGamesPlayed', 'ServiceGamesWon',
       'TotalPointsWon', 'TotalServicePointsWon', 'Wins', 'Losses', 'Winnings']]
y = df[['Ranking']]
x_train, x_test, y_train, y_test = train_test_split(x , y, train_size=.8, test_size=.2, random_state=6)
multreg = LinearRegression()
multreg = multreg.fit(x_train,y_train)
y_predict = multreg.predict(x_test)
print(multreg.coef_)

[[-1.58252170e+02  4.29926074e+01 -6.21722996e+02 -1.20455565e+02
  -4.46410342e+02 -7.41580806e-02  3.09230558e+01  3.58955149e-01
  -7.81011129e-01 -4.09179079e+01  2.70926282e-01  1.04209364e+00
  -1.68409242e+02  6.17746080e+02 -1.07438370e+00  4.34914785e+02
  -1.68414616e+02 -9.67401699e+02  1.28033887e+01 -8.27534389e-01
  -5.69796221e-04]]


In [7]:
# So based on the R**2 values listed above, it looks like most of these variables have a significant effect on a player's rank. What's most surprising from an initial glance at the data is that the variable with the greatest effect is winnings, or the total monetary value received by the player for playing tennis. The fact that this number is negative means that the greater the winnings, the lower the rank; a correlation which makes sense given that the players who earn the most are likely to be the best. It also helps explain why winnings has such a greater effect than the other variables because the higher ranked players are playing in higher stakes matches against players of comparable rank. 

In [28]:
# I want to try to predict winnings based on two variables and see how accurate the predictsions are. First I want to see which variables have the greatest effect on winnings. The greatest impact on winnings comes from TotalServicePointsWon which is the % of points in a game won where the player won. In other words, what precentaage of their points came from their serve. Only knowing a little about modern tennis, it seems that dominating with your serve is imperative to success. This appears to be corroborated by the fact that the second most significant variable is ReturnGamesWon which is the number of games where the player won when receiving the serve (i.e., when the broke their opponent's serve)

x_1 = df[['FirstServe', 'FirstServePointsWon',
       'FirstServeReturnPointsWon', 'SecondServePointsWon',
       'SecondServeReturnPointsWon', 'Aces', 'BreakPointsConverted',
       'BreakPointsFaced', 'BreakPointsOpportunities', 'BreakPointsSaved',
       'DoubleFaults', 'ReturnGamesPlayed', 'ReturnGamesWon',
       'ReturnPointsWon', 'ServiceGamesPlayed', 'ServiceGamesWon',
       'TotalPointsWon', 'TotalServicePointsWon', 'Wins', 'Losses',
       'Ranking']]
y_1 = df[['Winnings']]
x_1_train, x_1_test, y_1_train, y_1_test = train_test_split(x_1,y_1,train_size=.8,test_size=.2,random_state=6)
multreg_1 = LinearRegression()
multreg_1 = multreg_1.fit(x_1_train,y_1_train)
print(multreg_1.coef_)

[[ 3.96261522e+04  1.00232351e+05 -3.94951089e+04  1.01145977e+05
   1.05047645e+04 -9.98320559e+01  7.34258497e+03 -7.39456088e+02
   2.54245708e+02  2.69108942e+04  7.16991048e+02 -7.01615372e+02
   9.65740775e+03  2.12481349e+04  1.18208052e+03  3.92275907e+04
  -1.92441983e+05 -1.91551878e+05  1.11849965e+04  4.99441841e+03
  -7.10037361e+01]]


In [29]:
def coef_func(columns):
    for i in columns:
        x_func = df[[i]]
        y_func = df[['Winnings']]
        x_func_train, x_func_test, y_func_train, y_func_test = train_test_split(x_func,y_func,train_size=.8,test_size=.2,random_state=6)
        reg_func = LinearRegression()
        reg_func = reg_func.fit(x_func_train,y_func_train)
        print("The coef of " + i + " is " + str(reg_func.coef_))


In [52]:
coef_func(['FirstServe', 'FirstServePointsWon',
       'FirstServeReturnPointsWon', 'SecondServePointsWon',
       'SecondServeReturnPointsWon', 'Aces', 'BreakPointsConverted',
       'BreakPointsFaced', 'BreakPointsOpportunities', 'BreakPointsSaved',
       'DoubleFaults', 'ReturnGamesWon',
       'ReturnPointsWon', 'ServiceGamesWon',
       'TotalPointsWon', 'TotalServicePointsWon'])

The coef of FirstServe is [[300124.51480496]]
The coef of FirstServePointsWon is [[1240060.00863982]]
The coef of FirstServeReturnPointsWon is [[1023646.00731376]]
The coef of SecondServePointsWon is [[1066841.85228712]]
The coef of SecondServeReturnPointsWon is [[765299.8071049]]
The coef of Aces is [[1499.56663682]]
The coef of BreakPointsConverted is [[114416.78602206]]
The coef of BreakPointsFaced is [[1847.64746875]]
The coef of BreakPointsOpportunities is [[1845.33857889]]
The coef of BreakPointsSaved is [[532035.76554414]]
The coef of DoubleFaults is [[3824.83605371]]
The coef of ReturnGamesWon is [[851966.1123691]]
The coef of ReturnPointsWon is [[1423979.78060302]]
The coef of ServiceGamesWon is [[809670.6844831]]
The coef of TotalPointsWon is [[3269103.24001372]]
The coef of TotalServicePointsWon is [[1764069.57792217]]


In [48]:
def r_squared_val(columns):
    for i in columns:
        x_func = df[[i]]
        y_func = df[['Winnings']]
        x_func_train, x_func_test, y_func_train, y_func_test = train_test_split(x_func,y_func,train_size=.8,test_size=.2,random_state=6)
        reg_func = LinearRegression()
        reg_func = reg_func.fit(x_func_train,y_func_train)
        y_func_predict = reg_func.predict(x_func_test)
        print("The R**2 value of " + i + " is " + str(reg_func.score(x_func_test,y_func_test)))

In [51]:
r_squared_val(['FirstServe', 'FirstServePointsWon',
       'FirstServeReturnPointsWon', 'SecondServePointsWon',
       'SecondServeReturnPointsWon', 'Aces', 'BreakPointsConverted',
       'BreakPointsFaced', 'BreakPointsOpportunities', 'BreakPointsSaved',
       'DoubleFaults', 'ReturnGamesWon',
       'ReturnPointsWon', 'ServiceGamesWon',
       'TotalPointsWon', 'TotalServicePointsWon'])

The R**2 value of FirstServe is -0.008869749190366205
The R**2 value of FirstServePointsWon is 0.14961337962986254
The R**2 value of FirstServeReturnPointsWon is 0.06258032988574369
The R**2 value of SecondServePointsWon is 0.06272054166581154
The R**2 value of SecondServeReturnPointsWon is 0.01747732136243596
The R**2 value of Aces is 0.5816068520493393
The R**2 value of BreakPointsConverted is -0.002900477039500249
The R**2 value of BreakPointsFaced is 0.7668989125296909
The R**2 value of BreakPointsOpportunities is 0.8081205523550062
The R**2 value of BreakPointsSaved is 0.06864886879694732
The R**2 value of DoubleFaults is 0.691094250942484
The R**2 value of ReturnGamesWon is 0.0739380210850199
The R**2 value of ReturnPointsWon is 0.05776914333371064
The R**2 value of ServiceGamesWon is 0.15667104197367154
The R**2 value of TotalPointsWon is 0.16681809408268922
The R**2 value of TotalServicePointsWon is 0.1637468240447537


In [33]:
# I want to visualize these three varialbes (TotalServicePointsWon and ReturnGamesWon plotted against Winnings).
x_2 = df[['TotalServicePointsWon','ReturnPointsWon']]
y_2 = df[['Winnings']]
x_2_train,x_2_test,y_2_train,y_2_test = train_test_split(x_2,y_2,train_size=.8,test_size=.2,random_state=6)
multreg_2 = LinearRegression()
multreg_2 = multreg_2.fit(x_2_train,y_2_train)
print(multreg_2.coef_)
y_2_predict = multreg_2.predict(x_2_test)
print(multreg_2.score(x_2_test,y_2_test))

[[1690720.93762166 1288357.14888733]]
0.23360062748301424


In [34]:
plt.subplot()
plt.scatter(x_2_test,y_2_test)
plt.plot(x_2_test,y_2_predict)
plt.show()

ValueError: x and y must be the same size