In [None]:

The ATP men’s tennis dataset includes a wide array of tennis statistics, which are described below:

Identifying Data
Player: name of the tennis player
Year: year data was recorded
Service Game Columns (Offensive)
Aces: number of serves by the player where the receiver does not touch the ball
DoubleFaults: number of times the player missed both the first and second serve attempts
FirstServe: % of first-serve attempts made
FirstServePointsWon: % of first-serve attempt points won by the player
SecondServePointsWon: % of second-serve attempt points won by the player
BreakPointsFaced: number of times the receiver had a chance to win (break) the player’s service game
BreakPointsSaved: % of break points faced on which the player stopped the receiver from winning the service game
ServiceGamesPlayed: total number of games where the player served
ServiceGamesWon: total number of games where the player served and won
TotalServicePointsWon: % of points in games where the player served that they won
Return Game Columns (Defensive)
FirstServeReturnPointsWon: % of the opponent’s first-serve points that the player won
SecondServeReturnPointsWon: % of the opponent’s second-serve points that the player won
BreakPointsOpportunities: number of times the player had a chance to win (break) the opponent’s service game
BreakPointsConverted: % of break-point opportunities on which the player won the opponent’s service game
ReturnGamesPlayed: total number of games where the player’s opponent served
ReturnGamesWon: total number of games where the player’s opponent served and the player won
ReturnPointsWon: total number of points where the player’s opponent served and the player won
TotalPointsWon: % of points won by the player
Outcomes
Wins: number of matches won in a year
Losses: number of matches lost in a year
Winnings: total winnings in USD ($) in a year
Ranking: ranking at the end of year

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# load and investigate the data here:
# Read the tennis statistics CSV (path is relative to the working directory)
# into `df`, then print its first rows as a quick look at the columns.
df = pd.read_csv("tennis_statistics.csv")
preview_rows = df.head()
print(preview_rows)


#1. perform exploratory analysis here:
# Scatter a few candidate predictors against an outcome column to eyeball
# how the two are related. Each (x, y) pair below yields one figure whose
# title and axis labels are built from the column names.
explore_pairs = [
    ("BreakPointsOpportunities", "Winnings"),
    ("ServiceGamesWon", "Winnings"),
    ("BreakPointsOpportunities", "Ranking"),
    ("TotalPointsWon", "Ranking"),
]

for x_col, y_col in explore_pairs:
    plt.scatter(df[x_col], df[y_col])
    plt.title("%s vs %s" % (x_col, y_col))
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()
    # Use close() for every figure (the original mixed close() and clf()).
    # Under backends that dispose of a figure once it is shown, clf() would
    # act on a brand-new empty figure and leave it open; close() is a no-op
    # when nothing is open.
    plt.close()

#ends

#2. Perform single feature linear regressions (FirstServeReturnPointsWon)
# Fit Winnings ~ FirstServeReturnPointsWon on an 80% training split, report
# the score on the held-out 20%, and plot predicted vs. actual winnings.

# Double brackets keep both selections as DataFrames (2-D) rather than Series.
features = df[['FirstServeReturnPointsWon']]
outcome = df[['Winnings']]

# Using scikit-learn's train_test_split function to split our data into
# training and test sets. random_state is pinned (42 is an arbitrary seed):
# without it every run draws a different split, so the printed score and
# coefficient would change from run to run.
features_train, features_test, outcome_train, outcome_test = train_test_split(
    features, outcome, train_size=0.8, random_state=42)

# create a linear regression model and train it on the training data
model = LinearRegression()
model.fit(features_train, outcome_train)

# score the model performance on test data (R^2 for LinearRegression)
score_model = model.score(features_test, outcome_test)
print("Model score single feature: %.2f" % score_model)

print(model.coef_)
# make a prediction with test data
prediction = model.predict(features_test)

# plot predictions against actual winnings
plt.scatter(outcome_test, prediction, alpha=0.4)
plt.title("Predicted Winnings vs. Actual Winnings : single feature")
plt.xlabel("Actual Winnings")
plt.ylabel("Predicted Winnings")
plt.show()
# close() instead of clf(): under backends that dispose of a shown figure,
# clf() would create and leave behind a new empty figure.
plt.close()

#ends

#starts
#Perform single feature linear regressions (BreakPointsOpportunities)
# Fit Winnings ~ BreakPointsOpportunities on an 80% training split, report
# the score on the held-out 20%, and plot predicted vs. actual winnings.
features = df[['BreakPointsOpportunities']]
outcome = df[['Winnings']]

# random_state is pinned (42 is an arbitrary seed) so the split, and with it
# the printed score and coefficient, is the same on every run.
features_train, features_test, outcome_train, outcome_test = train_test_split(
    features, outcome, train_size=0.8, random_state=42)

model = LinearRegression()
model.fit(features_train, outcome_train)

# score on the held-out test rows (R^2 for LinearRegression)
score_model = model.score(features_test, outcome_test)
print("Model score single feature: %.2f" % score_model)

print(model.coef_)

prediction = model.predict(features_test)

# plot predictions against actual winnings
plt.scatter(outcome_test, prediction, alpha=0.4)
plt.title("Predicted Winnings vs. Actual Winnings : single feature")
plt.xlabel("Actual Winnings")
plt.ylabel("Predicted Winnings")
plt.show()
# close() instead of clf(): under backends that dispose of a shown figure,
# clf() would create and leave behind a new empty figure.
plt.close()

#ends


#3. perform two feature linear regressions to predict yearly earnings:

#starts
# Fit Winnings ~ BreakPointsOpportunities + FirstServeReturnPointsWon on an
# 80% training split, report the score on the held-out 20%, and plot
# predicted vs. actual winnings.
features = df[['BreakPointsOpportunities',
               'FirstServeReturnPointsWon']]
outcome = df[['Winnings']]

# random_state is pinned (42 is an arbitrary seed) so the split, and with it
# the printed score and coefficients, is the same on every run.
features_train, features_test, outcome_train, outcome_test = train_test_split(
    features, outcome, train_size=0.8, random_state=42)

model = LinearRegression()
model.fit(features_train, outcome_train)

# score on the held-out test rows (R^2 for LinearRegression)
score_model = model.score(features_test, outcome_test)
print("Model score two features: %.2f" % score_model)

# one coefficient per feature, in the column order of `features`
print(model.coef_)

prediction = model.predict(features_test)

# plot predictions against actual winnings
plt.scatter(outcome_test, prediction, alpha=0.4)
plt.title("Predicted Winnings vs. Actual Winnings : two features")
plt.xlabel("Actual Winnings")
plt.ylabel("Predicted Winnings")
# No legend: the plot holds a single scatter series (one point per test row),
# so a legend listing the two feature names would label those points with a
# feature name and mislead the reader.
plt.show()
# close() instead of clf(): under backends that dispose of a shown figure,
# clf() would create and leave behind a new empty figure.
plt.close()

#ends

#starts
# Fit Winnings ~ TotalServicePointsWon + TotalPointsWon on an 80% training
# split, report the score on the held-out 20%, and plot predicted vs. actual
# winnings.
features = df[['TotalServicePointsWon',
               'TotalPointsWon']]
outcome = df[['Winnings']]

# random_state is pinned (42 is an arbitrary seed) so the split, and with it
# the printed score and coefficients, is the same on every run.
features_train, features_test, outcome_train, outcome_test = train_test_split(
    features, outcome, train_size=0.8, random_state=42)

model = LinearRegression()
model.fit(features_train, outcome_train)

# score on the held-out test rows (R^2 for LinearRegression)
print("Model test score two features:")
print(model.score(features_test, outcome_test))

# one coefficient per feature, in the column order of `features`
print(model.coef_)

prediction = model.predict(features_test)

# plot predictions against actual winnings
plt.scatter(outcome_test, prediction, alpha=0.4)
plt.title("Predicted Winnings vs. Actual Winnings : two features")
plt.xlabel("Actual Winnings")
plt.ylabel("Predicted Winnings")
# No legend: the plot holds a single scatter series (one point per test row),
# so a legend listing the two feature names would label those points with a
# feature name and mislead the reader.
plt.show()
# close() instead of clf(): under backends that dispose of a shown figure,
# clf() would create and leave behind a new empty figure.
plt.close()


#ends

# perform multiple feature linear regressions
# Fit Winnings on the service-game and return-game statistic columns listed
# below, using an 80% training split; report the score on the held-out 20%
# and plot predicted vs. actual winnings.
features = df[['FirstServe', 'FirstServePointsWon', 'FirstServeReturnPointsWon',
               'SecondServePointsWon', 'SecondServeReturnPointsWon', 'Aces',
               'BreakPointsConverted', 'BreakPointsFaced', 'BreakPointsOpportunities',
               'BreakPointsSaved', 'DoubleFaults', 'ReturnGamesPlayed', 'ReturnGamesWon',
               'ReturnPointsWon', 'ServiceGamesPlayed', 'ServiceGamesWon', 'TotalPointsWon',
               'TotalServicePointsWon']]
outcome = df[['Winnings']]

# random_state is pinned (42 is an arbitrary seed) so the split, and with it
# the printed score and coefficients, is the same on every run.
features_train, features_test, outcome_train, outcome_test = train_test_split(
    features, outcome, train_size=0.8, random_state=42)

model = LinearRegression()
model.fit(features_train, outcome_train)

# score on the held-out test rows (R^2 for LinearRegression)
print("Model test score multiple features:")
print(model.score(features_test, outcome_test))

# one coefficient per feature, in the column order of `features`
print(model.coef_)

prediction = model.predict(features_test)

# plot predictions against actual winnings
plt.scatter(outcome_test, prediction, alpha=0.4)
plt.title("Predicted Winnings vs. Actual Winnings : multiple features")
plt.xlabel("Actual Winnings")
plt.ylabel("Predicted Winnings")
plt.show()
# close() instead of clf(): under backends that dispose of a shown figure,
# clf() would create and leave behind a new empty figure.
plt.close()

#ends


#starts
#remove FirstServeReturnPointsWon
# Same multiple-feature regression on Winnings, but with the
# FirstServeReturnPointsWon column left out of the feature list.
features = df[['FirstServe', 'FirstServePointsWon',
               'SecondServePointsWon', 'SecondServeReturnPointsWon', 'Aces',
               'BreakPointsConverted', 'BreakPointsFaced', 'BreakPointsOpportunities',
               'BreakPointsSaved', 'DoubleFaults', 'ReturnGamesPlayed', 'ReturnGamesWon',
               'ReturnPointsWon', 'ServiceGamesPlayed', 'ServiceGamesWon', 'TotalPointsWon',
               'TotalServicePointsWon']]
outcome = df[['Winnings']]

# random_state is pinned (42 is an arbitrary seed) so the split, and with it
# the printed score and coefficients, is the same on every run.
features_train, features_test, outcome_train, outcome_test = train_test_split(
    features, outcome, train_size=0.8, random_state=42)

model = LinearRegression()
model.fit(features_train, outcome_train)

# score on the held-out test rows (R^2 for LinearRegression)
print("Model  test score multiple features:")
print(model.score(features_test, outcome_test))

# one coefficient per feature, in the column order of `features`
print(model.coef_)

prediction = model.predict(features_test)

# plot predictions against actual winnings
plt.scatter(outcome_test, prediction, alpha=0.4)
plt.title("Predicted Winnings vs. Actual Winnings : multiple features ")
plt.xlabel("Actual Winnings")
plt.ylabel("Predicted Winnings")
plt.show()
# close() instead of clf(): under backends that dispose of a shown figure,
# clf() would create and leave behind a new empty figure.
plt.close()

#ends


