In [37]:
import pandas as pd

# Tennis Modeling Assigment
#### Question
Given the current state of a match, what is the probability that player 1 will win?
#### Dataset Description
This dataset is composed of various WTA (Womens Tennis Association) matches. It gives point-by-point information about individual games. Player 1 and Player 2 are assigned arbitrarily and it is not encouraged to try to uncover these players identities nor is it worth the time
#### Task
Using a model or models of your choosing, predict the probabilities that player 1 will win the match and evaluate your model
### If you are unfamiliar with how tennis is scored, here are a few of the basics
#### Scoring a game
Tennis has a different point system than most sports
- 0 points= Love
- 1 point = 15
- 2 points= 30
- 3 points= 40
- 40-40 = Deuce

In order to win the game, a player must win at least four points. So if you are up 40-30, 40-15 or 40-love, and win one more point, you win the game. A score of 40-40 is called deuce. When the score reaches deuce, one player will need to win at least two points in a row to win the game
#### Winning a set
To win a set, a player needs to win 6 games, by two (no tie break games will appear in this dataset)
#### Winning a match
In WTA, the first player to win 2 sets wins the match

### *Please include all code used throughout the model creation in this notebook, not just the completed model*

In [38]:
df = pd.read_csv('tennis_data.csv')
df

Unnamed: 0,game_id,set_num,serving_player,player_1_points,player_2_points,player_1_games,player_2_games,player_1_sets,player_2_sets,match_winning_player
0,1,1,1,0,0,0,0,0,0,2
1,1,1,1,15,0,0,0,0,0,2
2,1,1,1,15,15,0,0,0,0,2
3,1,1,1,30,15,0,0,0,0,2
4,1,1,1,40,15,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...
18500,136,2,1,15,0,5,1,1,0,1
18501,136,2,1,30,0,5,1,1,0,1
18502,136,2,1,30,15,5,1,1,0,1
18503,136,2,1,40,15,5,1,1,0,1


# Exploratory data analysis

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18505 entries, 0 to 18504
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   game_id               18505 non-null  int64 
 1   set_num               18505 non-null  int64 
 2   serving_player        18505 non-null  int64 
 3   player_1_points       18505 non-null  object
 4   player_2_points       18505 non-null  object
 5   player_1_games        18505 non-null  int64 
 6   player_2_games        18505 non-null  int64 
 7   player_1_sets         18505 non-null  int64 
 8   player_2_sets         18505 non-null  int64 
 9   match_winning_player  18505 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 1.4+ MB


In [40]:
df.head(5)

Unnamed: 0,game_id,set_num,serving_player,player_1_points,player_2_points,player_1_games,player_2_games,player_1_sets,player_2_sets,match_winning_player
0,1,1,1,0,0,0,0,0,0,2
1,1,1,1,15,0,0,0,0,0,2
2,1,1,1,15,15,0,0,0,0,2
3,1,1,1,30,15,0,0,0,0,2
4,1,1,1,40,15,0,0,0,0,2


In [41]:
df.describe()

Unnamed: 0,game_id,set_num,serving_player,player_1_games,player_2_games,player_1_sets,player_2_sets,match_winning_player
count,18505.0,18505.0,18505.0,18505.0,18505.0,18505.0,18505.0,18505.0
mean,68.7505,1.707052,1.500621,2.159741,1.979249,0.380978,0.326074,1.519751
std,38.538161,0.695584,0.500013,1.664717,1.649196,0.48564,0.468787,0.499623
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,35.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
50%,69.0,2.0,2.0,2.0,2.0,0.0,0.0,2.0
75%,102.0,2.0,2.0,3.0,3.0,1.0,1.0,2.0
max,136.0,3.0,2.0,6.0,6.0,1.0,1.0,2.0


# Early things to note
-The dataset is structure where game_id is actually a match. The game_id increments += 1 once a match is completed

-Very important to note that player 1, player 2 are different each match. When compiling a model, this must be taken into account

-It may be in our best interest to somehow concatenate this data set where n_rows = # of completed matches, will continue to assess

-We may want to ordinally encode these scores, after all they are ordinal values despite the scoring system

-Looking like this will be a binary classification problem where match_winning_player = y[i], therefore we will also have to switch these values



# First model test
-I'm gonna try an 'at large' model here and see h ow we do, only encoding values and performing no concatenation

In [42]:
# set a new dataframe each time we work with the original
df_1 = df
df_1.head(5)

Unnamed: 0,game_id,set_num,serving_player,player_1_points,player_2_points,player_1_games,player_2_games,player_1_sets,player_2_sets,match_winning_player
0,1,1,1,0,0,0,0,0,0,2
1,1,1,1,15,0,0,0,0,0,2
2,1,1,1,15,15,0,0,0,0,2
3,1,1,1,30,15,0,0,0,0,2
4,1,1,1,40,15,0,0,0,0,2


In [45]:
# lets change some of the data so that it fits some of Python libraries SLR and decision tree models
# we don't need to one hot as this data follows ordinal structure
# it will likely be best practice to handle game_id differently but let's see how the model performes
# game_id is fine as it's already ordinally encoded, we do
# let's ordinally encode the player_1_points and player_2_points
# let's also change all ADs to 60 as a dummy value for the encoder
df_1['player_1_points'] = df_1['player_1_points'].replace(['AD'], 60)
df_1['player_2_points'] = df_1['player_2_points'].replace(['AD'], 60)

# now let's ordinally encode
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
encoder.fit(df_1[['player_1_points', 'player_2_points']])
df_1[['player_1_points','player_2_points']] = encoder.transform(df_1[['player_1_points','player_2_points']])
# check that we're all good
df_1.head(10)


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

In [47]:
# now let's shift the target variable down 1 to enter it into a binary class
df_1['match_winning_player'] = df_1['match_winning_player'] - 1

df_1.head(10)

Unnamed: 0,game_id,set_num,serving_player,player_1_points,player_2_points,player_1_games,player_2_games,player_1_sets,player_2_sets,match_winning_player
0,1,1,1,0,0,0,0,0,0,0
1,1,1,1,15,0,0,0,0,0,0
2,1,1,1,15,15,0,0,0,0,0
3,1,1,1,30,15,0,0,0,0,0
4,1,1,1,40,15,0,0,0,0,0
5,1,1,1,40,30,0,0,0,0,0
6,1,1,1,40,40,0,0,0,0,0
7,1,1,1,40,60,0,0,0,0,0
8,1,1,2,0,0,0,1,0,0,0
9,1,1,2,15,0,0,1,0,0,0


In [52]:
# since we are working strictly with ordinal data, let's skip scaling it for now
# let's define a feature matrix and a response variable
X = df_1.drop('match_winning_player', axis=1)
X

Unnamed: 0,game_id,set_num,serving_player,player_1_points,player_2_points,player_1_games,player_2_games,player_1_sets,player_2_sets
0,1,1,1,0,0,0,0,0,0
1,1,1,1,15,0,0,0,0,0
2,1,1,1,15,15,0,0,0,0
3,1,1,1,30,15,0,0,0,0
4,1,1,1,40,15,0,0,0,0
...,...,...,...,...,...,...,...,...,...
18500,136,2,1,15,0,5,1,1,0
18501,136,2,1,30,0,5,1,1,0
18502,136,2,1,30,15,5,1,1,0
18503,136,2,1,40,15,5,1,1,0


In [53]:
# response variable
y = df_1['match_winning_player']
y

0        0
1        0
2        0
3        0
4        0
        ..
18500   -1
18501   -1
18502   -1
18503   -1
18504   -1
Name: match_winning_player, Length: 18505, dtype: int64

In [54]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size= .25)


In [55]:
# let's try a model out
from sklearn.linear_model import LogisticRegression
model_1 = LogisticRegression()
model_1.fit(X_train, y_train)
y_pred = model_1.predict(X_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [97]:
# check model performance
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

# lets run metrics
confusion_matrix(y_train, y_pred)
accuracy_score(y_train, y_pred)

# model_1 review
# we're getting an f1 of 76% lets use that as our baseline

0.7617812364894077

In [51]:
# maybe it's in our best interest to group by match_id and then run it that way
df = pd.read_csv('tennis_data.csv')
df_2 = df
df_2

Unnamed: 0,game_id,set_num,serving_player,player_1_points,player_2_points,player_1_games,player_2_games,player_1_sets,player_2_sets,match_winning_player
0,1,1,1,0,0,0,0,0,0,2
1,1,1,1,15,0,0,0,0,0,2
2,1,1,1,15,15,0,0,0,0,2
3,1,1,1,30,15,0,0,0,0,2
4,1,1,1,40,15,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...
18500,136,2,1,15,0,5,1,1,0,1
18501,136,2,1,30,0,5,1,1,0,1
18502,136,2,1,30,15,5,1,1,0,1
18503,136,2,1,40,15,5,1,1,0,1


In [62]:
# i'm gonna try a different approach here, let's see if i can group by game_id
# features will be game_id, sum(player_1_points), sum(player_2_points), sum(player_1_games), sum(player_2_games),
## sum(player_1_sets), sum(player_2_sets), and avg(match_winning_player) (aka just the winner)

# let's get rid of the features we don't want [set_num, serving_player
df_2 = df_2.drop(['set_num','serving_player'],axis=1)
df_2.head(5)
df_2.columns

Index(['game_id', 'player_1_points', 'player_2_points', 'player_1_games',
       'player_2_games', 'player_1_sets', 'player_2_sets',
       'match_winning_player'],
      dtype='object')

In [65]:
df_2.head(5)
df_2['player_1_points'] = df_2['player_1_points'].replace(['AD'], 60)
df_2['player_2_points'] = df_2['player_2_points'].replace(['AD'], 60)
df_2.head(5)

Unnamed: 0,game_id,player_1_points,player_2_points,player_1_games,player_2_games,player_1_sets,player_2_sets,match_winning_player
0,1,0,0,0,0,0,0,2
1,1,15,0,0,0,0,0,2
2,1,15,15,0,0,0,0,2
3,1,30,15,0,0,0,0,2
4,1,40,15,0,0,0,0,2


In [83]:
# now let's play around with groupby
df_2["player_1_points"] = pd.to_numeric(df_2["player_1_points"])
df_2["player_2_points"] = pd.to_numeric(df_2["player_2_points"])
df_2.info()


final_df_2 = df_2.groupby(['game_id'], as_index=False).agg({
    'player_1_points':'sum', 'player_2_points':'sum','player_1_games':'sum',
     'player_2_games':'sum', 'player_1_sets':'sum', 'player_2_sets':'sum',
       'match_winning_player':'mean'
})

final_df_2.head(10)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18505 entries, 0 to 18504
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   game_id               18505 non-null  int64
 1   player_1_points       18505 non-null  int64
 2   player_2_points       18505 non-null  int64
 3   player_1_games        18505 non-null  int64
 4   player_2_games        18505 non-null  int64
 5   player_1_sets         18505 non-null  int64
 6   player_2_sets         18505 non-null  int64
 7   match_winning_player  18505 non-null  int64
dtypes: int64(8)
memory usage: 1.1 MB


Unnamed: 0,game_id,player_1_points,player_2_points,player_1_games,player_2_games,player_1_sets,player_2_sets,match_winning_player
0,1,3260,3200,237,469,0,78,2.0
1,2,2475,2550,264,89,41,0,1.0
2,3,3005,3545,290,298,106,58,2.0
3,4,1460,460,174,4,28,0,1.0
4,5,2695,3055,291,350,0,65,2.0
5,6,2170,2200,272,328,0,51,2.0
6,7,2115,1330,216,41,45,0,1.0
7,8,3700,3250,383,204,113,56,1.0
8,9,2030,2455,297,321,0,56,2.0
9,10,2760,2705,195,359,0,76,2.0


In [104]:
# we now have some semblance of a concatenated dataframe, let's see how a naive model would perform. 
# let's exclude game_id as it has no handle
# X will include all data, minus matching player
X = final_df_2.drop('match_winning_player',axis=1)
y = final_df_2['match_winning_player'] - 1

In [102]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22, test_size=.25)


In [105]:
# SLR model
model_2 = LogisticRegression()
model_2.fit(X_train, y_train)

In [111]:
# metrics
y_pred = model_2.predict(X_train)
f1_score(y_train,y_pred)


0.9523809523809524

In [132]:
# check that out!! we're yielding a pretty good f1 (95%) from this concatenated model
# let's try a decision tree and see if we can get a little bit better accuracy
# we're also going to want to remove df[n - 1] and then make a prediction from the existing data
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=3, random_state=14)
clf.fit(X_train, y_train)

In [133]:
y_pred = clf.predict(X_train)
f1_score(y_train, y_pred)

0.9902912621359222

In [None]:
# nice, f1 close to 1. this could indicate some overfitting, although the test set should perform pretty well
# FINAL STEPS: use concatenated set with final point removed, train on train, f1 on test, train on full,
## and finally predict using full trained model