In [1]:
import pandas as pd

### Reading match data into Pandas dataframe 

In [2]:
# Load the match records; the first CSV column becomes the row index.
matches = pd.read_csv(filepath_or_buffer="matches.csv", index_col=0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,notes,sh,sot,dist,pk,pkatt,season,team,xg,xga
2,2021-08-16,19:15,Süper Lig,Matchweek 1,Mon,Away,W,5,1,Yeni Mal'spor,...,,14.0,8.0,,1,1,2022,Trabzonspor,,
4,2021-08-23,19:15,Süper Lig,Matchweek 2,Mon,Home,W,2,1,Sivasspor,...,,14.0,6.0,,1,1,2022,Trabzonspor,,
6,2021-08-29,21:45,Süper Lig,Matchweek 3,Sun,Away,W,1,0,Giresunspor,...,,13.0,5.0,,0,0,2022,Trabzonspor,,
7,2021-09-12,20:00,Süper Lig,Matchweek 4,Sun,Home,D,2,2,Galatasaray,...,,19.0,6.0,,0,0,2022,Trabzonspor,,
8,2021-09-18,16:00,Süper Lig,Matchweek 5,Sat,Away,W,1,0,Kasımpaşa,...,,12.0,9.0,,0,0,2022,Trabzonspor,,


In [4]:
matches.shape

(1600, 26)

### Investigating missing data

In [5]:
# Expected row count if each of the 2 seasons had 20 squads playing 38 matches;
# compare with the 1600 rows reported by matches.shape above.

2 * 20 * 38

1520

In [6]:
matches["team"].value_counts()

Trabzonspor            78
Kasimpasa              78
Yeni Malatyaspor       78
Goztepe                78
Rizespor               78
Gaziantep FK           78
Kayserispor            78
Fenerbahce             78
Hatayspor              78
Galatasaray            78
Sivasspor              78
Fatih Karagumruk       78
Antalyaspor            78
Besiktas               78
Alanyaspor             78
Istanbul Basaksehir    78
Konyaspor              78
Erzurum BB             40
Ankaragucu             40
Genclerbirligi         40
Denizlispor            40
Adana Demirspor        38
Giresunspor            38
Altay                  38
Name: team, dtype: int64

In [7]:
matches["round"].value_counts()

Matchweek 1     40
Matchweek 30    40
Matchweek 23    40
Matchweek 24    40
Matchweek 25    40
Matchweek 26    40
Matchweek 27    40
Matchweek 28    40
Matchweek 29    40
Matchweek 31    40
Matchweek 21    40
Matchweek 32    40
Matchweek 33    40
Matchweek 34    40
Matchweek 35    40
Matchweek 36    40
Matchweek 37    40
Matchweek 38    40
Matchweek 2     40
Matchweek 22    40
Matchweek 20    40
Matchweek 10    40
Matchweek 3     40
Matchweek 4     40
Matchweek 5     40
Matchweek 6     40
Matchweek 7     40
Matchweek 8     40
Matchweek 19    40
Matchweek 9     40
Matchweek 11    40
Matchweek 12    40
Matchweek 13    40
Matchweek 14    40
Matchweek 15    40
Matchweek 16    40
Matchweek 17    40
Matchweek 18    40
Matchweek 39    20
Matchweek 40    20
Matchweek 41    20
Matchweek 42    20
Name: round, dtype: int64

### Cleaning Data for machine learning

In [8]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
pk                int64
pkatt             int64
season            int64
team             object
xg              float64
xga             float64
dtype: object

Converting the date column from object (string) to datetime; this just makes it easier for me to compute predictors based on the date column.

In [9]:
# Parse the string dates into datetime values so the .dt accessor can be used later.
parsed_dates = pd.to_datetime(matches["date"])
matches["date"] = parsed_dates

In [10]:
# Drop the columns that are not used as predictors. Re-binding with
# errors="ignore" keeps this cell idempotent: `del matches[...]` raises
# KeyError if the cell is run a second time in the same kernel.
matches = matches.drop(columns=["comp", "notes"], errors="ignore")

### Creating predictors for machine learning

The first predictor that I created is called venue code.
It converts the home/away venue column into a numeric column.

In [11]:
# Encode the venue as an integer category code (in this data: Away -> 0, Home -> 1).
venue_as_category = matches["venue"].astype("category")
matches["venue_code"] = venue_as_category.cat.codes

The next thing I did is similar to the venue column: I created a unique code for each opponent squad.


In [12]:
# Encode each distinct opponent name as an integer category code.
opponent_as_category = matches["opponent"].astype("category")
matches["opp_code"] = opponent_as_category.cat.codes

Here I wanted to remove the minutes and keep just the hour, so I used a string replacement with a regular expression.


In [13]:
# Keep only the hour: strip everything from the first ":" onwards, then cast to int.
kickoff_hour = matches["time"].str.replace(":.+", "", regex=True)
matches["hour"] = kickoff_hour.astype("int")

The last thing I created before building the initial model is the day code column. It gives me a number for each day of the week: Monday is zero, Tuesday is one, Wednesday is two, and so on.


In [14]:
# Day of the week as an integer (Monday = 0 ... Sunday = 6).
matches["day_code"] = matches["date"].dt.weekday

We can see that the result can be L, W or D so I converted this to two numbers. If the result is a loss or a draw I code it as a zero, if the result is a win I code it as a one, because what I want to predict is if the team won or not.


In [15]:
# Binary target: 1 when the team won, 0 for a draw or a loss.
is_win = matches["result"].eq("W")
matches["target"] = is_win.astype("int")

In [16]:
matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,poss,...,pkatt,season,team,xg,xga,venue_code,opp_code,hour,day_code,target
2,2021-08-16,19:15,Matchweek 1,Mon,Away,W,5,1,Yeni Mal'spor,60.0,...,1,2022,Trabzonspor,,,0,23,19,0,1
4,2021-08-23,19:15,Matchweek 2,Mon,Home,W,2,1,Sivasspor,53.0,...,1,2022,Trabzonspor,,,1,21,19,0,1
6,2021-08-29,21:45,Matchweek 3,Sun,Away,W,1,0,Giresunspor,54.0,...,0,2022,Trabzonspor,,,0,14,21,6,1
7,2021-09-12,20:00,Matchweek 4,Sun,Home,D,2,2,Galatasaray,62.0,...,0,2022,Trabzonspor,,,1,11,20,6,0
8,2021-09-18,16:00,Matchweek 5,Sat,Away,W,1,0,Kasımpaşa,55.0,...,0,2022,Trabzonspor,,,0,17,16,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,2021-04-28,16:00,Matchweek 38,Wed,Away,L,3,6,Kayserispor,51.0,...,0,2021,Denizlispor,,,0,18,16,2,0
36,2021-05-02,16:00,Matchweek 39,Sun,Home,L,0,1,Rizespor,55.0,...,0,2021,Denizlispor,,,1,20,16,6,0
37,2021-05-08,20:30,Matchweek 40,Sat,Away,L,0,1,Hatayspor,49.0,...,1,2021,Denizlispor,,,0,16,20,5,0
38,2021-05-11,20:30,Matchweek 41,Tue,Home,L,1,4,Galatasaray,43.0,...,0,2021,Denizlispor,,,1,11,20,1,0


### Creating initial machine learning model

A random forest is an ensemble of decision trees, where each tree is trained on a slightly different random sample of the data and features.
The larger n_estimators (the number of trees) is, the longer the algorithm takes to run, but the more accurate it tends to be, with diminishing returns. min_samples_split is the minimum number of samples a node must contain before it can be split; the higher it is, the less likely we are to overfit, but the lower the accuracy on the training data will be.
Because a random forest involves randomness, I set random_state to fix the seed, so if I run the random forest multiple times I'll get the same results as long as the data is the same.


In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
# 50 trees; a node needs at least 10 samples before it may be split.
# random_state is fixed so repeated runs on the same data give the same model.
rf = RandomForestClassifier(
    random_state=1,
    n_estimators=50,
    min_samples_split=10,
)

In [19]:
# Training set: every match played before 1 January 2022.
train_mask = matches["date"] < '2022-01-01'
train = matches[train_mask]

In [20]:
# Test set: every match from 1 January 2022 onwards. `>=` (not `>`) so that a
# match played exactly on the cutoff date is not dropped from both splits,
# since the training set uses a strict `<` on the same date.
test = matches[matches["date"] >= '2022-01-01']

In [21]:
# Feature columns passed to the model.
predictors = [
    "venue_code",
    "opp_code",
    "hour",
    "day_code",
]

In [22]:
# Fit the forest on the training rows: predictor columns as features, `target` as label.
rf.fit(X=train[predictors], y=train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [23]:
# Predict the target for every row of the test set.
test_features = test[predictors]
preds = rf.predict(test_features)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
# accuracy_score returns the fraction of correct predictions (higher is better),
# so the value is stored as `accuracy`. The original name `error` is kept as an
# alias so that any later cell referring to it keeps working.
accuracy = accuracy_score(test["target"], preds)
error = accuracy
accuracy

0.6078947368421053

In [26]:
# Put the true labels and the predictions side by side for inspection.
combined = pd.DataFrame({"actual": test["target"], "predicted": preds})

In [27]:
# Confusion table: rows are the actual outcomes, columns are the predicted ones.
pd.crosstab(combined["actual"], combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,187,48
1,101,44


In [28]:
from sklearn.metrics import precision_score

# Precision: of the matches predicted as wins, the share that really were wins.
precision_score(y_true=test["target"], y_pred=preds)

0.4782608695652174

### Improving precision with rolling averages

### Retraining machine learning model

### Combining home and away predictions