League of Legends Match Analysis

**Name(s)**: Ashley Chen

**Website Link**: https://ashchen738.github.io/league-of-legends-analysis/

In [3]:
import pandas as pd
import numpy as np

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from lec_utils import * # Feel free to uncomment and use this. It'll make your plotly graphs look like ours in lecture!

In [18]:
!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


## Step 1: Introduction

What impact does gold difference have on other factors of the game, especially win chances?

## Step 2: Data Cleaning and Exploratory Data Analysis

### Data Cleaning

In [None]:
df = pd.read_csv('lol2022.csv')

# See how many rows
print(f"# rows: {df.shape[0]}")

# Looking only at matches on a team basis
team_rows = df[df['position'] == 'team']
df_cleaned = team_rows.copy()[['gameid', 'result', 'gamelength', 'golddiffat10', 'golddiffat15', 'golddiffat20', 'golddiffat25', 'killsat25', 'deathsat25', 'assistsat25', 'firsttower', 'firstmidtower', 'firstblood', 'firstbloodkill', 'firstbloodassist', 'firstbloodvictim', 'firstherald', 'firstdragon', 'firstbaron']]

# Transform boolean columns
bool_columns = ['firsttower', 'firstmidtower', 'firstblood', 'firstbloodkill', 'firstbloodassist', 'firstbloodvictim', 'firstherald', 'firstdragon', 'firstbaron', 'result']
for col in bool_columns:
    df_cleaned[col] = df_cleaned[col].astype(bool)

# Drop rows with missing gold difference data
df_cleaned.dropna(subset=['golddiffat10'], inplace=True)

# Create boolean gold advantage columns (game lasted that long, i.e. data exists and gold difference is positive at that time)
df_cleaned[['goldadvat10', 'goldadvat15', 'goldadvat20', 'goldadvat25']] = df_cleaned[['golddiffat10', 'golddiffat15', 'golddiffat20', 'golddiffat25']] > 0 & df_cleaned[['golddiffat10', 'golddiffat15', 'golddiffat20', 'golddiffat25']].notna()

print(print(df_cleaned.head().to_markdown(index=False)))
df_cleaned


Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.



# rows: 150180
| gameid                | result   |   gamelength |   golddiffat10 |   golddiffat15 |   golddiffat20 |   golddiffat25 |   killsat25 |   deathsat25 |   assistsat25 | firsttower   | firstmidtower   | firstblood   | firstbloodkill   | firstbloodassist   | firstbloodvictim   | firstherald   | firstdragon   | firstbaron   | goldadvat10   | goldadvat15   | goldadvat20   | goldadvat25   |
|:----------------------|:---------|-------------:|---------------:|---------------:|---------------:|---------------:|------------:|-------------:|--------------:|:-------------|:----------------|:-------------|:-----------------|:-------------------|:-------------------|:--------------|:--------------|:-------------|:--------------|:--------------|:--------------|:--------------|
| ESPORTSTMNT01_2690210 | False    |         1713 |           1523 |            107 |           -944 |             88 |           6 |            7 |            12 | True         | True            | True         | Tr

Unnamed: 0,gameid,result,gamelength,golddiffat10,...,goldadvat10,goldadvat15,goldadvat20,goldadvat25
10,ESPORTSTMNT01_2690210,False,1713,1523.0,...,True,True,False,True
11,ESPORTSTMNT01_2690210,True,1713,-1523.0,...,False,False,True,False
22,ESPORTSTMNT01_2690219,False,2114,-1619.0,...,False,False,False,False
...,...,...,...,...,...,...,...,...,...
149903,ESPORTSTMNT01_3269631,False,2076,-1815.0,...,False,False,False,False
149914,ESPORTSTMNT01_3268705,True,1680,1510.0,...,True,True,True,True
149915,ESPORTSTMNT01_3268705,False,1680,-1510.0,...,False,False,False,False


### Univariate Analysis

In [31]:
# Graph histogram of gold difference distributions at different durations of the game
labels = {
    'count': 'Count',
    'golddiffat10': 'Gold Difference',
    'golddiffat15': 'Gold Difference',
    'golddiffat20': 'Gold Difference',
    'golddiffat25': 'Gold Difference'
}

fig10 = px.histogram(df_cleaned, x='golddiffat10', nbins=20, title='Distribution of Team Gold Differences at the 10 minute mark', labels=labels)
fig10.show()

fig15 = px.histogram(df_cleaned, x='golddiffat15', nbins=20, title='Distribution of Team Gold Differences at the 15 minute mark', labels=labels)
fig15.show()

fig20 = px.histogram(df_cleaned, x='golddiffat20', nbins=20, title='Distribution of Team Gold Differences at the 20 minute mark', labels=labels)
fig20.show()

fig25 = px.histogram(df_cleaned, x='golddiffat25', nbins=20, title='Distribution of Team Gold Differences at the 25 minute mark', labels=labels)
fig25.write_html('assets/fig25.html', include_plotlyjs='cdn')
fig25.show()

In [34]:

df_cleaned['Result'] = df_cleaned['result'].replace({True: 'Win', False: 'Loss'})

# Pie charts showing the percentage of teams at a gold disadvantage that ended up coming back and winning/losing the game
pie10 = px.pie(df_cleaned[df_cleaned['goldadvat10'] == False], names='Result', title="% of Teams that Comeback after a Gold Deficit at the 10 Minute Mark")
pie10.show()

pie15 = px.pie(df_cleaned[df_cleaned['goldadvat15'] == False], names='Result', title="% of Teams that Comeback after a Gold Deficit at the 15 Minute Mark")
pie15.show()

pie20 = px.pie(df_cleaned[df_cleaned['goldadvat20'] == False], names='Result', title="% of Teams that Comeback after a Gold Deficit at the 20 Minute Mark")
pie20.show()

pie25 = px.pie(df_cleaned[df_cleaned['goldadvat25'] == False], names='Result', title="% of Teams that Comeback after a Gold Deficit at the 25 Minute Mark")
pie25.write_html('assets/pie25.html', include_plotlyjs='cdn')
pie25.show()

### Interesting Aggregates

In [35]:
# Aggregate average gold differences based on if it was a Win/Loss
df_win = df_cleaned.groupby('result')[['golddiffat10', 'golddiffat15', 'golddiffat20', 'golddiffat25']].mean()
print(print(df_win.head().to_markdown(index=False)))
df_win

|   golddiffat10 |   golddiffat15 |   golddiffat20 |   golddiffat25 |
|---------------:|---------------:|---------------:|---------------:|
|       -691.987 |       -1726.75 |       -3006.34 |        -4768.9 |
|        691.987 |        1726.75 |        3006.34 |         4768.9 |
None


Unnamed: 0_level_0,golddiffat10,golddiffat15,golddiffat20,golddiffat25
result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,-691.99,-1726.75,-3006.34,-4768.9
True,691.99,1726.75,3006.34,4768.9


## Step 3: Framing a Prediction Problem

Prediction Problem: Are we able to predict if a team will be able to come back from a gold deficit based on other game statistics?

In [8]:
# Filter for all teams at a gold disadvantage by the 25 minute mark (game has lasted that long, and at a gold disadvantage)
df_cleaned_25disadv = df_cleaned[(df_cleaned['gamelength'] >= 25 * 60) & df_cleaned['goldadvat25'] == False]
df_cleaned_25disadv['comeback'] = df_cleaned_25disadv['result']

## Step 4: Baseline Model

In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier


# test-train split
X_train, X_test, y_train, y_test = train_test_split(df_cleaned_25disadv, df_cleaned_25disadv['comeback'], random_state=26)

# Simple decision tree classifier here to predict if the disadvantaged team can comeback
pipeline = Pipeline([
    ('classifier', DecisionTreeClassifier())
])

pipeline.fit(X_train[['golddiffat25', 'killsat25', 'deathsat25', 'assistsat25']], y_train)

y_pred = pipeline.predict(X_test[['golddiffat25', 'killsat25', 'deathsat25', 'assistsat25']])

# Print accuracies
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.7346521145975443
              precision    recall  f1-score   support

       False       0.81      0.86      0.83      2236
        True       0.42      0.33      0.37       696

    accuracy                           0.73      2932
   macro avg       0.61      0.60      0.60      2932
weighted avg       0.71      0.73      0.72      2932



## Step 5: Final Model

In [38]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier())
])

param_grid = {
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)

grid_search.fit(X_train[['golddiffat25', 'killsat25', 'deathsat25', 'assistsat25', 'firstdragon', 'firstherald', 'firstbaron', 'firsttower', 'firstmidtower']], y_train)

y_pred = grid_search.predict(X_test[['golddiffat25', 'killsat25', 'deathsat25', 'assistsat25', 'firstdragon', 'firstherald', 'firstbaron', 'firsttower', 'firstmidtower']])

print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Best hyperparameters: {'classifier__max_depth': 5, 'classifier__min_samples_split': 2}
Accuracy: 0.8816507503410641
              precision    recall  f1-score   support

       False       0.90      0.95      0.92      2236
        True       0.80      0.67      0.73       696

    accuracy                           0.88      2932
   macro avg       0.85      0.81      0.83      2932
weighted avg       0.88      0.88      0.88      2932

