League of Legends Model Building

**Name(s)**: Andrew Tan

**Website Link**: (your website link)

## Code

In [9]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [5]:
league = pd.read_csv(os.path.join('data', '2022_LoL_dataset.csv'), low_memory=False)
league

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
0,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,121.0,391.0,345.0,14.0,0.0,1.0,0.0,0.0,1.0,0.0
1,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,100.0,541.0,-275.0,-11.0,2.0,3.0,2.0,0.0,5.0,1.0
2,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,119.0,-475.0,153.0,1.0,0.0,3.0,0.0,3.0,3.0,2.0
3,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,149.0,-793.0,-1343.0,-34.0,2.0,1.0,2.0,3.0,3.0,0.0
4,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,21.0,443.0,-497.0,7.0,1.0,2.0,2.0,0.0,6.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149227,9687-9687_game_5,partial,https://lpl.qq.com/es/stats.shtml?bmid=9687,DC,2022,,0,2022-12-27 12:43:43,5,12.23,...,,,,,,,,,,
149228,9687-9687_game_5,partial,https://lpl.qq.com/es/stats.shtml?bmid=9687,DC,2022,,0,2022-12-27 12:43:43,5,12.23,...,,,,,,,,,,
149229,9687-9687_game_5,partial,https://lpl.qq.com/es/stats.shtml?bmid=9687,DC,2022,,0,2022-12-27 12:43:43,5,12.23,...,,,,,,,,,,
149230,9687-9687_game_5,partial,https://lpl.qq.com/es/stats.shtml?bmid=9687,DC,2022,,0,2022-12-27 12:43:43,5,12.23,...,,,,,,,,,,


### Framing the Problem

The League of Legends dataset loaded above give insight on games from the 2022 season. It specifies information and statistics for each player for each specific game, including bans, kills, deaths, damage taken per minute, wards placed, etc. As a beginner to the game, I wonder if there are aspects of the game that could be predicted, such as:
1. Which team will win or lose a game?
2. After the first 10 minutes have played out, which team will win or lose a game?
2. How long will this game take?
3. How much damage to champions will this team have?
4. How much damage will this team take?

The specific question that I will be focusing on predicting will be "which team will win or lose a game?"

### Cleaning

In [19]:
'''
Data Cleaning:
I manually filled in NaN values for certain columns to either 'unknown' or 'n/a', depending on if the 
information was missing intentionally or not. I dropped columns that were meant for individual statistics, since I am focusing on
team statistics. For the rest of the numerical values that weren't filled in, 
I calculated the median and filled in the NaN values accordingly. 
'''
league_cleaned = league.copy()
league_cleaned = league_cleaned[league_cleaned['position'] == 'team']
league_cleaned['result'] = league_cleaned['result'].astype(bool)
league_cleaned['playoffs'] = league_cleaned['playoffs'].astype(bool)
league_cleaned = league_cleaned.fillna({'url': 'unknown', 
                                        'split': 'unknown',
                                       })
columns = league_cleaned.columns
median_imputation = list(range(30,37)) + list(range(67, len(columns)))

for i in median_imputation:
    col = columns[i]
    league_cleaned[col] = league_cleaned[col].fillna(league_cleaned[col].median())
    

    
dropped = [columns[i] for i in range(40, 58)] + [columns[i] for i in range(60, 67)]
league_cleaned = league_cleaned.drop(columns=dropped)
league_cleaned = league_cleaned.drop(columns=['playername', 'playerid', 'champion'])

league_cleaned

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
10,ESPORTSTMNT01_2690210,complete,unknown,LCK CL,2022,Spring,False,2022-01-10 07:44:08,1,12.01,...,510.0,107.0,-1617.0,-23.0,5.0,10.0,6.0,6.0,18.0,5.0
11,ESPORTSTMNT01_2690210,complete,unknown,LCK CL,2022,Spring,False,2022-01-10 07:44:08,1,12.01,...,487.0,-107.0,1617.0,23.0,6.0,18.0,5.0,5.0,10.0,6.0
22,ESPORTSTMNT01_2690219,complete,unknown,LCK CL,2022,Spring,False,2022-01-10 08:38:24,1,12.01,...,555.0,-1763.0,-906.0,-22.0,1.0,1.0,3.0,3.0,3.0,1.0
23,ESPORTSTMNT01_2690219,complete,unknown,LCK CL,2022,Spring,False,2022-01-10 08:38:24,1,12.01,...,533.0,1763.0,906.0,22.0,3.0,3.0,1.0,1.0,1.0,3.0
34,8401-8401_game_1,partial,https://lpl.qq.com/es/stats.shtml?bmid=8401,LPL,2022,Spring,False,2022-01-10 09:24:26,1,12.01,...,505.0,0.0,0.0,0.0,4.0,5.0,4.0,4.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149207,9687-9687_game_3,partial,https://lpl.qq.com/es/stats.shtml?bmid=9687,DC,2022,unknown,False,2022-12-27 10:54:36,3,12.23,...,505.0,0.0,0.0,0.0,4.0,5.0,4.0,4.0,5.0,4.0
149218,9687-9687_game_4,partial,https://lpl.qq.com/es/stats.shtml?bmid=9687,DC,2022,unknown,False,2022-12-27 11:45:06,4,12.23,...,505.0,0.0,0.0,0.0,4.0,5.0,4.0,4.0,5.0,4.0
149219,9687-9687_game_4,partial,https://lpl.qq.com/es/stats.shtml?bmid=9687,DC,2022,unknown,False,2022-12-27 11:45:06,4,12.23,...,505.0,0.0,0.0,0.0,4.0,5.0,4.0,4.0,5.0,4.0
149230,9687-9687_game_5,partial,https://lpl.qq.com/es/stats.shtml?bmid=9687,DC,2022,unknown,False,2022-12-27 12:43:43,5,12.23,...,505.0,0.0,0.0,0.0,4.0,5.0,4.0,4.0,5.0,4.0


In [21]:
def getChampions(row):
    game, team = row['gameid'], row['teamname']
    player_stats = league[(league['gameid'] == game) & (league['teamname'] == team)]
    champions = player_stats['champion']
    print(champions)
    return champions

test = league_cleaned.apply(getChampions, axis=1)
test

TypeError: can only concatenate str (not "int") to str

### Baseline Model

In [18]:
X_train, X_test, y_train, y_test = train_test_split(league_cleaned[['killsat10', 'deathsat10', 'assistsat10']],
                                                   league_cleaned['result'],
                                                   test_size=0.25,
                                                   random_state=1)


dt = DecisionTreeClassifier(max_depth=2)
dt.fit(X_train, y_train)
dt.score(X_train, y_train)
dt.score(X_test, y_test)

0.5791251206175619

### Final Model

In [None]:
# TODO

### Fairness Analysis

In [None]:
# TODO