In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

In [2]:
# Study data files
lck = "spring_2020/lck_spring_2020.csv"
lec = "spring_2020/lec_spring_2020.csv"
lpl= "spring_2020/lpl_spring_2020.csv"
lcs= "spring_2020/lcs_spring_2020.csv"
region = "spring_2020/region_games.csv"
total = "spring_2020/total_data.csv"
min_d = "spring_2020/min_diversity.csv"
max_d = "spring_2020/max_diversity.csv"

#read data into dfs
lck_data = pd.read_csv(lck)
lec_data = pd.read_csv(lec)
lpl_data = pd.read_csv(lpl)
lcs_data = pd.read_csv(lcs)
min_diversity = pd.read_csv(min_d)
max_diversity = pd.read_csv(max_d)
region_data = pd.read_csv(region)
total_data = pd.read_csv(total)

In [3]:
#cleaning data
lck_data['Presence']=pd.to_numeric(lck_data['Presence'].str.replace('%',''))
lcs_data['Presence']=pd.to_numeric(lcs_data['Presence'].str.replace('%',''))
lec_data['Presence']=pd.to_numeric(lec_data['Presence'].str.replace('%',''))
lpl_data['Presence']=pd.to_numeric(lpl_data['Presence'].str.replace('%',''))

In [4]:
#cleaning data
lck_data['Winrate']=pd.to_numeric(lck_data['Winrate'].str.replace('%',''))
lcs_data['Winrate']=pd.to_numeric(lcs_data['Winrate'].str.replace('%',''))
lec_data['Winrate']=pd.to_numeric(lec_data['Winrate'].str.replace('%',''))
lpl_data['Winrate']=pd.to_numeric(lpl_data['Winrate'].str.replace('%',''))

In [5]:
region_data

Unnamed: 0,region,total_games
0,LCK,223
1,LCS,94
2,LEC,90
3,LPL,330


In [6]:
#recalculate the presence column for more exact nubmers
lck_data['True Presence']= (lck_data['Picks']+lck_data['Bans'])/223*100
lcs_data['True Presence']= (lcs_data['Picks']+lcs_data['Bans'])/94*100
lec_data['True Presence']= (lec_data['Picks']+lec_data['Bans'])/90*100
lpl_data['True Presence']= (lpl_data['Picks']+lpl_data['Bans'])/330*100

In [7]:
#export cleaned df to csv
lck_data.to_csv('spring_2020/cleaned/lck_data.csv')
lcs_data.to_csv('spring_2020/cleaned/lcs_data.csv')
lec_data.to_csv('spring_2020/cleaned/lec_data.csv')
lpl_data.to_csv('spring_2020/cleaned/lpl_data.csv')

In [8]:
#set index to 'champion' column so we don't have index column when we import the tables to html
lck_data=lck_data.set_index('Champion')
lec_data=lec_data.set_index('Champion')
lpl_data=lpl_data.set_index('Champion')
lcs_data=lcs_data.set_index('Champion')

In [9]:
#dfs in html formate
lck_data.to_html('raw_data_html/lck_data.html')
lcs_data.to_html('raw_data_html/lcs_data.html')
lec_data.to_html('raw_data_html/lec_data.html')
lpl_data.to_html('raw_data_html/lpl_data.html')

In [10]:
from functools import reduce

In [11]:
#join all the tables so I can calculate the champion presence for all the regions combined
data_frames=[lck_data,lcs_data,lpl_data,lec_data]
total_data = reduce(lambda  left,right: pd.merge(left,right,on=['Champion'],
                                            how='outer'), data_frames)

In [12]:
total_data

Unnamed: 0_level_0,Picks_x,Bans_x,Presence_x,Avg BT_x,Wins_x,Losses_x,Winrate_x,KDA_x,GT_x,CSM_x,...,Avg BT_y,Wins_y,Losses_y,Winrate_y,KDA_y,GT_y,CSM_y,DPM_y,CSD@15_y,True Presence_y
Champion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aphelios,65,156,99,3.3,37.0,28.0,57.0,5.3,35:05:00,9.9,...,3.9,23.0,23.0,50.0,3.5,33:58:00,9.8,492.0,3.7,95.555556
Sett,82,138,99,3.8,40.0,42.0,49.0,2.3,33:42:00,6.9,...,4.3,18.0,20.0,47.0,2.5,33:37:00,7.1,357.0,3.7,72.222222
Ornn,94,98,86,5,54.0,40.0,57.0,3.7,33:27:00,7.4,...,4.2,12.0,16.0,43.0,2.9,33:20:00,6.8,309.0,-9.7,77.777778
LeBlanc,36,125,72,6,21.0,15.0,58.0,4.6,32:51:00,8.3,...,6.6,10.0,12.0,45.0,3.7,35:16:00,8.3,517.0,2.6,67.777778
Zoe,67,91,71,4.8,35.0,32.0,52.0,5.4,33:46:00,8.3,...,8.5,6.0,10.0,38.0,3.4,31:41:00,8.2,533.0,-1.2,22.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Veigar,0,0,0,-,,,,,0:00,,...,8,1.0,1.0,50.0,3.8,29:57:00,8.0,521.0,-7.0,4.444444
Ahri,0,0,0,-,,,,,0:00,,...,-,,,,,0:00,,,,0.000000
Sona,0,0,0,-,,,,,0:00,,...,-,0.0,1.0,0.0,1.3,29:33:00,3.1,417.0,-96.0,1.111111
Aurelion Sol,0,0,0,-,,,,,0:00,,...,-,,,,,0:00,,,,0.000000


In [14]:
pd.DataFrame.to_csv(total_data, 'spring_2020/total_data.csv', index=True)

In [None]:
#cleaned the total data in excel, easier that way