# ~~Valorant EDA Pros vs In-Game LeaderBoard~~
# Valorant EDA Data Cleaning

### Imports and DataSet Definitions

In [161]:
import pandas as pd
from get_data import fetch_data

Use `fetch_data` to retrieve up-to-date data from vlr.gg.

notes:
- `pros_all_time` and `champs22` were collected on Feb 1, 2023.
- `leaderboard` was collected at the end of Episode 4: Act 3, so it is roughly one year older than the datasets above.
- The reason for this gap is that Riot (the creators of Valorant) is very strict about player stat data, which makes it hard to access.
- url for vlr: https://www.vlr.gg/stats
- vlr columns: R = rating, ACS = average combat score, K:D = kill/death ratio, KAST = kills, assists, trade, survive %, ADR = average damage per round, KPR = kills per round, APR = assists per round, FKPR = first kills per round, FDPR = first deaths per round, HS% = headshot percentage, CL% = clutch success percentage, KMax = max kills in a single map, K, D, A, FK, FD = total kills, deaths, assists, first kills, and first deaths, in that order


In [162]:
# fetch_data("https://www.vlr.gg/stats/?event_group_id=all&event_id=all&region=all&country=all&min_rounds=200&min_rating=1550&agent=all&map_id=all&timespan=all","pros_all_time")

In [163]:
# Tokens that read_csv should treat as missing when parsing the leaderboard
# file. The bare "NA" token is intentionally not among them: the leaderboard
# `region` column uses "NA" for North America, and it must survive as a value.
na_values = [
    "",
    "#N/A",
    "#N/A N/A",
    "#NA",
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "1.#IND",
    "1.#QNAN",
    "<NA>",
    "N/A",
    "NULL",
    "NaN",
    "n/a",
    "nan",
    "null",
]

# keep_default_na=False switches off pandas' built-in marker list so that only
# the tokens listed above are converted to NaN.
df_leaderboards: pd.DataFrame = pd.read_csv(
    "./data/leaderboards.csv",
    keep_default_na=False,
    na_values=na_values,
)

# The two vlr.gg datasets are stored as pickles.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced locally.
df_champs22: pd.DataFrame = pd.read_pickle("./data/champs22")
df_pros_all_time: pd.DataFrame = pd.read_pickle("./data/pros_all_time")


  df_leaderboards: pd.DataFrame = pd.read_csv("./data/leaderboards.csv",


In [164]:
# Spot-check a single randomly drawn row of the all-time pro stats before any
# cleaning is applied (no random_state is passed, so the row varies per run).
df_pros_all_time.sample()

Unnamed: 0,Player,Agents,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKPR,FDPR,HS%,CL%,CL,KMax,K,D,A,FK,FD
3283,KugelMitSahne,,237,0.82,184.3,0.74,61%,122.1,0.59,0.26,0.09,0.1,22%,20%,6/30,17,140,190,61,22,23


### Cleaning Up Data
It goes without saying that all this data, gathered from different parts of the web, is extremely messy: dtypes were all over the place, column names needed to be matched, and there were a bunch of other issues, so a lot of cleanup was necessary. As usual, I found these bits and pieces of mess one by one, so the code could be written much more efficiently; I just added steps as I sifted through the data. To keep it at least somewhat organized, I put all the cleaning procedures in the single cell below.

In [165]:
pd.set_option('display.max_columns', None)

# Source-specific column labels -> shared labels. The targets are mixed-case
# here but every label is lower-cased right after the rename, so the final
# names are e.g. "rank", "hs_pct", "k:d".
# NOTE(review): the rename runs BEFORE lower-casing and is case-sensitive, so
# the "player" key does not match the "Player" column of the vlr.gg frames;
# they end up with a "player" column while the leaderboard has "name".
# Left as-is so downstream column names do not change - confirm intent.
column_renames = {
    "rating": "Rank",
    "damage_round": "ADR",
    "headshot_percent": "HS_pct",
    "first_bloods": "FK",
    "kills": "K",
    "deaths": "D",
    "assists": "A",
    "kd_ratio": "K:D",
    "kills_round": "KPR",
    "most_kills": "KMax",
    "score_round": "ACS",
    "win_percent": "Win_pct",
    "HS%": "HS_pct",
    "CL%": "CL_pct",
    "player": "name",
}

# --- Pass 1 (all frames): unify column labels, drop exact duplicate rows ---
dfs = [df_pros_all_time, df_leaderboards, df_champs22]
for dataframe in dfs:
    dataframe.rename(columns=column_renames, inplace=True)
    dataframe.columns = dataframe.columns.str.lower()
    dataframe.drop_duplicates(inplace=True)

# --- Pass 2 (vlr.gg frames): drop unused columns, parse percents, fill NaN ---
# The percent columns hold strings such as "61%". They must be parsed BEFORE
# the zero-fill: `.str.replace` yields NaN for any non-string element, so
# filling with the integer 0 first would turn those zeros straight back into
# NaN and leave missing values in the "clean" output.
parse_vtr_cols = ["kast", "hs_pct", "cl_pct"]
for vlr_frame in (df_champs22, df_pros_all_time):
    vlr_frame.drop(columns=["agents", "cl"], inplace=True)
    for col in parse_vtr_cols:
        without_sign = vlr_frame[col].str.replace("%", "", regex=False)
        vlr_frame[col] = pd.to_numeric(without_sign)
    # Remaining gaps (including unparsed percent cells) become 0.
    vlr_frame.fillna(0, inplace=True)

# --- Pass 3 (leaderboard frame) ---
df_leaderboards.drop(columns="fk", inplace=True)
# Missing entries are replaced by the literal string "none".
df_leaderboards.fillna("none", inplace=True)

# Label-like columns are stored as pandas categoricals.
categoricals = ["region", "rank", "agent_1", "agent_2", "agent_3",
                "gun1_name", "gun2_name", "gun3_name"]
for col in categoricals:
    df_leaderboards[col] = df_leaderboards[col].astype("category")

# These counts are strings with thousands separators (e.g. "1,234").
# pd.to_numeric cannot parse the separator itself, so the commas are removed
# first; a plain `df[numerical].apply(pd.to_numeric)` would not be enough.
numerical = ["headshots", "k", "d", "a", "gun1_kills", "gun2_kills"]
for col in numerical:
    without_commas = df_leaderboards[col].str.replace(",", "", regex=False)
    df_leaderboards[col] = pd.to_numeric(without_commas)


In [166]:
# Count the missing values left in each leaderboard column after cleaning.
df_leaderboards.isna().sum()

region        0
name          0
tag           0
rank          0
adr           0
headshots     0
hs_pct        0
aces          0
clutches      0
flawless      0
k             0
d             0
a             0
k:d           0
kpr           0
kmax          0
acs           0
wins          0
win_pct       0
agent_1       0
agent_2       0
agent_3       0
gun1_name     0
gun1_head     0
gun1_body     0
gun1_legs     0
gun1_kills    0
gun2_name     0
gun2_head     0
gun2_body     0
gun2_legs     0
gun2_kills    0
gun3_name     0
gun3_head     0
gun3_body     0
gun3_legs     0
gun3_kills    0
dtype: int64

In [167]:
# List every leaderboard column's dtype to check the category / numeric
# conversions performed in the cleaning cell.
df_leaderboards.dtypes

region        category
name            object
tag             object
rank          category
adr            float64
headshots        int64
hs_pct         float64
aces             int64
clutches         int64
flawless         int64
k                int64
d                int64
a                int64
k:d            float64
kpr            float64
kmax             int64
acs            float64
wins             int64
win_pct        float64
agent_1       category
agent_2       category
agent_3       category
gun1_name     category
gun1_head        int64
gun1_body        int64
gun1_legs        int64
gun1_kills       int64
gun2_name     category
gun2_head        int64
gun2_body        int64
gun2_legs        int64
gun2_kills       int64
gun3_name     category
gun3_head        int64
gun3_body        int64
gun3_legs        int64
gun3_kills       int64
dtype: object

### Saving to new pickle
After all that cleaning, I'm going to save this data and continue in a new notebook.

In [None]:
# Write each cleaned frame to its own pickle under ./data so that a follow-up
# notebook can load the cleaned data directly.
cleaned_frames = {
    "./data/df_champs22_clean": df_champs22,
    "./data/df_leaderboards_clean": df_leaderboards,
    "./data/df_pros_all_time_clean": df_pros_all_time,
}
for pickle_path, frame in cleaned_frames.items():
    frame.to_pickle(pickle_path)