### Data Cleaning & EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Show up to 100 columns and 100 rows when a frame is displayed.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
# Read one CSV per pro player from the data-collection stage,
# in the order oski, vati, monkeymoon.
oski, vati, monkeymoon = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../1-data-collection/data/oski.csv',
        '../../1-data-collection/data/vati.csv',
        '../../1-data-collection/data/monkeymoon.csv',
    )
)

In [4]:
# (rows, columns) of each player's frame, in the order oski, vati, monkeymoon.
tuple(games.shape for games in (oski, vati, monkeymoon))

((325, 85), (384, 85), (253, 86))

In [5]:
# Build a stat-by-player table of mean values for a first side-by-side look.
# describe() summarises only the numeric columns, so the boolean `mvp` flag
# is not part of these means and is added separately below.
oski_mean = oski.describe().T[['mean']].rename(columns={'mean': 'oski'})
vati_mean = vati.describe().T[['mean']].rename(columns={'mean': 'vati'})
monk_mean = monkeymoon.describe().T[['mean']].rename(columns={'mean': 'monkeymoon'})

# One row per stat, one column per player.
pros = pd.concat([oski_mean, vati_mean, monk_mean], axis=1)

# MVP rate = share of games flagged True. The mean of a boolean column is
# exactly that share. Looking it up as value_counts(normalize=True)[1] is
# fragile: it fails for a player with no MVP games (the result then has a
# single entry), and how an integer key is resolved against a boolean index
# differs between pandas versions.
pros.loc['mvp'] = [oski['mvp'].mean(), vati['mvp'].mean(), monkeymoon['mvp'].mean()]

# Drop this stat from the comparison.
# NOTE(review): it appears not to be recorded for every player (monkeymoon has
# one more column than the other two frames) - confirm.
pros = pros.drop(index='goals_against_while_last_defender')

pros

Unnamed: 0,oski,vati,monkeymoon
shots,3.0,2.96875,3.288538
shots_against,6.775385,6.895833,7.671937
goals,0.818462,0.770833,0.87747
goals_against,1.747692,1.596354,2.0
saves,1.258462,1.544271,1.98419
assists,0.495385,0.627604,0.612648
score,356.544615,384.794271,451.418972
shooting_percentage,26.631135,24.980985,24.702359
bpm,494.030769,424.106771,408.790514
bcpm,497.376185,435.693972,422.106681


- Initial comparison of the three pro players and their mean stats over ~300 games each from international tournaments
    - Oski appears to have high boost usage, spends the most time furthest forward, and spends the most time high in the air
    - Vatira mostly places between the other two pros in most stats on average, indicating a well-rounded playstyle
        - he has the lowest time spent closest to the ball and spends the most time on 75-100 boost
    - Monkey Moon spends the most time on the ground and the least time high in the air
        - he has a very high proportion of time spent on 25-75 boost, indicating strong boost management - particularly given his low time spent on 0-25 boost

In [32]:
# Columns removed from the combined frame before it is saved. The list is
# built group by group; the element order matches the original single list.
to_drop = []

# Scoreboard-style stats.
to_drop += [
    'shots_against',
    'goals_against',
    'shooting_percentage',
    'bpm',
]

# Boost pickup / steal breakdowns (the amount_collected_* and amount_stolen
# totals stay in the frame).
to_drop += [
    'amount_stolen_big',
    'amount_stolen_small',
    'count_collected_big',
    'count_collected_small',
    'count_stolen_small',
    'count_stolen_big',
    'amount_overfill_stolen',
]

# Absolute time spent at each boost level (the percent_* versions are kept).
to_drop += [
    'time_zero_boost',
    'time_full_boost',
    'time_boost_0_25',
    'time_boost_25_50',
    'time_boost_50_75',
    'time_boost_75_100',
]

# Speed, distance and air/ground time totals.
to_drop += [
    'avg_speed',
    'total_distance',
    'time_supersonic_speed',
    'time_boost_speed',
    'time_slow_speed',
    'time_ground',
    'time_low_air',
    'time_high_air',
    'time_powerslide',
]

# Absolute positioning times (the percent_* versions are kept).
to_drop += [
    'time_defensive_third',
    'time_neutral_third',
    'time_offensive_third',
    'time_defensive_half',
    'time_offensive_half',
    'time_behind_ball',
    'time_infront_ball',
    'time_most_back',
    'time_most_forward',
    'time_closest_to_ball',
    'time_farthest_from_ball',
]

# Also excluded from the earlier per-player mean comparison.
to_drop += ['goals_against_while_last_defender']

In [80]:
# Stack the three players' games into one frame. ignore_index=True gives the
# combined rows a fresh 0..n-1 index instead of repeating each file's own
# 0-based index, so every row label is unique.
df = pd.concat([oski, vati, monkeymoon], axis=0, ignore_index=True)

In [81]:
# Number of rows with and without the MVP flag across all three players.
df['mvp'].value_counts()

False    711
True     251
Name: mvp, dtype: int64

In [82]:
# Recode the MVP flag as integers (False -> 0, True -> 1).
df['mvp'] = df['mvp'].astype(int)

In [83]:
# Preview the first five rows of the combined frame.
df.head(5)

Unnamed: 0.1,Unnamed: 0,shots,shots_against,goals,goals_against,saves,assists,score,mvp,shooting_percentage,bpm,bcpm,avg_amount,amount_collected,amount_stolen,amount_collected_big,amount_stolen_big,amount_collected_small,amount_stolen_small,count_collected_big,count_stolen_big,count_collected_small,count_stolen_small,amount_overfill,amount_overfill_stolen,amount_used_while_supersonic,time_zero_boost,percent_zero_boost,time_full_boost,percent_full_boost,time_boost_0_25,time_boost_25_50,time_boost_50_75,time_boost_75_100,percent_boost_0_25,percent_boost_25_50,percent_boost_50_75,percent_boost_75_100,avg_speed,total_distance,time_supersonic_speed,time_boost_speed,time_slow_speed,time_ground,time_low_air,time_high_air,time_powerslide,count_powerslide,avg_powerslide_duration,avg_speed_percentage,percent_slow_speed,percent_boost_speed,percent_supersonic_speed,percent_ground,percent_low_air,percent_high_air,avg_distance_to_ball,avg_distance_to_ball_possession,avg_distance_to_ball_no_possession,avg_distance_to_mates,time_defensive_third,time_neutral_third,time_offensive_third,time_defensive_half,time_offensive_half,time_behind_ball,time_infront_ball,time_most_back,time_most_forward,time_closest_to_ball,time_farthest_from_ball,percent_defensive_third,percent_offensive_third,percent_neutral_third,percent_defensive_half,percent_offensive_half,percent_behind_ball,percent_infront_ball,percent_most_back,percent_most_forward,percent_closest_to_ball,percent_farthest_from_ball,inflicted,taken,player_name,goals_against_while_last_defender
0,EU P-K KC vs TL G7 2023-02-26.19.11,3,6,2,1,0,0,378,0,66.666664,429,420.76846,48.78,2338,175,1535,0,803,175,18,0,65,16,296,6,389,43.2,12.957797,33.87,10.159272,108.33,89.79,34.43,91.14,33.467205,27.739504,10.636721,28.156569,1521,479414,46.5,128.22,158.76,193.4,121.31,18.79,9.73,84,0.12,66.13043,47.60705,38.449085,13.943866,57.991005,36.374813,5.634183,2836,2827,2882,3486,188.59,87.07,57.83,235.09,98.4,261.02,72.47,118.8,95.2,97.9,116.2,56.550423,17.34085,26.10873,70.49387,29.506132,78.26921,21.730787,35.63394,28.555145,29.365007,34.854073,0,3,Oski,
1,EU P-K KC vs TL G6 2023-02-26.19.02,2,12,1,2,1,0,337,0,50.0,427,443.67334,49.62,2749,389,1535,200,1214,189,18,2,93,17,280,7,457,60.76,16.343878,22.06,5.933936,133.1,71.06,73.76,99.06,35.306915,18.849806,19.566025,26.277256,1593,585813,66.32,166.66,153.35,220.85,146.22,19.26,10.71,94,0.11,69.26087,39.694046,43.139282,17.16667,57.16615,37.84847,4.985375,2840,2821,2778,3664,190.02,141.74,54.57,279.29,107.04,286.52,99.81,128.3,107.2,114.6,126.7,49.18593,14.125229,36.688843,72.29311,27.70688,74.16457,25.835426,34.511513,28.835808,30.826338,34.081127,1,0,Oski,
2,EU P-K KC vs TL G5 2023-02-26.18.51,2,5,1,0,2,0,396,0,50.0,473,483.6058,42.02,2677,507,1905,322,772,185,21,4,69,16,200,77,335,47.3,14.241411,40.02,12.049499,127.52,95.88,49.99,60.0,38.2495,28.759111,14.994452,17.99694,1624,529064,60.04,148.98,131.26,178.96,137.89,23.43,9.61,87,0.11,70.608696,38.574112,43.781593,17.644293,52.591984,40.52251,6.885506,2860,2622,3089,3685,181.46,94.79,64.03,239.65,100.63,233.08,107.2,96.3,121.2,109.8,100.2,53.32667,18.816856,27.856472,70.42729,29.572704,68.49654,31.503468,28.99467,36.491734,33.059345,30.16891,2,1,Oski,
3,EU P-K KC vs TL G4 2023-02-26.18.44,2,11,1,3,2,0,408,0,50.0,435,427.79123,43.45,2624,352,1868,198,756,154,21,2,71,16,241,1,315,50.67,13.7679,30.7,8.341711,184.0,64.8,42.98,73.18,50.416485,17.755373,11.776633,20.051514,1517,531429,54.19,142.95,182.78,215.02,134.2,30.7,11.17,110,0.1,65.95652,48.11013,37.626343,14.26353,56.596123,35.323223,8.080648,2760,2794,2732,3498,223.85,103.77,52.3,285.03,94.89,300.52,79.4,114.1,105.2,114.2,99.0,58.920303,13.766057,27.313646,75.02369,24.976313,79.10086,20.899137,31.002907,28.584627,31.030079,26.89998,1,3,Oski,
4,EU P-K KC vs TL G3 2023-02-26.18.35,0,12,0,3,2,0,212,0,0.0,456,489.8436,42.65,2725,250,1900,0,825,250,22,0,66,22,215,14,445,53.08,15.902691,22.83,6.839835,134.18,81.36,50.03,65.9,40.48028,24.545208,15.093372,19.881136,1590,509663,63.63,142.1,136.56,199.96,128.37,13.95,9.35,84,0.11,69.13043,39.895992,41.514507,18.5895,58.420002,37.50438,4.07561,2732,2586,2872,3678,160.61,127.42,54.26,228.74,113.55,238.97,103.31,85.0,137.0,122.2,104.0,46.922203,15.852056,37.225746,66.82637,33.173622,69.81711,30.182892,25.465876,41.045,36.610943,31.158249,1,3,Oski,


In [84]:
# Re-check the MVP counts after the integer recode (labels are now 0 / 1).
df['mvp'].value_counts()

0    711
1    251
Name: mvp, dtype: int64

In [85]:
# Share of rows contributed by each player.
df['player_name'].value_counts(normalize=True)

Vati           0.399168
Oski           0.337838
M0nkey M00n    0.262994
Name: player_name, dtype: float64

In [86]:
# Remove every column listed in to_drop from the combined frame.
df = df.drop(labels=to_drop, axis='columns')

In [87]:
# Count missing values per remaining column.
df.isna().sum()

Unnamed: 0                            0
shots                                 0
goals                                 0
saves                                 0
assists                               0
score                                 0
mvp                                   0
bcpm                                  0
avg_amount                            0
amount_collected                      0
amount_stolen                         0
amount_collected_big                  0
amount_collected_small                0
amount_overfill                       0
amount_used_while_supersonic          0
percent_zero_boost                    0
percent_full_boost                    0
percent_boost_0_25                    0
percent_boost_25_50                   0
percent_boost_50_75                   0
percent_boost_75_100                  0
count_powerslide                      0
avg_powerslide_duration               0
avg_speed_percentage                  0
percent_slow_speed                    0


In [88]:
# Discard every row that still has a missing value. Rebinding the result
# (rather than inplace=True) keeps the step explicit and chainable.
df = df.dropna()

In [89]:
# (rows, columns) remaining after the column drop and the dropna step.
df.shape

(958, 48)

In [90]:
# Preview the first five rows of the cleaned frame.
df.head(5)

Unnamed: 0.1,Unnamed: 0,shots,goals,saves,assists,score,mvp,bcpm,avg_amount,amount_collected,amount_stolen,amount_collected_big,amount_collected_small,amount_overfill,amount_used_while_supersonic,percent_zero_boost,percent_full_boost,percent_boost_0_25,percent_boost_25_50,percent_boost_50_75,percent_boost_75_100,count_powerslide,avg_powerslide_duration,avg_speed_percentage,percent_slow_speed,percent_boost_speed,percent_supersonic_speed,percent_ground,percent_low_air,percent_high_air,avg_distance_to_ball,avg_distance_to_ball_possession,avg_distance_to_ball_no_possession,avg_distance_to_mates,percent_defensive_third,percent_offensive_third,percent_neutral_third,percent_defensive_half,percent_offensive_half,percent_behind_ball,percent_infront_ball,percent_most_back,percent_most_forward,percent_closest_to_ball,percent_farthest_from_ball,inflicted,taken,player_name
0,EU P-K KC vs TL G7 2023-02-26.19.11,3,2,0,0,378,0,420.76846,48.78,2338,175,1535,803,296,389,12.957797,10.159272,33.467205,27.739504,10.636721,28.156569,84,0.12,66.13043,47.60705,38.449085,13.943866,57.991005,36.374813,5.634183,2836,2827,2882,3486,56.550423,17.34085,26.10873,70.49387,29.506132,78.26921,21.730787,35.63394,28.555145,29.365007,34.854073,0,3,Oski
1,EU P-K KC vs TL G6 2023-02-26.19.02,2,1,1,0,337,0,443.67334,49.62,2749,389,1535,1214,280,457,16.343878,5.933936,35.306915,18.849806,19.566025,26.277256,94,0.11,69.26087,39.694046,43.139282,17.16667,57.16615,37.84847,4.985375,2840,2821,2778,3664,49.18593,14.125229,36.688843,72.29311,27.70688,74.16457,25.835426,34.511513,28.835808,30.826338,34.081127,1,0,Oski
2,EU P-K KC vs TL G5 2023-02-26.18.51,2,1,2,0,396,0,483.6058,42.02,2677,507,1905,772,200,335,14.241411,12.049499,38.2495,28.759111,14.994452,17.99694,87,0.11,70.608696,38.574112,43.781593,17.644293,52.591984,40.52251,6.885506,2860,2622,3089,3685,53.32667,18.816856,27.856472,70.42729,29.572704,68.49654,31.503468,28.99467,36.491734,33.059345,30.16891,2,1,Oski
3,EU P-K KC vs TL G4 2023-02-26.18.44,2,1,2,0,408,0,427.79123,43.45,2624,352,1868,756,241,315,13.7679,8.341711,50.416485,17.755373,11.776633,20.051514,110,0.1,65.95652,48.11013,37.626343,14.26353,56.596123,35.323223,8.080648,2760,2794,2732,3498,58.920303,13.766057,27.313646,75.02369,24.976313,79.10086,20.899137,31.002907,28.584627,31.030079,26.89998,1,3,Oski
4,EU P-K KC vs TL G3 2023-02-26.18.35,0,0,2,0,212,0,489.8436,42.65,2725,250,1900,825,215,445,15.902691,6.839835,40.48028,24.545208,15.093372,19.881136,84,0.11,69.13043,39.895992,41.514507,18.5895,58.420002,37.50438,4.07561,2732,2586,2872,3678,46.922203,15.852056,37.225746,66.82637,33.173622,69.81711,30.182892,25.465876,41.045,36.610943,31.158249,1,3,Oski


---
## Data saved for Classification modeling

- The three players' dataframes were combined and cleaned: columns that are either unhelpful or too similar to other columns were dropped, and rows with missing values were removed.


In [91]:
# Write the cleaned frame to disk, leaving out the row index.
df.to_csv(path_or_buf='../data/pros.csv', index=False)