# Exploratory Data Analysis - Valorant

## Libs

In [19]:
# Data manipulation
import pandas as pd
from datetime import datetime
import pathlib

In [20]:
# Charts
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

## Initial Settings

In [21]:
abs_path = pathlib.Path.cwd().parent

In [22]:
# Matches
file_read = pathlib.Path(abs_path).joinpath('data').joinpath('matches').joinpath('matches_processed.csv')
df_matches_raw = pd.read_csv(file_read)
del df_matches_raw['Unnamed: 0']

## Dataset Dimenssions

In [23]:
min_data = df_matches_raw.dateMatch.min()
max_data = df_matches_raw.dateMatch.max()
unique_userId = df_matches_raw.userId.nunique()
unique_mapId = df_matches_raw.mapName.nunique()
unique_sever = df_matches_raw.server.nunique()
row_number = df_matches_raw.shape[0]
col_number = df_matches_raw.shape[1]

print('The dataset has a start date at : {} \nEnd date at : {} Match \nWith: unique players : {} \nMaps : {} \nGame servers : {} \nRow number : {} \nColumn number : {}'. format(min_data, max_data, unique_userId, unique_mapId, unique_sever, row_number, col_number))

The dataset has a start date at : 2020-06-28 
End date at : 2022-11-10 Match 
With: unique players : 1871 
Maps : 8 
Game servers : 6 
Row number : 333835 
Column number : 57


In [134]:
df_matches_raw_describe = df_matches_raw.describe()
df_matches_raw_describe = pd.pivot_table(df_matches_raw_describe, columns=["count", 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])
df_matches_raw_describe

Unnamed: 0,25%,50%,75%,count,max,mean,min,std
ability1CastsValue,6.0,10.0,16.0,333835.0,87.0,11.527281,0.0,7.283314
ability2CastsValue,13.0,20.0,26.0,333835.0,109.0,20.500013,0.0,11.277662
acesValue,0.0,0.0,0.0,333835.0,3.0,0.027571,0.0,0.167447
assistsValue,3.0,4.0,7.0,333835.0,26.0,4.992239,0.0,3.220758
clutchesLostValue,1.0,2.0,3.0,333835.0,15.0,2.059721,0.0,1.603646
clutchesValue,0.0,0.0,1.0,333835.0,8.0,0.653431,0.0,0.931498
competitiveTier,26.0,27.0,27.0,333835.0,27.0,26.663373,25.0,0.567392
damagePerRoundValue,119.153846,145.111111,173.25,333835.0,461.0,147.668696,0.0,41.148931
damageReceivedValue,2497.0,2930.0,3354.0,333835.0,8020.0,2918.106265,0.0,711.47083
damageValue,2438.0,3083.0,3784.0,333835.0,11217.0,3144.430006,0.0,1023.790909


## EDA Matches

### Number of matches per day

In [24]:
matches_per_day = (
    
                    df_matches_raw.groupby(df_matches_raw.dateMatch)['matchId']
                    .count()
                    .reset_index(name='count')
                    .reset_index(drop = True)
                    )


In [25]:
fig = px.line(x = matches_per_day['dateMatch'], y = matches_per_day['count'], title= 'Number of matches per day')
fig.show()

As we can visually analyze in the graph above that the vast majority of players are drawn only 2022-08-25.

The abrupt drop that starts on 2022-10-19 is due to the end of the season.

It is worth mentioning that the player is able to change his own name, with this negatively impacting the tagname that is used to profile the player in our crawler

### Means of matches per days of week

In [26]:
weekDayOrdered = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

matches_per_day['weekDay'] = matches_per_day['dateMatch'].astype('datetime64').dt.day_name()

mean_matches_per_days_week = (
                                matches_per_day.groupby(matches_per_day.weekDay)['count']
                                .mean()
                                .reindex(weekDayOrdered) 
                                .reset_index(name = 'mean')
                                .reset_index(drop = True)
                                .round(3)
                                )

In [27]:
fig = px.bar(mean_matches_per_days_week, x = 'weekDay', y = 'mean', title = 'Means of matches per days of week')
fig.show()

In this graph of average departures per day, we certify that Wednesday it's the day with the highest number of matches, with an average of 547,624 matches played.

### Means time per days of week

In [28]:
means_time_matches = df_matches_raw[['matchId', 'weekDay', 'playtimeValue']]
means_time_matches['playtimeValueMinutes'] = means_time_matches.playtimeValue/60

means_time_matches = (
                        means_time_matches.groupby('weekDay')['playtimeValueMinutes']
                        .mean()
                        .reindex(weekDayOrdered)
                        .reset_index(name = 'mean')
                        .reset_index(drop = True)                     
                        .round(2)

)

In [29]:
fig = px.line(means_time_matches, x = 'weekDay', y = 'mean', title = 'Means time per days of week')
fig.show()

In contrast, we can see that Wednesday does not have the longest time in minutes of matches played, Tuesday takes the lead in a matter of minutes.

But we can also analyze that the average time of matches is very well distributed between the days of the week, not exceeding 5 minutes, neither more nor less

### Top seven days with most matches played

In [30]:
top_seven_days_matches = (
                            df_matches_raw.groupby(['dateMatch', 'weekDay'])['weekDay']
                            .count()
                            .reset_index(name = 'count')
                            .reset_index(drop = True)
                            .sort_values('count', ascending = False)
                            .head(7)
                            )

In [31]:
fig = px.bar(top_seven_days_matches, x='weekDay', y='count',
             hover_data=['dateMatch', 'count'], color='count', height=400, title = 'Top seven days with most matches played')
fig.show()

If we add up all the matches by day of the week, once again we find Wednesday being the day of the week with the highest number of matches played. Since the second day of the week with the most number of games played is Friday, containing a total of 10.156, 854 games less than Wednesday.

## EDA Players

###

### Number of matches per player

In [32]:
matches_per_player = (
                        df_matches_raw.groupby(['userId'])['userId']
                        .count()
                        .reset_index(name = 'count')
                        .reset_index(drop = True)
                        )

matches_per_player

Unnamed: 0,userId,count
0,Z%232000,180
1,00xampa%23HNDEK,180
2,010 farewell%23RRR,180
3,06 cAstAwAy 06%23YARGI,180
4,06 발로 프로지망%23USW,180
...,...,...
1866,한석봉%23blebe,180
1867,한섭새끼들다썰고아섭으로도망감%23이지까까,180
1868,허 민 규%235841,180
1869,헬스장덤벨연쇄절도범%233대500,180


### Number of hours played per player

In [124]:
total_time_player = df_matches_raw.groupby(['userId'])[['playtimeValue']].sum().reset_index()
total_time_player['playtimeValueMinutes'] = (df_matches_raw['playtimeValue']/60).round(2)


total_time_player

Unnamed: 0,userId,playtimeValue,playtimeValueMinutes
0,Z%232000,357449,37.08
1,00xampa%23HNDEK,361857,31.33
2,010 farewell%23RRR,354616,33.03
3,06 cAstAwAy 06%23YARGI,367082,42.08
4,06 발로 프로지망%23USW,374216,51.22
...,...,...,...
1866,한석봉%23blebe,365979,40.85
1867,한섭새끼들다썰고아섭으로도망감%23이지까까,362797,31.78
1868,허 민 규%235841,362889,30.18
1869,헬스장덤벨연쇄절도범%233대500,363814,24.62


In [125]:
fig = px.line(total_time_player, x = 'userId', y = 'playtimeValueMinutes', title = 'Means time per days of week')
fig.show()

In [128]:
fig = px.box(total_time_player, y="playtimeValueMinutes")
fig.show()

### Number of wins per player

In [156]:
number_wins_player = (
                        df_matches_raw.groupby(['userId', 'result'])['result']
                        .count()
                        .reset_index(name = 'count')
                        .sort_values(['count'], ascending = False)
                        .reset_index(drop = True)

)

number_wins_player

Unnamed: 0,userId,result,count
0,OiShi As1an%232004,victory,138
1,LFT Virtue%231603,victory,133
2,KONE Haimian%230010,victory,132
3,nAts%23TATAR,victory,127
4,FNATIC Leo%23BONK,victory,126
...,...,...,...
5280,Creamydreamy%23Cream,tied,1
5281,SH ARTZIN%23scott,tied,1
5282,Megumiツ%23RD18,tied,1
5283,circu%23fpj,tied,1


In [159]:
fig = px.box(number_wins_player, x = "result", y = "count", color = "result")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.show()