# Exploratory Data Analysis - Valorant

## Libs

In [143]:
# Data manipulation
import pandas as pd
from datetime import datetime
import pathlib

In [144]:
# Charts
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
# Silence every warning raised from here on, for the whole kernel session.
# NOTE(review): this blanket filter also hides pandas/plotly deprecation and
# SettingWithCopy warnings produced by later cells — consider narrowing it.
warnings.filterwarnings("ignore")

## Initial Settings

In [145]:
# Base directory = parent of the current working directory; the data files
# are located relative to it.
# NOTE(review): assumes the kernel is started one level below the project root — confirm.
working_dir = pathlib.Path.cwd()
abs_path = working_dir.parent

In [146]:
# Matches
# Processed matches file: <abs_path>/data/matches/matches_processed.csv
file_read = pathlib.Path(abs_path, 'data', 'matches', 'matches_processed.csv')

# Load the CSV and discard its 'Unnamed: 0' column
# (presumably the index written out when the file was saved — confirm).
df_matches_raw = pd.read_csv(file_read).drop(columns='Unnamed: 0')

## Dataset Dimensions

In [147]:
# Headline figures for the matches table: date range, distinct counts and shape.
min_data = df_matches_raw['dateMatch'].min()
max_data = df_matches_raw['dateMatch'].max()
unique_userId = df_matches_raw['userId'].nunique()
unique_mapId = df_matches_raw['mapName'].nunique()  # distinct maps are counted on mapName
unique_sever = df_matches_raw['server'].nunique()
row_number, col_number = df_matches_raw.shape

# One placeholder per figure, filled in the order listed below.
summary_template = 'The dataset has a start date at : {} \nEnd date at : {} Match \nWith: unique players : {} \nMaps : {} \nGame servers : {} \nRow number : {} \nColumn number : {}'
print(
    summary_template.format(
        min_data,
        max_data,
        unique_userId,
        unique_mapId,
        unique_sever,
        row_number,
        col_number,
    )
)

The dataset has a start date at : 2020-06-28 
End date at : 2022-11-10 Match 
With: unique players : 1871 
Maps : 8 
Game servers : 6 
Row number : 333835 
Column number : 57


## EDA Matches

### Number of matches per day

In [148]:
# Count the matchId entries recorded for each dateMatch value and expose the
# result as a flat two-column frame: dateMatch, count.
matches_per_day = (
    df_matches_raw
    .groupby('dateMatch')['matchId']
    .count()
    .rename('count')
    .reset_index()
)


In [149]:
# Line chart of the per-date match counts.
fig = px.line(
    matches_per_day,
    x='dateMatch',
    y='count',
    title='Number of matches per day',
)
fig.show()

As we can see in the chart above, the vast majority of players were drawn only from 2022-08-25 onward.

The abrupt drop that starts on 2022-10-19 is due to the end of the season.

It is worth mentioning that players are able to change their own names, which negatively impacts the tagname our crawler uses to profile each player.

### Means of matches per days of week

In [199]:
# Calendar order for weekday-based aggregations; groupby would otherwise
# return the day names sorted alphabetically.
weekDayOrdered = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Weekday name of every date in the per-day counts.
# pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects casts to
# a unit-less 'datetime64' dtype, which made this cell fail on current versions.
matches_per_day['weekDay'] = pd.to_datetime(matches_per_day['dateMatch']).dt.day_name()

# Average of the daily match counts per weekday, Monday through Sunday,
# rounded to 3 decimals. Columns: weekDay, mean.
mean_matches_per_days_week = (
    matches_per_day
    .groupby('weekDay')['count']
    .mean()
    .reindex(weekDayOrdered)
    .reset_index(name='mean')
    .round(3)
)

In [233]:
# Bar chart of the weekday averages computed in mean_matches_per_days_week.
fig = px.bar(
    mean_matches_per_days_week,
    x='weekDay',
    y='mean',
    title='Means of matches per days of week',
)
fig.show()

In this chart of average matches per day of the week, we can confirm that Wednesday is the day with the highest number of matches, with an average of 547.624 matches played.

### Means time per days of week

In [207]:
# Average match duration per weekday, Monday through Sunday, rounded to
# 2 decimals. Columns: weekDay, mean.
# Built as one chain with .assign so no column is written onto a slice of
# df_matches_raw (the previous form triggered SettingWithCopyWarning) and the
# name is bound only once, to the final aggregate.
# NOTE(review): relies on df_matches_raw already having a weekDay column, and
# playtimeValue is presumably in seconds (divided by 60 to get minutes) — confirm.
means_time_matches = (
    df_matches_raw[['weekDay', 'playtimeValue']]
    .assign(playtimeValueMinutes=lambda frame: frame['playtimeValue'] / 60)
    .groupby('weekDay')['playtimeValueMinutes']
    .mean()
    .reindex(weekDayOrdered)
    .reset_index(name='mean')
    .round(2)
)

In [227]:
# Line chart of the weekday averages held in means_time_matches.
fig = px.line(
    means_time_matches,
    x='weekDay',
    y='mean',
    title='Means time per days of week',
)
fig.show()

In contrast, we can see that Wednesday does not have the longest average match time in minutes; Tuesday takes the lead here.

We can also see that the average match time is very evenly distributed across the days of the week, varying by no more than 5 minutes in either direction.

### Top seven days with most matches played

In [215]:
# Rows per (dateMatch, weekDay) pair, keeping the seven pairs with the
# largest counts. Columns: dateMatch, weekDay, count.
top_seven_days_matches = (
    df_matches_raw
    .groupby(['dateMatch', 'weekDay'])['weekDay']
    .count()
    .rename('count')
    .reset_index()
    .sort_values('count', ascending=False)
    .head(7)
)

In [224]:
# Bar chart of the seven busiest dates, placed on their weekday and coloured
# by match count; the exact date is available on hover.
fig = px.bar(
    top_seven_days_matches,
    x='weekDay',
    y='count',
    hover_data=['dateMatch', 'count'],
    color='count',
    height=400,
    title='Top seven days with most matches played',
)
fig.show()

If we add up all the matches by day of the week, once again we find that Wednesday is the day of the week with the highest number of matches played. The day with the second-highest number of games played is Friday, with a total of 10,156, which is 854 games fewer than Wednesday.

## EDA Players

###