## **Dependencies**

In [1]:
import pandas as pd

## **Source data**

In [2]:
source_path = "data/vgsales.csv"

games_sales = pd.read_csv(source_path)
games_sales.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


## **Summary**

In [3]:
games_sales.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


## **Data Schema**

In [4]:
games_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


## **Unique categorical values view**

In [11]:
categoric_columns = [
    'Name',
    'Platform',
    'Genre',
    'Publisher'
]

def get_column_info(df, column):
    column_values = df[column]
    print(f"- Quantidade de valores únicos em {column}: {column_values.nunique()}")
    print(f"- Ocorrências top 5 valores únicos em {column}: {column_values.value_counts().head().to_dict()}")
    top_five_ocurrences = column_values.value_counts(normalize=True).head().to_dict()
    print(f"- Ocorrências  % top 5 valores únicos em {column}: {top_five_ocurrences}")
    print(f"- Ocorrências % top 5 em {column} sobre o total: {round(sum(top_five_ocurrences.values()), 4)*100}%\n")

for column in categoric_columns:
    get_column_info(games_sales, column)

- Quantidade de valores únicos em Name: 11493
- Ocorrências top 5 valores únicos em Name: {'Need for Speed: Most Wanted': 12, 'Ratatouille': 9, 'FIFA 14': 9, 'LEGO Marvel Super Heroes': 9, 'Madden NFL 07': 9}
- Ocorrências  % top 5 valores únicos em Name: {'Need for Speed: Most Wanted': 0.0007229786721291722, 'Ratatouille': 0.0005422340040968791, 'FIFA 14': 0.0005422340040968791, 'LEGO Marvel Super Heroes': 0.0005422340040968791, 'Madden NFL 07': 0.0005422340040968791}
- Ocorrências % top 5 em Name sobre o total: 0.29%

- Quantidade de valores únicos em Platform: 31
- Ocorrências top 5 valores únicos em Platform: {'DS': 2163, 'PS2': 2161, 'PS3': 1329, 'Wii': 1325, 'X360': 1265}
- Ocorrências  % top 5 valores únicos em Platform: {'DS': 0.1303169056512833, 'PS2': 0.13019640920592843, 'PS3': 0.08006988793830581, 'Wii': 0.07982889504759609, 'X360': 0.07621400168695024}
- Ocorrências % top 5 em Platform sobre o total: 49.66%

- Quantidade de valores únicos em Genre: 12
- Ocorrências top 5 v

## **Year view**

In [15]:
get_column_info(games_sales, 'Year')
print(f"Valores de Year: {sorted(games_sales['Year'].unique())}")

- Quantidade de valores únicos em Year: 39
- Ocorrências top 5 valores únicos em Year: {2009.0: 1431, 2008.0: 1428, 2010.0: 1259, 2007.0: 1202, 2011.0: 1139}
- Ocorrências  % top 5 valores únicos em Year: {2009.0: 0.08764623017088259, 2008.0: 0.08746248545354321, 2010.0: 0.077111533043425, 2007.0: 0.07362038341397685, 2011.0: 0.06976174434984994}
- Ocorrências % top 5 em Year sobre o total: 39.56%

Valores de Year: [1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, nan, 2016.0, 2017.0, 2020.0]
