# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import

In [2]:
# Import the CSV file
df = pd.read_csv("https://raw.githubusercontent.com/candersen114/Python2Source/main/vgsales_new_final.csv", dtype_backend="pyarrow")

print(df.dtypes)

# Remove PC and PSN games, there aren't many and they aren't easily classified into generations
df = df[(df["Platform"] != "PSN") & (df["Platform"] != "PC")]

# Trim publisher and Genre to ensure standardization
df["Genre"] = df["Genre"].str.strip()
df["Publisher"] = df["Publisher"].str.strip()

df.describe(include="all")

Rank             int64[pyarrow]
Name            string[pyarrow]
Platform        string[pyarrow]
Year             int64[pyarrow]
Genre           string[pyarrow]
Publisher       string[pyarrow]
Developer       string[pyarrow]
Critic_Score    double[pyarrow]
User_Score      double[pyarrow]
NA_Sales        double[pyarrow]
PAL_Sales       double[pyarrow]
JP_Sales        double[pyarrow]
Other_Sales     double[pyarrow]
Global_Sales    double[pyarrow]
dtype: object


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales
count,1936.0,1936,1936,1936.0,1936,1936,1936,1098.0,129.0,1878.0,1871.0,1306.0,1905.0,1936.0
unique,,1528,26,,17,103,456,,,,,,,
top,,Minecraft,PS2,,Sports,Nintendo,Nintendo EAD,,,,,,,
freq,,7,300,,293,345,69,,,,,,,
mean,999.905475,,,2005.707645,,,,8.117942,8.834109,1.499989,0.92016,0.573124,0.293433,3.019613
std,578.801568,,,7.538652,,,,1.082233,1.05819,2.046652,1.338663,0.944822,0.501566,3.84032
min,1.0,,,1977.0,,,,2.6,3.0,0.01,0.0,0.0,0.0,1.11
25%,496.75,,,2001.0,,,,7.6,8.5,0.66,0.34,0.04,0.09,1.4
50%,1001.5,,,2007.0,,,,8.3,9.1,1.0,0.56,0.16,0.17,1.895
75%,1499.25,,,2011.0,,,,8.9,9.5,1.65,1.05,0.74,0.32,3.12


## Results
- The fact that all of our categorical data columns (rank, name, platform, year, genre, publisher, developer) have a count of 2000 is very promising - means no missing values in those columns
- Looks like some sales numbers are missing in certain regions. For some of this analysis, I think I'll want to create a subset that only looks at games available in North America and Japan.
- Japan is the video game capital of the world but seems to have fewer sales - probably because it's a smaller country. This means caution (and possibly "grading on a curve") will be needed when comparing results.
    - NA population in 2016 579 million
    - Japan population in 2016 roughly 127 million
    - This means that if there is a need to normalize comparisons between NA and Japan, Japanese sales figures can be multiplied x4.56
- PC and PSN games are included - they are not easily classified into generations. We'll exclude them because the numbers are very low.

# Calculated Columns
This code is pillaged from project #1 to allow my calculated columns to be config instead of hardcoded

In [26]:
# Calculated column definitions. These will be applied AFTER data cleansing, once dfClean (the clean dataset) is created
dictCalculatedColData = {
    # Example of a lookup, i.e. the value in new column "transaction fee" is based on the values in the "payment method" column
    "ConsoleGeneration": # Column to be created
        {
            "Column Type": "Lookup",
            "Data Type": "int64[pyarrow]",
            "Column Spec":
            {
                "Platform": # Column containing the categories we need to look up
                {
                    "2600": 2,
                    "3DS": 8,
                    "DC": 6,
                    "DS": 7,
                    "GB": 4,
                    "GBA": 6,
                    "GBC": 5,
                    "GC": 6,
                    "GEN": 4,
                    "N64": 5,
                    "NES": 3,
                    "NS": 8,
                    "PS": 5,
                    "PS2": 6,
                    "PS3": 7,
                    "PS4": 8,
                    "PSP": 7,
                    "PSV": 8,
                    "SAT": 5,
                    "SCD": 4,
                    "SNES": 4,
                    "Wii": 7,
                    "WiiU": 8,
                    "X360": 7,
                    "XB": 6,
                    "XOne": 8
                }
            }
        },
    "ConsoleManufacturer": # Column to be created
        {
            "Column Type": "Lookup",
            "Data Type": "string[pyarrow]",
            "Column Spec":
            {
                "Platform": # Column containing the categories we need to look up
                {
                    "2600": "Atari",
                    "3DS": "Nintendo",
                    "DC": "Sega",
                    "DS": "Nintendo",
                    "GB": "Nintendo",
                    "GBA": "Nintendo",
                    "GBC": "Nintendo",
                    "GC": "Nintendo",
                    "GEN": "Sega",
                    "N64": "Nintendo",
                    "NES": "Nintendo",
                    "NS": "Nintendo",
                    "PS": "Sony",
                    "PS2": "Sony",
                    "PS3": "Sony",
                    "PS4": "Sony",
                    "PSP": "Sony",
                    "PSV": "Sony",
                    "SAT": "Sega",
                    "SCD": "Sega",
                    "SNES": "Nintendo",
                    "Wii": "Nintendo",
                    "WiiU": "Nintendo",
                    "X360": "Microsoft",
                    "XB": "Microsoft",
                    "XOne": "Microsoft"
                }
            }
        },
    "PublisherType": # Column to be created
        {
            "Column Type": "2ColLookup",
            "Data Type": "string[pyarrow]",
            "Column Spec":
            {
                ("ConsoleManufacturer", "Publisher"): # Column containing the categories we need to look up
                {
                    ("Atari", "Atari"): "First Party",
                    ("Microsoft", "Microsoft"): "First Party",
                    ("Microsoft", "Microsoft Game Studios"): "First Party",
                    ("Microsoft", "Microsoft Studios"): "First Party",
                    ("Microsoft", "Mojang"): "First Party",
                    ("Nintendo", "Nintendo"): "First Party",
                    ("Sega", "Sega"): "First Party",
                    ("Sony", "Sony Computer Entertainment"): "First Party",
                    ("Sony", "Sony Computer Entertainment America"): "First Party",
                    ("Sony", "Sony Interactive Entertainment"): "First Party",
                    ("Sony", "Sony Online Entertainment"): "First Party",
                    ("DEFAULT_VALUE", "DEFAULT_VALUE"): "Third Party"
                }
            }
        }#,
    # Example of a column calculated based on other column values
    # "Total Transaction Fee": 
    # {
    #     "Column Type": "Function",
    #     "Data Type": "double[pyarrow]",
    #     "Column Spec": lambda row: round(row["Transaction Amount"] * row["Transaction Fee Pct"], 2)
    # },

    # # Another calculated column - notice that since we are going to end up applying these in order,
    # # column definitions can reference earlier columns.
    # "Total Transaction Amt": 
    # {
    #     "Column Type": "Function",
    #     "Data Type": "double[pyarrow]",
    #     "Column Spec": lambda row: round(row["Transaction Amount"] + row["Total Transaction Fee"], 2)
    # }
}

# Function to apply calculated columns
def calculated_columns(row, calc_col_def):
    # Get the calculation type from the column definition
    calc_col_type = calc_col_def["Column Type"]

    if calc_col_type == "Lookup":
        # Extract the key and value (which is our lookup dictionary) from calc_col_def
        for base_col_name, dictLookup in calc_col_def["Column Spec"].items():
            if row[base_col_name] in dictLookup:
                return dictLookup[row[base_col_name]]
            elif "DEFAULT_VALUE" in dictLookup:
                return dictLookup["DEFAULT_VALUE"]
            else:
                return None

    if calc_col_type == "2ColLookup":
            lookup_spec = calc_col_def["Column Spec"]
            # There should be exactly one item in Column Spec
            ((col1, col2), lookup_dict) = next(iter(lookup_spec.items()))

            # Create a tuple from the row values
            row_tuple = (row[col1], row[col2])
            
            # Print the row tuple for debugging
            print("Row tuple:", row_tuple)
            
            # Check if the row tuple exists in the lookup dictionary
            if row_tuple in lookup_dict:
                print("Key found:", row_tuple)
                return lookup_dict[row_tuple]
            
            # Check for default value
            if ("DEFAULT_VALUE", "DEFAULT_VALUE") in lookup_dict:
                print("Returning default value")
                return lookup_dict[("DEFAULT_VALUE", "DEFAULT_VALUE")]
            
            # If no match is found, return None
            print("No match found, returning None")
            return None

    # Default - i.e. the calculated column is of a type we have not written code to handle yet
    else:
        return None
    
# Utilize dictCalculatedColData to apply our calculated columns
for calc_col_name, calc_col_def in dictCalculatedColData.items():
    print("Creating column ", calc_col_name)
    df[calc_col_name] = df.apply(calculated_columns, args=(calc_col_def,), axis=1)
    print("Applying data type ", calc_col_def["Data Type"], " to column ", calc_col_name)
    df[calc_col_name] = df[calc_col_name].astype(calc_col_def["Data Type"])

Creating column  ConsoleGeneration
Applying data type  int64[pyarrow]  to column  ConsoleGeneration
Creating column  ConsoleManufacturer
Applying data type  string[pyarrow]  to column  ConsoleManufacturer
Creating column  PublisherType
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key found: ('Nintendo', 'Nintendo')
Row tuple: ('Nintendo', 'Nintendo')
Key foun

In [27]:
df.describe(include="all")
display(df)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Developer,Critic_Score,User_Score,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Global_Sales,ConsoleGeneration,ConsoleManufacturer,PublisherType
0,1,Wii Sports,Wii,2006,Sports,Nintendo,Nintendo EAD,7.7,,41.36,29.02,3.77,8.51,82.65,7,Nintendo,First Party
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,Nintendo EAD,10.0,8.2,29.08,3.58,6.81,0.77,40.24,3,Nintendo,First Party
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,Nintendo EAD,8.2,9.1,15.91,12.92,3.8,3.35,35.98,7,Nintendo,First Party
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,Nintendo EAD,8.0,8.8,15.61,10.99,3.29,3.02,32.9,7,Nintendo,First Party
4,5,Pokémon Red / Green / Blue Version,GB,1998,Role-Playing,Nintendo,Game Freak,9.4,,11.27,8.89,10.22,1.0,31.37,4,Nintendo,First Party
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,Red Dead Redemption: Undead Nightmare,X360,2010,Action,Rockstar Games,Rockstar San Diego,10.0,,0.61,0.38,0.02,0.1,1.11,7,Microsoft,Third Party
1996,1997,Metal Gear Solid HD Collection,PS3,2011,Action,Konami Digital Entertainment,Bluepoint Games,,,0.52,0.44,,0.14,1.11,7,Sony,Third Party
1997,1998,EA Sports UFC,PS4,2014,Fighting,Electronic Arts,EA Canada,,,0.49,0.43,0.01,0.18,1.11,8,Sony,Third Party
1998,1999,NCAA Football 13,X360,2012,Sports,EA Sports,EA Tiburon,7.5,,1.02,,,0.09,1.11,7,Microsoft,Third Party


In [6]:
dictTest = {
    ("Sony", "Sony Computer Entertainment America"): "First Party"
}
Manufacturer = "Sony"
Publisher = "Sony Computer Entertainment America"
print(dictTest[(Manufacturer, Publisher)])

First Party
