In [40]:
import glob
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [41]:
# Load the two raw sales snapshots; later cells read them as the module-level
# names dec_2016 and jan_2017.
dec_2016, jan_2017 = [
    pd.read_csv(raw_path)
    for raw_path in (
        'data/raw_data/Video_Games_Sales_as_at_22_Dec_2016.csv',
        'data/raw_data/Video_Game_Sales_as_of_Jan_2017.csv',
    )
]

In [42]:
# Map each time-series csv to its path, keyed by the file name up to its first
# dot (presumably the year, e.g. '2006' for '2006.csv' -- confirm against the
# contents of data/time_series_data).
year_csv = {}
for csv_path in glob.iglob('data/time_series_data/*.csv'):
    # basename() strips the directory part whatever separator glob returned and
    # however deep the directory is, unlike splitting on "/" and taking a fixed
    # index.
    year = os.path.basename(csv_path).split(".")[0]
    year_csv[year] = csv_path

In [43]:
def total_sales_for_year(year, sales_region, frames=None):
    ''' Returns the sum of one sales column over every game released in the given year.

        year         -- release year to total. It is compared as a string against the
                        integer part of each row's 'Year_of_Release', so '2006' and 2006
                        both select the rows released in 2006.
        sales_region -- name of the sales column to add up, e.g. 'Global_Sales'.
        frames       -- optional iterable of DataFrames to scan. Defaults to the two
                        notebook-level datasets, dec_2016 and jan_2017.

        Rows with a missing release year are ignored. The result is a float: 0.0 when
        no row matches, NaN when a matching row has a missing sales value. '''
    if frames is None:
        # Resolved at call time so the function can be defined before the data is loaded.
        frames = (dec_2016, jan_2017)
    year = str(year)

    total_sales = 0.0
    for frame in frames:
        release_years = frame['Year_of_Release'].to_numpy(dtype=float)
        region_sales = frame[sales_region].to_numpy(dtype=float)

        # Only rows with a known year can be truncated to an int; the rest stay False.
        known = ~np.isnan(release_years)
        in_year = np.zeros(len(release_years), dtype=bool)
        in_year[known] = release_years[known].astype(int).astype(str) == year

        # One masked sum per frame replaces a per-row .iloc lookup, which builds a
        # Series for every single row.
        total_sales += float(region_sales[in_year].sum())
    return total_sales

In [44]:
def calc_sales_for_feature(year_csv, feature, feature_value, sales_region, normalize=False, frames=None):
    ''' Returns a mapping of each release year to the sales, in the given region, of the
        games whose `feature` column equals `feature_value`.

        year_csv      -- unused; kept so existing callers keep working.
        feature       -- name of the column to filter on, e.g. 'Genre'.
        feature_value -- value that column must equal for a row's sales to be counted.
        sales_region  -- name of the sales column to add up, e.g. 'Global_Sales'.
        normalize     -- if True, each year's figure is divided by the total sales of all
                         games released that year in that region. Default False.
        frames        -- optional iterable of DataFrames to scan. Defaults to the two
                         notebook-level datasets, dec_2016 and jan_2017.

        Keys are ints, in order of first appearance in the data. Every year present in
        the data gets an entry, 0.0 when no game matches. Rows with a missing release
        year are skipped. When normalizing, a year whose total sales are zero maps to
        NaN because the share is undefined. '''
    if frames is None:
        # Resolved at call time so the function can be defined before the data is loaded.
        frames = (dec_2016, jan_2017)

    sales = {}
    yearly_totals = {}
    for frame in frames:
        dated = frame[frame['Year_of_Release'].notna()]
        # Truncate once, up front, so the same int key is used for the membership
        # test, the insert and the increment.
        release_years = dated['Year_of_Release'].astype(int)

        # Iterating the columns in lockstep avoids a per-row .iloc lookup.
        for year, feat, amount in zip(release_years, dated[feature], dated[sales_region]):
            sales.setdefault(year, 0.)
            # Totals are gathered in the same pass so normalizing needs no rescan.
            yearly_totals[year] = yearly_totals.get(year, 0.) + amount
            if feat == feature_value:
                sales[year] += amount

    # Normalize sales by the total sales for all games in the given year.
    if normalize:
        for year, total in yearly_totals.items():
            sales[year] = sales[year] / total if total else float('nan')

    return sales

In [51]:
# Get list of unique genres from both datasets, in order of first appearance.
# Missing genres are dropped: NaN compares unequal to every row's value, so it
# would aggregate to all zeros, and it cannot be concatenated into a file name.
genres = list(pd.concat([dec_2016['Genre'], jan_2017['Genre']]).dropna().unique())

# Make sure the output directory exists before the first to_csv call.
aggregated_dir = 'data/time_series_data/aggregated_sales/'
if not os.path.isdir(aggregated_dir):
    os.makedirs(aggregated_dir)

# Aggregate yearly global sales for each genre, normalized and unnormalized.
for genre in genres:
    sales = calc_sales_for_feature(year_csv, 'Genre', genre, 'Global_Sales', False)
    sales_norm = calc_sales_for_feature(year_csv, 'Genre', genre, 'Global_Sales', True)

    # Save the aggregated sales to csv's so later work can reuse them without recomputing.
    pd.DataFrame.from_dict(sales, orient='index').to_csv(aggregated_dir + genre + '.csv')
    pd.DataFrame.from_dict(sales_norm, orient='index').to_csv(aggregated_dir + genre + '_norm.csv')

    # Plot the aggregated sales against years, raw first and then normalized.
    for title, yearly in ((genre, sales), (genre + ' - normalized', sales_norm)):
        # Materialize the dict views so matplotlib receives plain sequences.
        plt.scatter(list(yearly.keys()), list(yearly.values()))
        plt.title(title)
        plt.show()

KeyboardInterrupt: 