In [1]:
import requests
from zipfile import ZipFile
from io import BytesIO
import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime
from io import StringIO
import telegram

from airflow.decorators import dag, task
from airflow.operators.python import get_current_context
from airflow.models import Variable

# TOP_1M_DOMAINS = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
GAMES = '/var/lib/airflow/airflow.git/dags/a.batalov/vgsales.csv'
# GAMES = '/mnt/HC_Volume_18315164/home-jupyter/jupyter-d-gatiatullin/airflow/dags/d-gatiatullin/vgsales.csv'
# TOP_1M_DOMAINS_FILE = 'top-1m.csv'


default_args = {
    'owner': 'd.gatiatullin',
    'depends_on_past': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    'start_date': datetime(2020, 11, 24),
    'schedule_interval': '0 9 * * *'
}

year_start = 1994 + hash(f'd.gatiatullin') % 23


@dag(default_args=default_args, catchup=False)
def dgati_dag2():
    @task(retries=3)
    def get_data():
        df = pd.read_csv(GAMES)
        df = df.query("Year== @year_start")
        return df
    
    @task()
    def top_sales(df):
        top_sales_game = df.groupby('Name', as_index=False) \
                           .agg({'Global_Sales': 'sum'}) \
                           .sort_values('Global_Sales', ascending=False) \
                           .head(1)['Name']
        return {'top_sales_res': top_sales_game}
    
    @task
    def top_sales_genres_eu(df):
        df2 = df.groupby('Genre', as_index=False) \
                .agg({'EU_Sales': 'sum'})
        top_sales_genres_eu_list = df2[df2['EU_Sales'] == df2['EU_Sales'].max()].Genre.to_list()
        return {'top_sales_genres_eu_res': top_sales_genres_eu_list}
    
    @task
    def top_sales_platform(df):
        df3 = df.query("NA_Sales > 1") \
                .groupby('Platform', as_index=False) \
                .agg({'Name': 'count'}) \
                .rename(columns={'Name': 'Count'})
        top_sales_platform_list = df3[df3['Count'] == df3['Count'].max()].Platform.to_list()
        return {'top_sales_platform_res': top_sales_platform_list}
    
    @task()
    def top_avg_sales_jp(df):
        df4 = df.groupby('Publisher', as_index=False) \
                .agg({'JP_Sales': 'mean'}) \
                .rename(columns={'JP_Sales': 'JP_Sales_mean'})
        top_avg_sales_jp_list = df4[df4['JP_Sales_mean'] == df4['JP_Sales_mean'].max()].Publisher.to_list()
        return {'top_avg_sales_jp_res': top_avg_sales_jp_list}
    
    @task()
    def eu_jp(df):
        eu_jp_res = df.groupby('Name', as_index=False) \
                      .agg({'EU_Sales': 'sum', 'JP_Sales': 'sum'}) \
                      .query("EU_Sales > JP_Sales") \
                      .shape[0]
        return {'eu_jp_res': eu_jp_res}

    
    @task()
    def print_data(a, b, c, d, e):
        first_answer = a['top_sales_res']
        second_answer = b['top_sales_genres_eu_res']
        third_answer = c['top_sales_platform_res']
        fourth_answer = d['top_avg_sales_jp_res']
        fifth_answer = e['eu_jp_res']
        
        print(f'Best-selling game in {year_start} is {first_answer}')
        print(f'Best-selling genres in Europe is {second_answer}')
        print(f'Platforms with more than a million copies in NA: {third_answer}')
        print(f'Publishers with the highest average sales in Japan: {fourth_answer}')
        print(f'Number of games that sold better in Europe than in Japan: {fifth_answer}')
    

    df = get_data()
    aa = top_sales(df)
    bb = top_sales_genres_eu(df)
    cc = top_sales_platform(df)
    dd = top_avg_sales_jp(df)
    ee = eu_jp(df)

    print_data(aa, bb, cc, dd, ee)

dgati_dag2 = dgati_dag2()


In [20]:
year_start = 1994 + hash(f'd.gatiatullin') % 23

In [18]:
df = pd.read_csv('/mnt/HC_Volume_18315164/home-jupyter/jupyter-d-gatiatullin/airflow/dags/d-gatiatullin/vgsales.csv')
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [22]:
df.query("Year== @year_start")

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
132,133,Pokémon Crystal Version,GB,2000.0,Role-Playing,Nintendo,2.55,1.56,1.29,0.99,6.39
173,174,Final Fantasy IX,PS,2000.0,Role-Playing,SquareSoft,1.62,0.77,2.78,0.14,5.30
223,224,Driver 2,PS,2000.0,Action,Atari,2.36,2.10,0.02,0.25,4.73
225,226,Tony Hawk's Pro Skater 2,PS,2000.0,Sports,Activision,3.05,1.41,0.02,0.20,4.68
242,243,Dragon Quest VII: Warriors of Eden,PS,2000.0,Role-Playing,Enix Corporation,0.20,0.14,4.10,0.02,4.47
...,...,...,...,...,...,...,...,...,...,...,...
15505,15508,Point Blank 3,PS,2000.0,Shooter,Namco Bandai Games,0.01,0.01,0.00,0.00,0.02
15651,15654,Airline Tycoon,PC,2000.0,Simulation,Interplay,0.00,0.01,0.00,0.00,0.02
16163,16166,Deus Ex,PC,2000.0,Role-Playing,Eidos Interactive,0.00,0.01,0.00,0.00,0.01
16518,16521,Crossroad Crisis,PS,2000.0,Puzzle,Success,0.01,0.00,0.00,0.00,0.01


In [40]:
df = df.query("Year== @year_start")
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
132,133,Pokémon Crystal Version,GB,2000.0,Role-Playing,Nintendo,2.55,1.56,1.29,0.99,6.39
173,174,Final Fantasy IX,PS,2000.0,Role-Playing,SquareSoft,1.62,0.77,2.78,0.14,5.30
223,224,Driver 2,PS,2000.0,Action,Atari,2.36,2.10,0.02,0.25,4.73
225,226,Tony Hawk's Pro Skater 2,PS,2000.0,Sports,Activision,3.05,1.41,0.02,0.20,4.68
242,243,Dragon Quest VII: Warriors of Eden,PS,2000.0,Role-Playing,Enix Corporation,0.20,0.14,4.10,0.02,4.47
...,...,...,...,...,...,...,...,...,...,...,...
15505,15508,Point Blank 3,PS,2000.0,Shooter,Namco Bandai Games,0.01,0.01,0.00,0.00,0.02
15651,15654,Airline Tycoon,PC,2000.0,Simulation,Interplay,0.00,0.01,0.00,0.00,0.02
16163,16166,Deus Ex,PC,2000.0,Role-Playing,Eidos Interactive,0.00,0.01,0.00,0.00,0.01
16518,16521,Crossroad Crisis,PS,2000.0,Puzzle,Success,0.01,0.00,0.00,0.00,0.01


In [49]:
df.groupby('Name', as_index=False) \
    .agg({'Global_Sales': 'sum'}) \
    .sort_values('Global_Sales', ascending=False) \
    .head(1)['Name']

211    Pokémon Crystal Version
Name: Name, dtype: object

In [52]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
132,133,Pokémon Crystal Version,GB,2000.0,Role-Playing,Nintendo,2.55,1.56,1.29,0.99,6.39
173,174,Final Fantasy IX,PS,2000.0,Role-Playing,SquareSoft,1.62,0.77,2.78,0.14,5.3
223,224,Driver 2,PS,2000.0,Action,Atari,2.36,2.1,0.02,0.25,4.73
225,226,Tony Hawk's Pro Skater 2,PS,2000.0,Sports,Activision,3.05,1.41,0.02,0.2,4.68
242,243,Dragon Quest VII: Warriors of Eden,PS,2000.0,Role-Playing,Enix Corporation,0.2,0.14,4.1,0.02,4.47


In [55]:
df2 = df.groupby('Genre', as_index=False) \
    .agg({'EU_Sales': 'sum'})
df2

Unnamed: 0,Genre,EU_Sales
0,Action,10.84
1,Adventure,0.69
2,Fighting,6.31
3,Misc,4.91
4,Platform,4.64
5,Puzzle,0.93
6,Racing,6.5
7,Role-Playing,4.2
8,Shooter,2.17
9,Simulation,0.77


In [61]:
top_sales_genres_eu = df2[df2['EU_Sales'] == df2['EU_Sales'].max()].Genre.to_list()
top_sales_genres_eu

['Action']

In [63]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
132,133,Pokémon Crystal Version,GB,2000.0,Role-Playing,Nintendo,2.55,1.56,1.29,0.99,6.39
173,174,Final Fantasy IX,PS,2000.0,Role-Playing,SquareSoft,1.62,0.77,2.78,0.14,5.3
223,224,Driver 2,PS,2000.0,Action,Atari,2.36,2.1,0.02,0.25,4.73
225,226,Tony Hawk's Pro Skater 2,PS,2000.0,Sports,Activision,3.05,1.41,0.02,0.2,4.68
242,243,Dragon Quest VII: Warriors of Eden,PS,2000.0,Role-Playing,Enix Corporation,0.2,0.14,4.1,0.02,4.47


In [72]:
df3 = df.query("NA_Sales > 1") \
            .groupby('Platform', as_index=False) \
            .agg({'Name': 'count'}) \
            .rename(columns={'Name': 'Count'})
df3

Unnamed: 0,Platform,Count
0,DC,2
1,GB,3
2,N64,6
3,PC,2
4,PS,11
5,PS2,3


In [73]:
df3['Count'].max()

11

In [76]:
df3[df3['Count'] == df3['Count'].max()].Platform.to_list()

['PS']

In [78]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
132,133,Pokémon Crystal Version,GB,2000.0,Role-Playing,Nintendo,2.55,1.56,1.29,0.99,6.39
173,174,Final Fantasy IX,PS,2000.0,Role-Playing,SquareSoft,1.62,0.77,2.78,0.14,5.3
223,224,Driver 2,PS,2000.0,Action,Atari,2.36,2.1,0.02,0.25,4.73
225,226,Tony Hawk's Pro Skater 2,PS,2000.0,Sports,Activision,3.05,1.41,0.02,0.2,4.68
242,243,Dragon Quest VII: Warriors of Eden,PS,2000.0,Role-Playing,Enix Corporation,0.2,0.14,4.1,0.02,4.47


In [82]:
df4 = df.groupby('Publisher', as_index=False) \
        .agg({'JP_Sales': 'mean'}) \
        .rename(columns={'JP_Sales': 'JP_Sales_mean'})
df4

Unnamed: 0,Publisher,JP_Sales_mean
0,3DO,0.000000
1,Acclaim Entertainment,0.009333
2,Activision,0.003846
3,Aruze Corp,0.295000
4,Asmik Ace Entertainment,0.090000
...,...,...
56,Vatical Entertainment,0.000000
57,Victor Interactive,0.000000
58,Video System,0.000000
59,Virgin Interactive,0.140000


In [83]:
df4['JP_Sales_mean'].max()

1.6633333333333333

In [89]:
df4[df4['JP_Sales_mean'] == df4['JP_Sales_mean'].max()].Publisher.to_list()

['Enix Corporation']

In [90]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
132,133,Pokémon Crystal Version,GB,2000.0,Role-Playing,Nintendo,2.55,1.56,1.29,0.99,6.39
173,174,Final Fantasy IX,PS,2000.0,Role-Playing,SquareSoft,1.62,0.77,2.78,0.14,5.3
223,224,Driver 2,PS,2000.0,Action,Atari,2.36,2.1,0.02,0.25,4.73
225,226,Tony Hawk's Pro Skater 2,PS,2000.0,Sports,Activision,3.05,1.41,0.02,0.2,4.68
242,243,Dragon Quest VII: Warriors of Eden,PS,2000.0,Role-Playing,Enix Corporation,0.2,0.14,4.1,0.02,4.47


In [104]:
df.groupby('Name', as_index=False) \
    .agg({'EU_Sales': 'sum', 'JP_Sales': 'sum'}) \
    .query("EU_Sales > JP_Sales") \
    .shape[0]

212