In [2]:
import requests
import pandas as pd
from datetime import timedelta
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

In [3]:
TOP_1M_DOMAINS = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
TOP_1M_DOMAINS_FILE = 'top-1m.csv'

In [24]:
pd.read_csv(TOP_1M_DOMAINS, names=['rank', 'domain']).head()

Unnamed: 0,rank,domain
0,1,google.com
1,2,youtube.com
2,3,baidu.com
3,4,bilibili.com
4,5,facebook.com


In [22]:
TOP_1M_DOMAINS_FILE

'top-1m.csv'

In [5]:
def get_data():
    # Здесь пока оставили запись в файл, как передавать переменую между тасками будет в третьем уроке
    top_doms = pd.read_csv(TOP_1M_DOMAINS)
    top_data = top_doms.to_csv(index=False)

    with open(TOP_1M_DOMAINS_FILE, 'w') as f:
        f.write(top_data)

#### Найти топ-10 доменных зон по численности доменов

In [48]:
def get_top_10_zones():
    top_zones_df = pd.read_csv(TOP_1M_DOMAINS_FILE, names=['rank', 'domain'])
    top_zones_df['zone'] = top_zones_df.domain.str.split('.').str[-1]
    top_zones_10 = top_zones_df.groupby('zone').agg({'domain': 'count'}).reset_index() \
                        .sort_values('domain', ascending = False).head(10)
    with open('top_10_zones.csv', 'w') as f:
        f.write(top_10_zones.to_csv(index=False, header=False))

##### Проверим работу кода:

In [84]:
top_zones_df = pd.read_csv(TOP_1M_DOMAINS, names=['rank', 'domain'])
top_zones_df['zone'] = top_zones_df.domain.str.split('.').str[-1]
top_zones_10 = top_zones_df.groupby('zone').agg({'domain': 'count'}).reset_index() \
                        .sort_values('domain', ascending = False).head(10)
with open('top_zones_10.csv', 'w') as f:
        f.write(top_zones_10.to_csv(index=False, header=False))

In [45]:
top_zones_df.head()

Unnamed: 0,rank,domain,zone
0,1,google.com,com
1,2,youtube.com,com
2,3,baidu.com,com
3,4,bilibili.com,com
4,5,facebook.com,com


In [46]:
top_zones_10.head()

Unnamed: 0,zone,domain
148,com,350672
455,net,26526
482,org,25847
548,ru,18558
176,de,11926


#### Найти домен с самым длинным именем (если их несколько, то взять только первый в алфавитном порядке)

In [52]:
def get_longest_domain():
    top_data_df = pd.read_csv(TOP_1M_DOMAINS_FILE, names=['rank', 'domain'])
    top_data_df['length'] = top_data_df['domain'].str.len()
    longest = top_data_df.sort_values(['length', 'domain'], ascending=[False, True]).head(2)
    with open('longest_domain.csv', 'w') as f:
        f.write(longest_domain.to_csv(index=False, header=False))

##### Проверим работу кода:

In [88]:
top_data_df = pd.read_csv(TOP_1M_DOMAINS, names=['rank', 'domain'])
top_data_df['length'] = top_data_df['domain'].str.len()
longest = top_data_df.sort_values(['length', 'domain'], ascending=[False, True]).head(1)
with open('longest_domain.csv', 'w') as f:
        f.write(longest_domain.to_csv(index=False, header=False))

NameError: name 'longest_domain' is not defined

In [58]:
top_data_df.head()

Unnamed: 0,rank,domain,length
0,1,google.com,10
1,2,youtube.com,11
2,3,baidu.com,9
3,4,bilibili.com,12
4,5,facebook.com,12


In [59]:
longest

Unnamed: 0,rank,domain,length
234067,234068,file-service-default-114c67af0763a8a98e770ff3e...,81


#### На каком месте находится домен airflow.com?

In [None]:
def airflow_rank():
    top_data_df = pd.read_csv(TOP_1M_DOMAINS_FILE, names=['rank', 'domain'])
    if top_data_df[top_data_df.domain == 'airflow.com'].shape[0] != 0:
        airflow_df = top_data_df[top_data_df.domain == 'airflow.com']['rank']
    else:
        airflow_df = pd.DataFrame({'col_1': ["It seems like airflow.com isn't in the list"]})
    with open('airflow_df.csv', 'w') as f:
        f.write(airflow_df.to_csv(index=False, header=False))

##### Проверим работу кода:

In [89]:
top_data_df = pd.read_csv(TOP_1M_DOMAINS, names=['rank', 'domain'])
if top_data_df[top_data_df.domain == 'airflow.com'].shape[0] != 0:
    airflow_df = top_data_df[top_data_df.domain == 'airflow.com']['rank']
else:
    airflow_df = pd.DataFrame({'col_1': ["It seems like airflow.com isn't in the list"]})
with open('airflow_df.csv', 'w') as f:
    f.write(airflow_df.to_csv(index=False, header=False))

In [67]:
airflow_df

Unnamed: 0,col_1
0,It seems like airflow.com isn't in the list


In [82]:
top_data_df[top_data_df.domain == 'airflow.com']['rank']

Series([], Name: rank, dtype: int64)

In [8]:
def print_data(ds):
    with open('top_data_top_10.csv', 'r') as f:
        all_data = f.read()
    with open('top_data_top_10_com.csv', 'r') as f:
        all_data_com = f.read()
    date = ds

    print(f'Top domains in .RU for date {date}')
    print(all_data)

    print(f'Top domains in .COM for date {date}')
    print(all_data_com)

In [9]:
default_args = {
    'owner': 'a.batalov',
    'depends_on_past': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    'start_date': datetime(2021, 10, 7),
}
schedule_interval = '0 12 * * *'

In [10]:
dag = DAG('top_10_ru_new', default_args=default_args, schedule_interval=schedule_interval)


In [11]:
t1 = PythonOperator(task_id='get_data',
                    python_callable=get_data,
                    dag=dag)

t2 = PythonOperator(task_id='get_stat',
                    python_callable=get_stat,
                    dag=dag)

t2_com = PythonOperator(task_id='get_stat_com',
                        python_callable=get_stat_com,
                        dag=dag)

t3 = PythonOperator(task_id='print_data',
                    python_callable=print_data,
                    dag=dag)

In [12]:
t1 >> [t2, t2_com] >> t3

#t1.set_downstream(t2)
#t1.set_downstream(t2_com)
#t2.set_downstream(t3)
#t2_com.set_downstream(t3)


<Task(PythonOperator): print_data>