# Download, filter, recompress bz2 by year

In [2]:
import wget, os, bz2, tarfile
import pandas as pd, datetime as dt
from functools import wraps

In [3]:
def get_pageview_complete_urls_by_year(year:int) -> list[str]:
    """
    Programmatically generate urls for daily pageviews_complete.
    INPUT: year as integer
    OUTPUT: list of urls to all the day-files for that year, one per
        calendar day from Jan 1 through Dec 31 in chronological order
        (365 entries, or 366 in a leap year)
    NOTE: filepaths are formatted as:
    https://dumps.wikimedia.org/other/pageview_complete/
        2021/2021-01/pageviews-20210101-user.bz2
        i.e.
        'yyyy/yyyy-mm/pageviews-yyyymmdd-user.bz2'
    """
    # pd.date_range includes BOTH endpoints, so the range must end on
    # Dec 31; ending on Jan 1 of year+1 would add a day-file that belongs
    # to the following year.
    dates = pd.date_range(dt.datetime(year, 1, 1),
                          end=dt.datetime(year, 12, 31),
                          freq='D')
    base_url = 'https://dumps.wikimedia.org/other/pageview_complete/'
    return [
        f'{base_url}{d.year}/{d.year}-{d.month:02d}'
        f'/pageviews-{d.year}{d.month:02d}{d.day:02d}-user.bz2'
        for d in dates
    ]

In [4]:
def outpath_from_url(url:str) -> str:
    """
    Map a day-file url to its local extraction path.
    INPUT: url whose last three '/'-separated pieces are
        'yyyy/yyyy-mm/<filename>'
    OUTPUT: '../data/temp/jawiki_pageviews/yyyy/yyyy-mm/<stem>', where
        <stem> is the filename up to (not including) its first '.'
    """
    pieces = url.split('/')
    # The two path components just before the filename (year, year-month).
    year_month_dirs = '/'.join(pieces[-3:-1])
    # Drop everything from the first dot on, e.g. the '.bz2' extension.
    stem = pieces[-1].split('.')[0]
    return f'../data/temp/jawiki_pageviews/{year_month_dirs}/{stem}'

In [5]:
def _skip_lines(stream, n):
    """Read and discard up to n lines from stream; return False if EOF was hit."""
    for _ in range(n):
        if not stream.readline():
            return False
    return True


def extract_jawiki_from_pageviews_complete(bz2path, outpath, max_lines=37000000):
    """
    Copy every line that starts with 'ja.wikipedia' from a bz2-compressed
    pageviews file into a plain-text file.
    INPUT: bz2path   - path of the bz2 file to read
           outpath   - path of the text file to (over)write
           max_lines - stop once this many lines have been read or skipped
    OUTPUT: None; outpath is created/truncated and filled with the
        matching lines, written as UTF-8
    NOTE: the skipping and early-exit logic only makes sense if the lines
        are sorted by their leading project name - assumption, TODO confirm
        against the dump format. The skip sizes (10**5 and 50) look like
        tuning heuristics for the real dumps; a file much smaller than
        those could have its 'ja.wikipedia' lines skipped over.
    """
    # Opening with 'w' both creates and truncates, so a separate
    # "write empty string" pass is unnecessary.
    with open(outpath, 'w', encoding='utf-8') as f_out, bz2.open(bz2path) as f_bz:
        ct = 0
        while ct < max_lines:
            x = f_bz.readline().decode('utf-8')
            ct += 1
            if not x:
                # readline() returns b'' only at end of file; indexing the
                # empty string would raise IndexError.
                break
            # Slicing (x[1:2]) instead of indexing (x[1]) stays safe on a
            # one-character line.
            first, second = x[0], x[1:2]
            if first < 'i':
                # Far ahead of the target: jump forward in big strides.
                if not _skip_lines(f_bz, 10**5):
                    break
                ct += 10**5
                continue
            if first == 'i' and second < 'u':
                # Getting close: jump forward in small strides.
                if not _skip_lines(f_bz, 50):
                    break
                ct += 50
                continue
            if x.startswith('ja.wikipedia'):
                f_out.write(x)
                continue
            if (first == 'j' and second > 'a') or first > 'j':
                # Sorted after every 'ja.*' prefix: nothing left to find.
                break

In [116]:
# UNCOMMENT WHEN TESTING
# urls = urls[:2]
# rawpaths = rawpaths[:2]
# outpaths = outpaths[:2]

###### recompress bz2

In [9]:
def make_tarfile(output_filename, source_dir):
    """
    Pack source_dir (recursively) into a bz2-compressed tar archive.
    INPUT: output_filename - path of the .tar.bz2 file to write
           source_dir      - directory to archive; a trailing path
                             separator is accepted
    OUTPUT: None; inside the archive everything sits under a single
        top-level directory named after source_dir's last component
    """
    # os.path.basename('some/dir/') is '', which would drop the top-level
    # directory from the archive and add an empty-named root member.
    # normpath strips the trailing separator first.
    top_level_name = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, "w:bz2") as tar:
        tar.add(source_dir, arcname=top_level_name)

In [None]:
# Recompress each year's extracted files into a single .tar.bz2 archive.
# Reference timing: 2016 took 1hr15min and went from 42 GiB -> 7.6 GiB on ec2-large.
for y in [2019, 2020, 2021]:
    start = dt.datetime.now()

    source_dir = f'../data/temp/jawiki_pageviews/{y}/'
    output_filename = f'../data/raw/jawiki/pageviews/{y}.tar.bz2'
    make_tarfile(output_filename, source_dir)

    end = dt.datetime.now()
    # A bare `end-start` expression inside a loop body is never displayed
    # by the notebook, so the elapsed time has to be printed explicitly.
    print(f'{y}: {end-start}')

In [6]:
# List the entries of the temp data directory.
os.listdir('../data/temp/')

['test_reexpand', 'raw_bz2', 'jawiki_pageviews']

In [132]:
# List the entries of the directory the yearly .tar.bz2 archives are written to.
os.listdir('../data/raw/jawiki/pageviews/')

[]

---

---