# Scripts to download data for the Ask Extension (AE) knowledge base (KB)

In [1]:
import json
import requests
import pandas as pd


from tqdm import tqdm
from pathlib import Path

# Directory that holds the downloaded Ask Extension JSON files.
# It is created (including missing parents) the first time this cell runs.
PATH_DATA = Path('../data/askextension_kb')
PATH_DATA.mkdir(parents=True, exist_ok=True)

In [2]:
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio event loop, and the cells below call `loop.run_until_complete(...)`
# from inside it -- confirm before removing.
import nest_asyncio
nest_asyncio.apply()

In [3]:
import json
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta

import asyncio
from aiohttp import ClientSession

async def fetch(url, session):
    """Issue a GET request for *url* through *session* and return the body text.

    Args:
        url: Address to request.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` method
            (an ``aiohttp.ClientSession`` in this notebook).

    Returns:
        The response body as returned by ``response.text()``.
    """
    request = session.get(url)
    async with request as reply:
        body = await reply.text()
    return body

async def run(month, year=2013):
    """Download one calendar month of knowledge-base entries, one request per day.

    A request is issued for every day of the month, each covering the window
    from that day to the next one. All requests share a single
    ``ClientSession`` and are awaited together.

    Args:
        month: Calendar month to download (1-12).
        year: Calendar year to download. Defaults to 2013, the value that was
            previously hard-coded.

    Returns:
        A list with the decoded JSON payload of every daily request, in
        chronological order.

    Raises:
        ValueError: If a response body is not valid JSON. The failing URL and
            the raw body are printed before the error propagates.
    """
    url = 'https://qa.osticket.eduworks.com/api/knowledge/{0}/{1}'
    start_date = date(year, month, 1)
    end_date = start_date + relativedelta(months=1)
    one_day = timedelta(days=1)

    days = [start_date + timedelta(days=x) for x in range((end_date - start_date).days)]
    # isoformat() yields the same YYYY-MM-DD text as strftime('%Y-%m-%d').
    urls = [url.format(d.isoformat(), (d + one_day).isoformat()) for d in days]

    # Fetch all responses within one client session so the connection is
    # kept alive across requests.
    async with ClientSession() as session:
        bodies = await asyncio.gather(*(fetch(u, session) for u in urls))

    payloads = []
    for i, (request_url, body) in enumerate(zip(urls, bodies), start=1):
        try:
            payload = json.loads(body)
        except ValueError:
            # Show which request failed and what came back, then re-raise
            # with the original traceback intact.
            print(f'Request {i} ({request_url}) did not return valid JSON:')
            print(body)
            raise
        print(f'{i}) Fetched {len(payload)}...')
        payloads.append(payload)

    return payloads

# Download the twelve months one after another; each month's daily requests
# run concurrently inside `run`.
loop = asyncio.get_event_loop()
for m in range(1, 13):
    loop.run_until_complete(run(m))

1) Fetched 6...
2) Fetched 5...
3) Fetched 6...
4) Fetched 5...
5) Fetched 8...
6) Fetched 2...
7) Fetched 7...
8) Fetched 11...
9) Fetched 3...
10) Fetched 10...
11) Fetched 4...
12) Fetched 9...
13) Fetched 8...
14) Fetched 9...
15) Fetched 9...
16) Fetched 8...
17) Fetched 14...
18) Fetched 11...
19) Fetched 12...
20) Fetched 9...
21) Fetched 10...
22) Fetched 17...
23) Fetched 10...
24) Fetched 15...
25) Fetched 9...
26) Fetched 9...
27) Fetched 6...
28) Fetched 20...
29) Fetched 17...
30) Fetched 6...
31) Fetched 10...
1) Fetched 8...
2) Fetched 10...
3) Fetched 9...
4) Fetched 11...
5) Fetched 12...
6) Fetched 15...
7) Fetched 20...
8) Fetched 15...
9) Fetched 12...
10) Fetched 11...
11) Fetched 11...
12) Fetched 11...
13) Fetched 17...
14) Fetched 5...
15) Fetched 10...
16) Fetched 15...
17) Fetched 13...
18) Fetched 19...
19) Fetched 15...
20) Fetched 14...
21) Fetched 19...
22) Fetched 9...
23) Fetched 12...
24) Fetched 11...
25) Fetched 20...
26) Fetched 13...
27) Fetched 17.

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
def download_ask_extention_data(start_year: int = 2006, end_year: int = 2024):
    '''Calls OS ticket API to get all ask extension data, one JSON file per year.

    For every year in ``range(start_year, end_year)`` the entries between
    January 1st of that year and January 1st of the following year are
    requested and, when non-empty, written to ``PATH_DATA/<year>.json``.

    Args:
        start_year: First year to download (inclusive).
        end_year: Year at which to stop (exclusive).

    A year whose request fails (timeout, connection problem, HTTP error
    status, or a body that is not valid JSON) is reported and skipped so the
    remaining years are still downloaded.
    '''
    for year in tqdm(range(start_year, end_year), desc='Calling OS Ticket API to download AE data...'):
        url = f'https://qa.osticket.eduworks.com/api/knowledge/{year}-01-01/{year + 1}-01-01'
        print(f'Trying for the year {year}...')
        try:
            r = requests.get(url, timeout=40)
            r.raise_for_status()
            items = r.json()
        except (requests.exceptions.RequestException, ValueError) as err:
            # ValueError covers JSON decoding failures on requests versions
            # whose JSON error is not a RequestException subclass.
            print(f"Failed to download data for year {year}: {err}")
            continue
        print(f'Number of fetched items: {len(items)}')

        if items:
            # Name the file after the year being processed; using
            # `start_year` here made every year overwrite the same file.
            path_save = PATH_DATA / f'{year}.json'
            with open(path_save, 'w') as f:
                json.dump(items, f)

def get_ask_extension_data() -> pd.DataFrame:
    '''Loads the AE data from disk, calling the OS ticket API only if none is cached.

    Every ``*.json`` file in ``PATH_DATA`` is read in sorted filename order
    and the rows are concatenated into a single frame with a fresh index.
    If the directory has no JSON files, ``download_ask_extention_data()``
    is called first.

    Returns:
        A DataFrame with the rows of all data files, or an empty DataFrame
        when no data file exists even after the download attempt.
    '''
    data_files = sorted(PATH_DATA.glob('*.json'))

    if not data_files:
        download_ask_extention_data()
        data_files = sorted(PATH_DATA.glob('*.json'))

    print(f'List of files:\n{[data_file.name for data_file in data_files]}')

    # pd.concat raises on an empty list, so handle "nothing to load" explicitly.
    if not data_files:
        return pd.DataFrame()

    # Read every file and concatenate once; the previous version returned
    # from inside the loop and therefore only ever loaded the first file.
    frames = [pd.read_json(data_file) for data_file in data_files]
    return pd.concat(frames, ignore_index=True, axis=0)

In [None]:
get_ask_extension_data()