In [1]:
from datetime import datetime
import pandas as pd
pd.set_option('display.max_colwidth', None)


import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import urllib.parse
import json

import os
from tqdm import tqdm

## import helper

In [2]:
from config_GAM2025 import gam_info
from security_config import api_key

import test_functions
import functions

In [3]:
# Load every lookup table used by this notebook from the shared GAM workbook.
lookup_path = f"../../{gam_info['lookup_file']}"

# Open the workbook once and parse each sheet from the same handle; passing the
# path to pd.read_excel for every sheet re-opens and re-parses the whole file.
with pd.ExcelFile(lookup_path) as lookup_book:
    # country
    country_codes = pd.read_excel(lookup_book, sheet_name='CountryID')

    # week ('w/c' is kept exactly as read_excel parses it; no explicit conversion here)
    week_tester = pd.read_excel(lookup_book, sheet_name='GAM Period')

    # site info - with api query
    site_info = pd.read_excel(lookup_book, sheet_name='Site_API')

    # platform codes
    platform_codes = pd.read_excel(lookup_book, sheet_name='PlatformID')

    # service codes
    service_codes = pd.read_excel(lookup_book, sheet_name='ServiceID')

    # language service map
    service_language_map = pd.read_excel(lookup_book, sheet_name='Site_language')

    # non js
    non_js_map = pd.read_excel(lookup_book, sheet_name='Site_NonJS')

    # app
    app_map = pd.read_excel(lookup_book, sheet_name='Site_App')

# Align lookup column names with the field names used in the API returns.
country_codes = country_codes.rename(columns={'ATI': 'geo_country'})
service_codes = service_codes.rename(columns={'ATI (Level 2 site)': 'site_level2'})

# Keep only the reports this script is responsible for; 'Report No.' is stored
# as text so it can be joined against the number parsed from the file names.
site_info = site_info.drop(columns='no results')
site_info['Report No.'] = site_info['Report No.'].astype(str)
site_info = site_info[site_info['script'] == '1_site_ingestion']


## functions

# ingestion
## Chartbeat


## Piano

In [4]:

# Sanity check before ingestion: every 'Report No.' in site_info must be unique,
# because the report number is later used to name and re-identify the output files.
# NOTE(review): '1_Site_1' and 'initial api query list' look like the test id and
# step label written to the logbook — confirm against test_functions.
test_functions.site_test_unique_entries(site_info, 'Report No.', '1_Site_1', 'initial api query list')

Pass - All numbers in the column 'Report No.' are unique.
...updating logbook...



In [5]:
# Download every configured report for every reporting week and store each
# (report, week) result as its own CSV. Files that already exist are skipped,
# so the cell can be re-run to resume an interrupted ingestion.
output_dir = "../data/raw/site/piano_reports"
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders

for _, report_row in site_info.iterrows():

    api_query = report_row['API']
    api_query_key = api_key[report_row['api_key']]
    report_no = report_row['Report No.']

    print(f"starting report no {report_no}")
    print(api_query)

    # A distinct name for the inner row keeps the report row from being shadowed.
    for _, week_row in week_tester.iterrows():
        week_number = week_row['Week Number']
        filename = f"{output_dir}/{gam_info['file_timeinfo']}_reportNo{report_no}_weekNo{week_number}.csv"

        # Check if the file exists, if so, continue to the next iteration
        if os.path.exists(filename):
            continue

        print(f"... iteration {filename}")
        start = week_row['w/c']
        end = week_row['week_ending']

        # convert to api query
        query = functions.convert_url_to_query(api_query, start, end)

        # run api query
        temp = functions.api_call(query, api_query_key)

        query_run_timestamp = datetime.now().strftime('%y%m%d-%H%M')

        if temp.shape[0] == 0:
            # No rows returned: write a one-row placeholder so the week is not
            # queried again. The placeholder deliberately has exactly three
            # columns and stores the query under 'api_query' (not 'API'); the
            # processing step relies on both to recognise empty reports.
            temp = pd.DataFrame({
                'w/c': [start],
                'timestamp_queryRun': [query_run_timestamp],
                'api_query': [api_query]
            })
        else:
            temp['w/c'] = start
            temp['timestamp_queryRun'] = query_run_timestamp
            temp['API'] = api_query

        temp.to_csv(filename, index=False)

    print(f"finished report no {report_no}")

starting report no 12
https://api.atinternet.io/v3/data/getData?param={"columns":["geo_country","m_unique_visitors"],"sort":["-m_unique_visitors"],"space":{"s":[598340,598342,598346]},"period":{"p1":[{"type":"D","start":"START_DATE","end":"END_DATE"}]},"max-results":10000,"page-num":1}
finished report no 12
starting report no 13
https://api.atinternet.io/v3/data/getData?param={"columns":["geo_country","device_type","m_unique_visitors"],"sort":["-m_unique_visitors"],"space":{"s":[598340,598342,598346]},"period":{"p1":[{"type":"D","start":"START_DATE","end":"END_DATE"}]},"max-results":10000,"page-num":1}
finished report no 13
starting report no 14
https://api.atinternet.io/v3/data/getData?param={"columns":["geo_country","m_unique_visitors"],"sort":["-m_unique_visitors"],"space":{"s":[598340,598342]},"period":{"p1":[{"type":"D","start":"START_DATE","end":"END_DATE"}]},"max-results":10000,"page-num":1}
finished report no 14
starting report no 15
https://api.atinternet.io/v3/data/getData?pa

In [6]:
# Test whether any report returns more than 10,000 rows, to check that the pagination works.
    # yes, it works: great, well done!
    # no, it doesn't: rerun all queries that are 10,000 rows long
# Result: yes, it works, and no report is larger than 200'000 rows (the largest has ~18k rows)

# Analysis 

## Chartbeat vs Piano

In [12]:
# build at home

# Processing 

In [8]:
# Read every per-report / per-week CSV of the current run and stack them into
# one frame, tagging each row with its source file and report number.
filepath = "../data/raw/site/piano_reports/"
all_files, empty_report_list = [], []
size = 0  # row count of the largest file seen so far

# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# row order of the combined frame reproducible.
for file in tqdm(sorted(os.listdir(filepath))):

    # Only files whose name contains the current run identifier belong to this run.
    if gam_info['file_timeinfo'] not in file:
        continue

    temp = pd.read_csv(os.path.join(filepath, file))

    # Placeholder files written for empty API returns have exactly three columns.
    if len(temp.columns) == 3:
        empty_report_list.append(file)

    # measuring how many rows the largest file has
    if temp.shape[0] > size:
        size = temp.shape[0]

    temp['filename'] = file
    # File names look like <timeinfo>_reportNo<N>_weekNo<W>.csv; keep the digits of
    # the second underscore-separated part. Raw string: '\d' is not a valid escape
    # in a normal string literal.
    parts = file.split('_')
    temp['Report No.'] = parts[1]
    temp['Report No.'] = temp['Report No.'].str.extract(r'(\d+)')[0]

    all_files.append(temp)

print(f"largest file is {size} rows long")
# TODO: persist empty_report_list if needed (it is a plain list, so wrap it in a
# pd.Series before calling .to_csv).

if not all_files:
    raise ValueError(
        f"no report files containing '{gam_info['file_timeinfo']}' found in {filepath}"
    )

combined_df = pd.concat(all_files, ignore_index=True)

# Normal files carry the query in 'API', empty placeholders in 'api_query';
# fold both into a single 'API' column.
if 'API' not in combined_df.columns:
    print('adding API')
    # Must be missing values (not ''), otherwise the fillna below cannot populate it.
    combined_df['API'] = None
if 'api_query' in combined_df.columns:
    # 'api_query' only exists when at least one placeholder file was read.
    combined_df['API'] = combined_df['API'].fillna(combined_df['api_query'])
    combined_df = combined_df.drop(columns=['api_query'])
combined_df['w/c'] = pd.to_datetime(combined_df['w/c'])

100%|██████████████████████████████████████| 3710/3710 [00:35<00:00, 103.92it/s]


largest file is 18254 rows long


In [9]:
# Attach report context and week metadata to the combined API returns, then
# export the result as the raw Piano data set.

# test all reports are there
test_functions.test_inner_join(site_info, combined_df, ['Report No.', 'API'],
                               '1_Site_2', 'adding report context info', focus='left')

# add report info
full_df = site_info.merge(combined_df, on=['Report No.', 'API'], how='inner')

# test all weeks are there
test_functions.test_weeks_presence_per_account('w/c', 'Report No.', full_df, week_tester,
                                               '1_Site_3', test_step='combining api returns')

# add week_lookup data
full_df = full_df.merge(week_tester[['YearGAE', 'Week Number', 'w/c']], on='w/c', how='left')
# excluded: 'API', 'timestamp_queryRun', 'filename', 'Year',
cols = ['Category', 'Report No.', 'Space', 'Description',
        'YearGAE', 'Week Number', 'w/c',
        'site_level2', 'geo_country', 'm_unique_visitors', 'm_page_loads',
        'device_type', 'app_name', 'language', 'producer_nonjs', 'src']
# .copy(): the column subset is modified below; writing into the bare slice
# triggers pandas' SettingWithCopyWarning.
full_df = full_df[cols].copy()


# Columns that are written out as text. Only the keys are used: every listed
# column is converted to str, with missing values becoming ''.
dtype_spec = {
    #'m_unique_visitors': int,
    'Report No.': str,
    'device_type': str,
    'app_name': str,
    'language': str,
    'producer_nonjs': str,
    'src': str
}

# Convert the listed columns to strings, mapping missing values to ''
for column in dtype_spec:
    if column in full_df.columns:
        full_df[column] = full_df[column].apply(lambda x: str(x) if pd.notnull(x) else '')

full_df.to_csv(f"../data/raw/{gam_info['file_timeinfo']}_rawDataFromPiano.csv", index=False)

Inner join test 1_Site_2 successful: No issues found.
...updating logbook...

All weeks are present in the dataset for each group.
...updating logbook...



In [10]:
# Spot-check: display one randomly chosen row of the exported frame.
full_df.sample()

Unnamed: 0,Category,Report No.,Space,Description,YearGAE,Week Number,w/c,site_level2,geo_country,m_unique_visitors,m_page_loads,device_type,app_name,language,producer_nonjs,src
1538551,Reach Calc.,20,WSLS-Direct-C,World Service Languages - Service Direct by Country,2025,8,2025-02-17,UK China,Viet nam,116.0,,,,,,


In [11]:
# List the distinct report numbers present in the exported frame.
full_df['Report No.'].unique()

array(['12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
       '23', '24', '25', '26', '27', '28', '29', '30', '31', '100'],
      dtype=object)