In [1]:
import datetime, requests, json, time
import os
import pandas as pd

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

# exporting from API

In [2]:
from export_matomo import extract_entry

In [11]:
# Base URL of the Matomo instance whose reporting API is queried.
PRODUCTION_DOMAIN = 'https://main.matomo.qgiv.com'

# Query-string template for Live.getLastVisitsDetails. Positional placeholders,
# in order: date, auth token, page size (filter_limit), row offset (filter_offset).
URL = '/?module=API&method=Live.getLastVisitsDetails&idSite=1'
URL += '&period=day&date={}&format=json&'
URL += 'token_auth={}&filter_limit={}&filter_offset={}'

# The API token is a credential, so it is read from the environment instead of
# being committed with the notebook.
# NOTE(review): the token that was previously hardcoded here has been exposed
# and should be rotated in Matomo.
AUTH_TOKEN = os.environ.get('MATOMO_AUTH_TOKEN', '')
if not AUTH_TOKEN:
    print("WARNING: MATOMO_AUTH_TOKEN is not set; Matomo API requests will not authenticate")

In [4]:
def store_matomo_file(data, filename):
    """Clean a batch of extracted Matomo page views and save it to S3 as one file.

    Args:
        data: records accepted by ``pd.DataFrame`` (one per page view) that
            provide at least the columns listed in ``cols`` below.
        filename: object name to write inside the "ingest-records" bucket.

    Returns:
        None. When ``data`` yields no rows nothing is written and a notice is
        printed instead.
    """
    df = pd.DataFrame(data).drop_duplicates()

    # Check for emptiness before touching any column: a frame built from no
    # records has no columns, so the transforms below would raise KeyError
    # and the notice would never be reached.
    if len(df) == 0:
        print("\tno data to store in S3")
        return

    # Timestamps arrive as epoch seconds.
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    # Non-numeric or missing ids are stored as 0.
    df['form'] = pd.to_numeric(df['form'], errors='coerce').fillna(0).astype(int)
    df['org'] = pd.to_numeric(df['org'], errors='coerce').fillna(0).astype(int)
    # The file is later loaded with a plain comma delimiter, so commas and
    # quotes are stripped from free-text fields (literal match, not regex).
    df['url'] = df['url'].str.replace(',', '', regex=False)
    df['url'] = df['url'].str.replace('"', '', regex=False)
    # NOTE(review): 290 presumably matches the target column width - confirm.
    df['url'] = df['url'].str.slice(0, 290)
    df['referrerName'] = df['referrerName'].str.replace(',', '', regex=False)

    # Column order written to the file.
    cols = ['id', 'ip', 'visitorId', 'visitDuration',
            'actions', 'referrerType', 'referrerName',
            'deviceType', 'deviceBrand', 'deviceModel',
            'operatingSystemName', 'browser', 'url',
            'timeSpent', 'pageviewPosition',
            'timestamp', 'org', 'form']
    save_dataframe_to_file("ingest-records", filename, df[cols], print_output=False)

In [7]:
def backfill_month(month=None, year=None, limit=2000, max_retries=10, timeout=300):
    """Export one calendar month of Matomo visit logs to S3, one day at a time.

    For every day of the month, pages through the Live API ``limit`` visits
    at a time, passes each visit to ``extract_entry`` and accumulates the rows
    it returns. Every 100 full pages the buffer is flushed through
    ``store_matomo_file`` to a numbered chunk (``matomo_update.<date>.<n>.csv``);
    whatever remains at the end of the day goes to ``matomo_update.<date>.csv``.

    Args:
        month: calendar month to export. Required.
        year: year to export. Required.
        limit: page size sent to the API as ``filter_limit``.
        max_retries: consecutive error payloads tolerated for a single page
            before giving up.
        timeout: seconds passed to ``requests.get`` as the request timeout.

    Returns:
        None on success. If ``extract_entry`` raises for a visit, the decoded
        API page containing that visit is returned so it can be inspected.

    Raises:
        ValueError: if ``month`` or ``year`` is missing.
        RuntimeError: if the API keeps answering with an error payload.
    """
    if month is None or year is None:
        raise ValueError("Bad input: month or year")

    current_date = datetime.datetime(year=year, month=month, day=1)
    while current_date.month == month:
        page_views = []
        date_param = "{:%Y-%m-%d}".format(current_date)
        offset = 0

        print("starting to export {}".format(date_param))
        counter = 1
        file_counter = 1
        failures = 0
        while True:
            url = URL.format(date_param, AUTH_TOKEN, limit, offset)
            # A timeout keeps an unreachable host from hanging the export.
            rsp = requests.get(PRODUCTION_DOMAIN + url, timeout=timeout)
            data = json.loads(rsp.text)

            if isinstance(data, dict) and data.get('result') == 'error':
                # likely memory issue, sleep for 5 seconds and try again,
                # but give up eventually instead of looping forever
                failures += 1
                if failures > max_retries:
                    raise RuntimeError(
                        "Matomo API kept returning an error for {} (offset {}): {}".format(
                            date_param, offset, data.get('message')))
                time.sleep(5)
                continue
            failures = 0

            for visitor in data:
                try:
                    page_views += extract_entry(visitor)
                except Exception as e:
                    # str() first: a visit decoded from JSON cannot be sliced directly.
                    print("\t\t{}: {}".format(e, str(visitor)[:10]))
                    # Hand the offending page back to the caller for debugging.
                    return data

            if len(data) < limit:
                # end of day's data
                break

            if counter % 100 == 0 and page_views:
                # Flush periodically so a busy day does not accumulate in memory.
                # Assumes every extracted row is a mapping with a 'timestamp'
                # key, which store_matomo_file also relies on.
                timestamps = [entry['timestamp'] for entry in page_views]
                print("\titeration {} complete, storing {:,} entries to S3 ({} - {})".format(
                    counter, len(page_views), min(timestamps), max(timestamps)))
                filename = "matomo_update.{}.{}.csv".format(date_param, file_counter)
                store_matomo_file(page_views, filename)
                file_counter += 1
                page_views = []

            offset += limit
            counter += 1

        # store days data
        if len(page_views) > 0:
            print("\tstoring {:,} visitor logs".format(len(page_views)))

            filename = "matomo_update.{}.csv".format(date_param)
            store_matomo_file(page_views, filename)
        elif file_counter == 1:
            # Only an error when no chunk was flushed for this day either.
            print("\tERROR: no visitor logs on {}".format(date_param))

        # increment date
        current_date += datetime.timedelta(days=1)

    print("DONE")

In [10]:
# Run the February 2023 export and keep whatever backfill_month returns so the
# next cell can inspect it.
error_entry = backfill_month(month=2, year=2023)

starting to export 2023-02-01


ConnectionError: HTTPConnectionPool(host='main.matomo.qgiv.com', port=80): Max retries exceeded with url: /?module=API&method=Live.getLastVisitsDetails&idSite=1&period=day&date=2023-02-01&format=json&token_auth=<REDACTED>&filter_limit=2000&filter_offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f03b80791d0>: Failed to establish a new connection: [Errno 110] Connection timed out'))

In [None]:
# Debugging aid: show the first 50 items of the value returned by backfill_month.
error_entry[:50]

# loading to redshift

In [4]:
# Names of the objects in the "ingest-records" bucket that match 'matomo'.
# list_files is presumably provided by the s3_support star import - confirm.
filenames = list_files("ingest-records", search_key='matomo', print_output=False)

In [6]:
bucket_name = 'ingest-records'

# Read every export first and concatenate once: calling pd.concat inside the
# loop recopies the growing frame on each iteration (quadratic in total rows).
frames = [get_dataframe_from_file(bucket_name, f) for f in filenames]

# df stays None when there is nothing to load, as it did before.
df = pd.concat(frames) if frames else None

In [11]:
# Earliest and latest timestamp across the loaded exports. The values are
# plain strings here (see the displayed output), and later cells rely on that.
min_date, max_date = df['timestamp'].min(), df['timestamp'].max()

(min_date, max_date)

('2022-11-01 00:00:00', '2022-12-31 23:59:58')

In [8]:
# Sanity check: total rows vs. distinct rows in the combined frame. The
# recorded output differs by 24, so a few exact duplicates exist in the files.
"{:,}".format(len(df)), "{:,}".format(len(df.drop_duplicates()))

('4,830,270', '4,830,246')

In [15]:
# Scratch check: min_date with its midnight time component stripped, leaving
# only the date part.
"{}".format(min_date.replace(' 00:00:00', ''))

'2022-11-01'

In [19]:
# delete current data
# Replace whole days: start at midnight of the first loaded day and stop just
# before midnight following the last one. Comparing with <= against the bare
# last date left rows later on that day in the table, so the COPY below loaded
# them a second time; stripping a literal ' 23:59:58' also only worked for
# that one exact maximum.
first_day = pd.to_datetime(min_date).normalize()
day_after_last = pd.to_datetime(max_date).normalize() + pd.Timedelta(days=1)
q = "delete from matomo_traffic where timestamp>='{}' and timestamp<'{}'".format(first_day.strftime('%Y-%m-%d'), day_after_last.strftime('%Y-%m-%d'))
redshift_query_write(q, schema='production')

In [20]:
# ingest S3 data
# COPY statement shared by every file; only the object name changes.
copy_template = '''copy matomo_traffic
            from 's3://ingest-records/{}'
            iam_role 'arn:aws:iam::637885584661:role/AWSRoleForRedshift'
            emptyasnull
            blanksasnull
            fillrecord
            delimiter ','
            ignoreheader 1
            region 'us-east-1';'''

for f in filenames:
    # Echo the file name so progress is visible while the loads run.
    print(f)
    q = copy_template.format(f)
    redshift_query_write(q, schema='production')

matomo_update.2022-11-01.csv
matomo_update.2022-11-02.csv
matomo_update.2022-11-03.1.csv
matomo_update.2022-11-03.csv
matomo_update.2022-11-04.1.csv
matomo_update.2022-11-04.csv
matomo_update.2022-11-05.csv
matomo_update.2022-11-06.csv
matomo_update.2022-11-07.1.csv
matomo_update.2022-11-07.csv
matomo_update.2022-11-08.1.csv
matomo_update.2022-11-08.csv
matomo_update.2022-11-09.1.csv
matomo_update.2022-11-09.csv
matomo_update.2022-11-10.1.csv
matomo_update.2022-11-10.csv
matomo_update.2022-11-11.1.csv
matomo_update.2022-11-11.csv
matomo_update.2022-11-12.1.csv
matomo_update.2022-11-12.csv
matomo_update.2022-11-13.1.csv
matomo_update.2022-11-13.csv
matomo_update.2022-11-14.1.csv
matomo_update.2022-11-14.csv
matomo_update.2022-11-15.1.csv
matomo_update.2022-11-15.csv
matomo_update.2022-11-16.1.csv
matomo_update.2022-11-16.2.csv
matomo_update.2022-11-16.csv
matomo_update.2022-11-17.1.csv
matomo_update.2022-11-17.2.csv
matomo_update.2022-11-17.csv
matomo_update.2022-11-18.1.csv
matomo_upda