from _ast import Load

# Simple notebook to process generic webaccess files. 

Assumptions:
1. There are one or more csv files with access logs in the given directory. 
1. Each file can be big - 20k or more rows to process

Approach: 
1. Load each csv in the named directory with the given file spec (glob style)
1. Combine as we go into a master df. If the size becomes too large, then process each file one at a time and collect the stats as we go.



In [None]:
import pandas as pd
import logging
from datetime import date, timedelta
import numpy as np
from datetime import date, datetime
import seaborn as sns
import os
import glob
from df_util import check_df

# Notebook-wide logging: bare messages only, DEBUG level and above.
logging.basicConfig(format="%(message)s", level=logging.DEBUG)
LOGGER = logging.getLogger(__name__)
# Report the level this logger actually resolves to (%s stringifies it).
LOGGER.debug("log level = %s", LOGGER.getEffectiveLevel())


In [None]:
# Glob pattern selecting the access-log CSV files to load.
FILE_PATTERN = "../data/webaccess/web_access_*.csv"

# Columns that must be in the CSV.
REQUIRED_COLUMNS = ['date', 'user', 'url']
# Passed to check_df as required_values. Same contents as REQUIRED_COLUMNS,
# but an independent copy so that editing one list in place does not
# silently change the other.
REQUIRED_VALUES = list(REQUIRED_COLUMNS)

# Date for analysis.
DATE_AS_OF = date.fromisoformat('2023-06-01')

# For data visualization - date range sizes.
DATE_GROUP_DAYS = 30
CUTOFF_DAYS = 90


In [None]:
# Determine which files match the file spec - this doesn't load them.
# glob makes no ordering guarantee, so the matches are sorted to keep the
# load order (and therefore the combined row order) reproducible between runs.
file_names = sorted(glob.glob(FILE_PATTERN))
if not file_names:
    # Nothing to analyse: log the problem and abort the notebook run.
    msg = f"ERROR: No file names found for pattern (\"{FILE_PATTERN}\"). Stopping"
    LOGGER.error(msg)
    raise ValueError(msg)

LOGGER.info("%d files met the file pattern:[%s]", len(file_names), FILE_PATTERN)
    

In [None]:
# Load every matching CSV, then combine them with one concat at the end.
# Concatenating inside the loop re-copies all previously loaded rows on each
# iteration; collecting the frames first avoids that repeated copying.
#TODO - skip any non value rows from top of CSV.
frames = []
rows_loaded = 0
for i, file_name in enumerate(file_names):
    df_tmp = pd.read_csv(file_name)
    frames.append(df_tmp)
    rows_loaded += len(df_tmp)
    LOGGER.debug("file %d:[%s]: Loaded %d rows", i, file_name, len(df_tmp))
    if i > 0:
        # Running total: the length the combined frame has up to this file.
        LOGGER.debug(" after concat: df len= %d rows", rows_loaded)

if not frames:
    # No files were loaded; keep the "nothing loaded" marker value.
    df = None
elif len(frames) == 1:
    # A single file is used as-is, without copying it.
    df = frames[0]
else:
    # Stack row-wise. Per-file index labels are kept, so labels may repeat
    # across files.
    df = pd.concat(frames, axis=0)

# Drop the list so it does not keep the per-file frames alive.
del frames

In [None]:
# Preview the first rows of the combined DataFrame.
df.head()

In [None]:
# Check the dataframe for missing blocks. err is empty if there are no
# errors - otherwise it is an error string.
err = check_df(df, required_columns=REQUIRED_COLUMNS, required_values=REQUIRED_VALUES)
if err:
    # Abort here: the following cells read the 'date', 'user' and 'url'
    # columns, so continuing with an invalid frame would only fail later.
    msg = str(err)
    LOGGER.error(msg)
    raise ValueError(msg)

LOGGER.info("Loaded %d clean rows", len(df))

In [None]:
# Convert the 'date' column from YYYY-MM-DD strings to datetime.date objects.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date

# Earliest date in the data; also the base for the disabled days_since column.
dmin = df['date'].min()
LOGGER.info("Date-range min: %s", dmin)
LOGGER.info("Date-range max: %s", df['date'].max())
#df['days_since'] = df['date'].apply(lambda x: (x-dmin).days)

In [None]:
################################
# URL Access - number of log rows per url, most accessed first.
# reset_index(name='count') already yields the columns ['url', 'count'].
df_url_count = (
    df.groupby(['url'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
df_url_count.head(10)



In [None]:
################################
# URL Access by user
# For each (user, url) pair, count the number of log rows, largest first.
# Repeat accesses by the same user on the same day are all counted.
# reset_index(name='count') already yields the columns ['user', 'url', 'count'].
df_url_by_user = (
    df.groupby(['user', 'url'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
df_url_by_user.head(10)