In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine
import boto3
import pytz

# Define constants and configurations
S3_STAGING_DIR = "s3://ets-aws-plalab-dii-prod-analyticsbucket-1ktrlhzbrcbkb/athena_query_results/"
ATHENA_REGION = "us-east-1"
START_DATE = pytz.utc.localize(datetime(2024, 2, 1))
END_DATE = pytz.utc.localize(datetime(2024, 5, 27))

# SKU lists
NON_PREP_SKUS = [
    '-2002', '-2001', '-1111', '4001', '4002', '4003', '4004', '4005', '4006', '4007', '4008', '4009', '4010', '4011',
    '4012', '4013', '4014', '4015', '4016', '4017', '4018', '4019', '4020', '4028', '4034', '4035', '4091', '4092', '4093',
    '4321', '5000', '5001', '5002', '5003', '5004', '5005', '5006', '5007', '5009', '5010', '5020', '5021', '5022', '5023',
    '5024', '5025', '5026', '5027', '5028', '5029', '5030', '5031', '5032', '5033', '5035', '5037', '5040', '5041', '5042',
    '5043', '5044', '5045', '5046', '5047', '5048', '5049', '5050', '5051', '5052', '5053', '5054', '5055', '5056', '5057',
    '5058', '5059', '5060', '5061', '5062', '5063', '5064', '5065', '5066', '5067', '5068', '5069', '5070', '5071', '5072',
    '5073', '5074', '5075', '5076', '5077', '5078', '5079', '5080', '5081', '5082', '5083', '5084', '5085', '5086', '5087',
    '5088', '5089', '5091', '5093', '5094', '5095', '5096', '5097', '5098', '5099', '5100', '5101', '5102', '5103', '5104',
    '5105', '5106', '5107', '5108', '5109', '5110', '5111', '5112', '5113', '5114', '5115', '5116', '5117', '5118', '5119',
    '5120', '5121', '5122', '5123', '5124', '5125', '5126', '5127', '5128', '5129', '5130', '5131', '5132', '5133', '5134',
    '5135', '5136', '5137', '5138', '5139', '5140', '5142', '5143', '5144', '5145', '5146', '5147', '5148', '5149', '5150',
    '5151', '5152', '5154', '5155', '5156', '5157', '5159', '5160', '5161', '5162', '5163', '5164', '5165', '5166', '5167',
    '5168', '5169', '5170', '5171', '5172', '5173', '5174', '5175', '5176', '5177', '5178', '5179', '5180', '5181', '5182',
    '5183', '5184', '5185', '5186', '5187', '5188', '5189', '5190', '5191', '5192', '5193', '5194', '5195', '5196', '5197',
    '5198', '5199', '5200', '5600', '5601', '5602', '5603', '5604', '5605', '5606', '5607', '5608', '5609', '5610', '5611',
    '5612', '5613', '5614', '5615', '5616', '5617', '5618', '5619', '5620', '5621', '5622', '5624', '5625', '5626', '5627',
    '5628', '5629', '5630', '5631', '5632', '5633', '5634', '5635', '5636', '5637', '5638', '5639', '5640', '5641', '5642',
    '5643', '5644', '5645', '5646', '5647', '5648', '5649', '5650', '5651', '5652', '5653', '5654', '5655', '5656', '5657',
    '5658', '5659', '5660', '5661', '5662', '5663', '5664', '5665', '5666', '5667', '5668', '5669', '5670', '5671', '5672',
    '5673', '5674', '5675', '5676', '5677', '5678', '5679', '5680', '5681', '5682', '5683', '5684', '5685', '5686', '5687',
    '5688', '5689', '5690', '5691', '5692', '5693', '5694', '5695', '5696', '6001', '6002', '6003', '6004', '6005', '6009',
    '6010', '6011', '6012', '6013', '6014', '6015', '6016', '6017', '6018', '100155', '100156', '100206', '100207'
]
SKU_ID_REGISTRATION = ['4001', '4017', '6010', '6017']

def create_engine_connection():
    """
    Create a connection to the Amazon Athena database using SQLAlchemy.

    Returns:
        engine (SQLAlchemy engine): The engine object to interact with the Athena database.
    """
    connection_string = f"awsathena+rest://:@athena.{ATHENA_REGION}.amazonaws.com:443/labsprodeventsdatabase-x806vjuzpbrd?s3_staging_dir={S3_STAGING_DIR}"
    engine = create_engine(connection_string)
    return engine

def pull_parquet_files_pandas_features(file_name, bucket_name='ets-dii-testready-ds-analytics'):
    """
    Pulls Parquet files from a specified S3 bucket and concatenates them into a single DataFrame.

    Args:
        file_name (str): The name of the file (or file identifier) to search for in the S3 bucket.
        bucket_name (str, optional): The name of the S3 bucket to search in. Defaults to 'ets-dii-testready-ds-analytics'.

    Returns:
        pandas.DataFrame: A DataFrame containing the concatenated data from all found Parquet files.
    """
    s3_client = boto3.client('s3')
    prefix = f'features/file_name={file_name}/'

    def list_parquet_files(bucket, prefix):
        """List all Parquet files within the structured partitioning scheme in S3."""
        file_paths = []
        paginator = s3_client.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for content in page.get('Contents', []):
                key = content.get('Key')
                if key.endswith('.parquet'):  # Ensure we're only capturing Parquet files
                    file_paths.append(f"s3://{bucket}/{key}")
        return file_paths

    file_paths = list_parquet_files(bucket_name, prefix)

    # Initialize an empty DataFrame
    df = pd.DataFrame()

    for file_path in file_paths:
        # Read each Parquet file into a DataFrame
        temp_df = pd.read_parquet(file_path, engine='pyarrow')
        # Concatenate to the main DataFrame
        df = pd.concat([df, temp_df], ignore_index=True)
    return df

def filter_dataframe_by_date(df, start_date, end_date, date_column='submit_datetime'):
    """
    Filters a DataFrame to include only rows where the date in date_column is between start_date and end_date.

    Args:
        df (pandas.DataFrame): The DataFrame to filter.
        start_date (datetime): The start date for the filter.
        end_date (datetime): The end date for the filter.
        date_column (str, optional): The name of the column containing datetime values. Defaults to 'submit_datetime'.

    Returns:
        pandas.DataFrame: The filtered DataFrame.
    """
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
    # Ensure all datetimes are timezone-aware and converted to UTC
    df[date_column] = df[date_column].apply(lambda x: x.tz_convert('UTC') if x.tzinfo is not None else x.tz_localize('UTC'))
    filtered_df = df[(df[date_column] >= start_date) & (df[date_column] <= end_date)]
    return filtered_df

def process_order_data(df, non_prep_skus, sku_id_registration):
    """
    Processes the order data to filter out non-prep SKUs and identify registration markers using vectorized methods.

    Args:
        df (pandas.DataFrame): The DataFrame containing the raw order data.
        non_prep_skus (list): List of SKUs to be considered as non-prep.
        sku_id_registration (list): List of SKU IDs used for registration.

    Returns:
        pandas.DataFrame: The processed DataFrame with filtered order data.
    """
    # Convert SKU lists to sets for faster membership checking
    non_prep_skus_set = set(map(int, [sku for sku in non_prep_skus if sku.isdigit()]))
    sku_id_registration_set = set(map(int, [sku for sku in sku_id_registration if sku.isdigit()]))
    sku_id_fees = non_prep_skus_set - sku_id_registration_set

    # Ensure 'skuid' is numeric and filter out invalid skus
    df['skuid'] = pd.to_numeric(df['skuid'], errors='coerce')
    df = df.dropna(subset=['skuid'])
    df['skuid'] = df['skuid'].astype(int)

    # Filter out sku_id_fees and keep valid SKUs
    df = df[~df['skuid'].isin(sku_id_fees)]

    # Add registration_marker column
    df['registration_marker'] = df['skuid'].isin(sku_id_registration_set).astype(int)

    # Convert 'submit_datetime' to datetime and ensure timezone-aware
    df['submit_datetime'] = pd.to_datetime(df['submit_datetime'], errors='coerce')
    df['submit_datetime'] = df['submit_datetime'].apply(lambda x: x.tz_convert('UTC') if x.tzinfo is not None else x.tz_localize('UTC'))

    # Filter out orders with 'fulfillmenttype' containing 'RESCHEDULE'
    if 'fulfillmenttype' in df.columns:
        df['fulfillmenttype'] = df['fulfillmenttype'].astype(str)
        df = df[~df['fulfillmenttype'].str.contains('RESCHEDULE', case=False, na=False)]

    # Filter to keep only rows where registration_marker is 0
    df = df[df['registration_marker'] == 0]

    # Select only the required columns
    df = df[['user_id', 'order_id', 'submit_datetime', 'skuid', 'registration_marker']]

    # Sort and drop duplicates to keep only the first order for each user
    df.sort_values(by=['user_id', 'submit_datetime'], ascending=[True, True], inplace=True)
    df = df.drop_duplicates(subset='user_id', keep='first')

    return df

def main():
    """
    Main function to execute the data fetching, processing, and saving operations.
    """
    print("Fetching order data...")
    parquet_data = pull_parquet_files_pandas_features(file_name='eReg_order_items')
    
    # Filter the data by the specified date range
    filtered_data = filter_dataframe_by_date(parquet_data, START_DATE, END_DATE)
    
    # Process the filtered order data
    processed_order_data = process_order_data(filtered_data, NON_PREP_SKUS, SKU_ID_REGISTRATION)
    
    # Save the processed order data DataFrame to a CSV file for verification
    processed_order_data.to_csv("order_data.csv", index=False)
    print("Order data fetched and saved to CSV.")

if __name__ == "__main__":
    main()

Fetching order data...


  df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['skuid'] = pd.to_numeric(df['skuid'], errors='coerce')


Order data fetched and saved to CSV.
