# Notebook for updating dateset timestamps
Amazon Fraud Detector only retain 18 months of data for ingested events. This notebook provide functions to shift dateset timestamps to most recent months. 

In [8]:
import pandas as pd
from datetime import datetime, timezone, timedelta
import glob
import zipfile
import os
import glob

def update_timestamp(file):
    # Input: 
    #     file: file_path to csv 

    df = pd.read_csv(file,
                dtype='object',
                keep_default_na=False,
                na_values='')

    df['EVENT_TIMESTAMP'] = pd.to_datetime(df['EVENT_TIMESTAMP'])
    min_dt = min(df['EVENT_TIMESTAMP']).replace(tzinfo=None)
    max_dt = max(df['EVENT_TIMESTAMP']).replace(tzinfo=None)
    
    if 'LABEL_TIMESTAMP' in df.columns:
        df['LABEL_TIMESTAMP'] = pd.to_datetime(df['LABEL_TIMESTAMP'])
        min_dt = min(min_dt, df['LABEL_TIMESTAMP'].min().replace(tzinfo=None))
        max_dt = max(max_dt, df['LABEL_TIMESTAMP'].max().replace(tzinfo=None))
        
    print('Orignal dates')
    print(min_dt, max_dt)
    
    tz_info = max_dt.tzinfo

    assert max_dt-min_dt<timedelta(days=547)

    time_diff = datetime.now(tz_info)-max_dt-timedelta(days=1)

    df['EVENT_TIMESTAMP'] = df['EVENT_TIMESTAMP'] + time_diff
    print('Updated dates')
    print(df['EVENT_TIMESTAMP'].min(), df['EVENT_TIMESTAMP'].max())
    if 'LABEL_TIMESTAMP' in df.columns:
        df['LABEL_TIMESTAMP'] = df['LABEL_TIMESTAMP'] + time_diff
        print(df['LABEL_TIMESTAMP'].min(), df['LABEL_TIMESTAMP'].max())
        
    df['EVENT_TIMESTAMP'] = df['EVENT_TIMESTAMP'].dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    if 'LABEL_TIMESTAMP' in df.columns:
        df['LABEL_TIMESTAMP'] = df['LABEL_TIMESTAMP'].dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        
    
    df.to_csv(file,index=False)
    return 'SUCCESS'



### Under data folder

In [None]:
update_timestamp('registration_data_20K_full.csv')

update_timestamp('registration_data_20K_minimum.csv')

update_timestamp('transaction_data_100K_full.csv')

with zipfile.ZipFile("ato_data_800K_full.csv.zip","r") as zip_ref:
    zip_ref.extractall(".")
update_timestamp('ato_data_800K_full.csv')
zipfile.ZipFile('ato_data_800K_full.csv.zip', mode='w').write("ato_data_800K_full.csv", compress_type=zipfile.ZIP_DEFLATED)
os.remove('ato_data_800K_full.csv')

### Under demo_scripts/data folder

In [None]:
files = glob.glob('../demo_scripts/data/*')

In [10]:
for f in files:
    if f.endswith('.csv'):
        print(f)
        update_timestamp(f)
        print('====')

../demo_scripts/data/Registration_FakeAccountCreationByHumans_100k.csv
Orignal dates
2021-10-05 11:41:57 2022-10-05 17:18:53
Updated dates
2021-10-05 11:42:29.422949+00:00 2022-10-05 17:18:33.422949+00:00
2022-10-05 17:19:25.422949+00:00 2022-10-05 17:19:25.422949+00:00
====
../demo_scripts/data/Abuse_FreeTrialReferralAbuse_100k.csv
Orignal dates
2020-12-23 15:53:31 2021-12-23 21:30:21
Updated dates
2021-10-05 11:42:36.874333 2022-10-05 17:15:47.874333
2022-10-05 17:19:26.874333+00:00 2022-10-05 17:19:26.874333+00:00
====
../demo_scripts/data/Registration_FakeAccountCreationByBots_100k.csv
Orignal dates
2020-12-27 12:57:57 2021-12-27 18:43:03
Updated dates
2021-10-05 11:34:22.309072 2022-10-05 17:17:38.309072
2022-10-05 17:19:28.309072+00:00 2022-10-05 17:19:28.309072+00:00
====
../demo_scripts/data/Advertisement_AdClickFraud_20k.csv
Orignal dates
2021-01-26 14:54:14 2022-01-26 19:57:01
Updated dates
2021-10-05 12:16:42.411449 2022-10-05 16:39:34.411449
2022-10-05 17:19:29.411449+00:00