In [None]:
import os
import shutil
import sys
import tarfile

import boto3
import numpy as np
import pandas as pd
import xgboost as xgb

def download_model_from_s3(bucket_name, s3_file_path, local_file_path):
    """Download a single S3 object to a local file.

    Args:
        bucket_name: Name of the S3 bucket that holds the object.
        s3_file_path: Key of the object inside the bucket.
        local_file_path: Local filesystem path the object is written to.
    """
    # A fresh client is created on every call, using boto3's default
    # credential and region resolution.
    boto3.client('s3').download_file(
        Bucket=bucket_name,
        Key=s3_file_path,
        Filename=local_file_path,
    )

def extract_model(tar_file_path, extract_path):
    """Extract a gzip-compressed tar archive into ``extract_path``.

    The archive is treated as untrusted input: members that would be written
    outside ``extract_path`` (``..`` components, absolute names, links that
    point outside the destination) or that are device nodes are rejected
    instead of being extracted.

    Args:
        tar_file_path: Path to the ``.tar.gz`` archive to unpack.
        extract_path: Directory the archive members are extracted into.

    Raises:
        tarfile.TarError: If the archive is not a readable gzip tarball or
            contains a member that would escape ``extract_path``.
    """
    with tarfile.open(tar_file_path, 'r:gz') as tar:
        # Interpreters that ship extraction filters can do the validation
        # themselves; 'data' refuses path traversal, escaping links and
        # special files.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=extract_path, filter='data')
            return

        # Fallback for interpreters without extraction filters: validate
        # every member before anything is written to disk.
        root = os.path.realpath(extract_path)

        def escapes_root(*parts):
            resolved = os.path.realpath(os.path.join(root, *parts))
            return os.path.commonpath([root, resolved]) != root

        for member in tar.getmembers():
            unsafe = (
                escapes_root(member.name)
                or member.isdev()
                # Symlink targets are relative to the link's own directory.
                or (member.issym() and escapes_root(
                    os.path.dirname(member.name), member.linkname))
                # Hard-link targets are relative to the archive root.
                or (member.islnk() and escapes_root(member.linkname))
            )
            if unsafe:
                raise tarfile.TarError(
                    f"Refusing to extract unsafe archive member: {member.name!r}"
                )

        tar.extractall(path=extract_path)

def load_model(model_path):
    """Build an ``xgb.Booster`` from the model file at ``model_path``.

    Args:
        model_path: Path to the serialized model file.

    Returns:
        The ``xgb.Booster`` constructed from that file.
    """
    booster = xgb.Booster(model_file=model_path)
    return booster

def preprocess_data(df):
    """Reduce ``df`` to the model's feature columns and fill gaps with 0.

    The feature set is ``TX_AMOUNT``, the ``yyyy``/``mm``/``dd`` date parts,
    and, for each of ``customer_id`` and ``terminal_id``, a transaction count
    (``nb_txns``) and average amount (``avg_amt``) column per rolling window.

    Args:
        df: DataFrame that must contain every expected feature column; any
            other columns are ignored.

    Returns:
        A new DataFrame holding only the expected features, in the fixed
        order built below, with missing values replaced by 0.

    Raises:
        ValueError: If one or more expected feature columns are absent.
    """
    windows = ('15min', '30min', '60min', '1day', '7day', '15day', '30day')

    # Column order is part of the contract: base columns first, then for each
    # entity all count windows followed by all average-amount windows.
    feature_names = ['TX_AMOUNT', 'yyyy', 'mm', 'dd']
    for entity in ('customer_id', 'terminal_id'):
        for stat in ('nb_txns', 'avg_amt'):
            feature_names.extend(
                f'{entity}_{stat}_{window}_window' for window in windows
            )

    # Fail early, naming every column the input lacks.
    absent = set(feature_names) - set(df.columns)
    if absent:
        raise ValueError(f"Missing features in input data: {absent}")

    # Selecting with a list yields a new frame, so the caller's df is untouched.
    return df[feature_names].fillna(0)


def predict(model, input_data):
    """Run ``model`` on ``input_data`` and return its predictions.

    Args:
        model: Object exposing ``predict(dmatrix)``; the script passes the
            ``xgb.Booster`` returned by ``load_model``.
        input_data: Data accepted by the ``xgb.DMatrix`` constructor; the
            script passes the DataFrame returned by ``preprocess_data``.

    Returns:
        Whatever ``model.predict`` returns for the constructed ``DMatrix``.
    """
    return model.predict(xgb.DMatrix(input_data))

# S3 location of the model archive and the local scratch paths it is staged in
bucket_name = 'sagemaker-us-east-2-386900942011'
s3_file_path = 'cpu-job-2024-10-21-18-14-03-937/output/model.tar.gz'
local_file_path = '/tmp/model.tar.gz'
extract_path = '/tmp/model'

# Everything that touches the staged files runs inside try/finally so the
# scratch files are removed even when a step fails or the script exits early.
try:
    # Download and extract the model
    download_model_from_s3(bucket_name, s3_file_path, local_file_path)
    extract_model(local_file_path, extract_path)

    # Load the model
    model = load_model(os.path.join(extract_path, 'model.xgb'))

    # Load your Parquet file
    parquet_file_path = 's3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_0.parquet'
    df = pd.read_parquet(parquet_file_path)

    print("Columns in the Parquet file:")
    print(df.columns.tolist())

    # Preprocess the data; a missing feature column is reported and ends the
    # run with a non-zero exit status.
    try:
        preprocessed_data = preprocess_data(df)
    except ValueError as e:
        print(f"Error: {e}")
        # sys.exit instead of the site-provided exit(), which is meant for
        # interactive sessions and is not guaranteed to exist in scripts.
        sys.exit(1)

    # Make predictions
    predictions = predict(model, preprocessed_data)

    # Add predictions to the original dataframe
    df['predictions'] = predictions

    # Display the labels and identifiers next to the predictions
    print("\nSample predictions:")
    print(df[['TX_FRAUD_1', 'TERMINAL_ID_index', 'merchant_index', 'predictions']])
finally:
    # Clean up. The download may have failed before the archive was written,
    # so a missing file is not an error here.
    try:
        os.remove(local_file_path)
    except FileNotFoundError:
        pass
    # rmtree copes with archives that contain more than model.xgb, which
    # os.rmdir would refuse to delete; ignore_errors covers the case where
    # extraction never created the directory.
    shutil.rmtree(extract_path, ignore_errors=True)