In [2]:
#Import libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import boto3
import sagemaker
import io
from sagemaker.session import Session
from sagemaker.s3 import S3Downloader, S3Uploader

# S3 Data Lake

In [3]:
# Install a process-wide "ignore" filter so no Python warnings are shown from here on.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings raised by
# the libraries used below — consider narrowing it by category or module.
import warnings 
warnings.filterwarnings('ignore') 

In [4]:
#Load train.csv into train_data and preview its first rows
# NOTE(review): the preview shows an "Unnamed: 0" column that mirrors the row index,
# so the CSV was presumably saved with its index — confirm whether
# read_csv(..., index_col=0) is wanted before later cells rely on the column layout.
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,bidder_id,payment_account,address,outcome
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,a3d2de7675556553a5f08e4c88d2c228754av,a3d2de7675556553a5f08e4c88d2c228vt0u4,0.0
1,624f258b49e77713fc34034560f93fb3hu3jo,a3d2de7675556553a5f08e4c88d2c228v1sga,ae87054e5a97a8f840a3991d12611fdcrfbq3,0.0
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,a3d2de7675556553a5f08e4c88d2c2280cybl,92520288b50f03907041887884ba49c0cl0pd,0.0
3,4bee9aba2abda51bf43d639013d6efe12iycd,51d80e233f7b6a7dfdee484a3c120f3b2ita8,4cb9717c8ad7e88a9a284989dd79b98dbevyi,0.0
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,a3d2de7675556553a5f08e4c88d2c22857ddh,2a96c3ce94b3be921e0296097b88b56a7x1ji,0.0


In [None]:
#Load bids.csv into bid_data and preview its first rows
bid_data = pd.read_csv("bids.csv")
bid_data.head()

In [None]:
#Load test.csv into test_data (no preview is displayed for this frame)
test_data = pd.read_csv("test.csv")

In [None]:
#Report the (rows, columns) shape of each loaded frame
for label, frame in (("Bids", bid_data), ("Train", train_data), ("Test", test_data)):
    print(f"{label} Dataset Shape: ", frame.shape)

In [None]:
#Setup S3 bucket and upload files
# A single SageMaker session supplies the default bucket and is kept as `sess`
# (previously a second, throwaway Session() was built only to read the bucket name).
sess = sagemaker.Session()
bucket = sess.default_bucket()
region = boto3.Session().region_name
s3_client = boto3.client("s3", region_name=region)
role = sagemaker.get_execution_role()

# All three CSV files are uploaded under the same S3 prefix.
s3_dataset_uri = f"s3://{bucket}/facebook-recruiting-iv-human-or-bot"
for local_csv in ("test.csv", "train.csv", "bids.csv"):
    S3Uploader.upload(local_csv, s3_dataset_uri)

# Athena Table Setup

In [None]:
from pyathena import connect

In [None]:
#Create Athena database and S3 staging directory
database_name = "bot_bids"
# Staging location inside the default bucket, handed to the Athena connection.
s3_staging_dir = f"s3://{bucket}/athena/staging"
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)
# The DDL is sent through pandas; the returned frame is displayed as the cell output.
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"
pd.read_sql(statement, conn)

In [None]:
#Verify database created successfully
statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)
# Look the database up by name instead of showing only the first row: the first
# entry of SHOW DATABASES is not necessarily the database created above.
# An empty result here means the database is missing.
df_show[df_show.iloc[:, 0] == database_name]

In [None]:
# SQL statement to execute
bid_table_name_csv = "bids"

# Athena treats every object under LOCATION as table data, and the shared dataset
# prefix also holds train.csv and test.csv. Give bids.csv a prefix of its own
# (server-side copy of the object uploaded earlier) so that only bid rows are
# read into the table.
# If the table already exists from an earlier run with the old location,
# drop it first: CREATE ... IF NOT EXISTS will not update its LOCATION.
dataset_prefix = "facebook-recruiting-iv-human-or-bot"
s3_client.copy(
    CopySource={"Bucket": bucket, "Key": f"{dataset_prefix}/bids.csv"},
    Bucket=bucket,
    Key=f"{dataset_prefix}/bids/bids.csv",
)
# Athena requires the trailing slash on a table LOCATION.
s3_data_path = f"s3://{bucket}/{dataset_prefix}/bids/"

# NOTE(review): `time` is declared as timestamp — confirm the raw values in bids.csv
# are in a format Athena parses as a timestamp; if they are plain integers the column
# will not load as intended and should be declared bigint instead.
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         bid_id int,
         bidder_id string,
         auction string,
         merchandise string,
         device string,
         time timestamp,
         country string,
         ip string,
         url string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, bid_table_name_csv, s3_data_path
)

# Show the final DDL so the substituted names and location can be checked by eye.
print(statement)

In [None]:
# Send the SQL currently held in `statement` to Athena through the open connection.
# `statement` is reassigned by several cells, so run this cell directly after the
# one that builds the CREATE EXTERNAL TABLE statement.
pd.read_sql(statement, conn)

In [None]:
#Verify the table is created
# List the tables registered in the database and display the first five rows.
statement = f"SHOW TABLES in {database_name}"
df_show = pd.read_sql(statement, conn)
df_show.head()

# Exploratory Data Analysis

In [None]:
#Count the missing values in every column of the bids data
bid_data.isna().sum()

In [None]:
# Fraction of rows whose country is missing (mean of the boolean null mask).
missing_percent = bid_data['country'].isnull().mean()
# The former format spec ": .2f" carried a space sign flag, which printed an extra
# blank in front of the number; ":.2f" prints the value directly after the label.
print(f"Percentage of missing data in country column: {missing_percent * 100:.2f}%")

In [None]:
#Check country distribution
# Number of bids per country; value_counts leaves missing values out by default.
unique_countries = bid_data['country'].value_counts()

fig, ax = plt.subplots()
ax.bar(unique_countries.index, unique_countries.values)
# Hide the x tick labels (one per country) while keeping the axis itself.
plt.setp(ax.get_xticklabels(), visible=False)
ax.set_title("Distribution of Bidder Countries")
ax.set_xlabel("Country")
ax.set_ylabel("Number of Bids")
plt.show()

In [None]:
#Since one country has dramatically more bids than the others, fill the NA values with the mode
# mode() returns a Series; its first entry is used as the fill value.
most_common_country = bid_data['country'].mode()[0]
bid_data['country'] = bid_data['country'].fillna(most_common_country)

In [None]:
#Check merchandise distribution
# Number of bids per merchandise category.
unique_products = bid_data['merchandise'].value_counts()

fig, ax = plt.subplots()
ax.bar(unique_products.index, unique_products.values)
ax.set_title("Distribution of Merchandise Being Bid On")
ax.set_xlabel("Merchandise Category")
ax.set_ylabel("Number of Bids")
# Rotate the category names to vertical.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
#View distribution of bot vs human bidders in the training dataset
# Counts the rows for each distinct value of the `outcome` label column.
#An uneven distribution indicates oversampling will be needed
train_data['outcome'].value_counts()

# Feature Store

In [None]:
#Setup Feature store
# One boto3 session, pinned to the notebook's region, backs both service clients.
boto_session = boto3.Session(region_name=region)
sagemaker_client, featurestore_runtime = (
    boto_session.client(service_name=service, region_name=region)
    for service in ("sagemaker", "sagemaker-featurestore-runtime")
)

# SageMaker session wired to the two clients created above.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

prefix = "sagemaker-featurestore-demo"
default_s3_bucket_name = feature_store_session.default_bucket()

print(default_s3_bucket_name)