### Email Names: Data Preprocess

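This notebook loads the raw email-name data from S3, replaces missing values with empty strings, and splits the rows into train and test sets using the `Test Group` column. Both splits are saved as CSV files and uploaded back to S3.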

### Setup

In [2]:
import os

import boto3
import sagemaker
# initial session; it is only consulted when no bucket name is configured below
sess = sagemaker.Session()

# bucket holding this project's data (set to None to use the session's default bucket)
sagemaker_session_bucket = 'sagemaker-sigparser-caylent-mlops'
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# resolve the execution role; when get_execution_role() raises ValueError,
# look the role up in IAM by its name instead
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# recreate the session with the chosen bucket as its default bucket
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# report the resolved role, region and bucket
print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")
print(f"sagemaker default bucket: {sess.default_bucket()}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::818442660361:role/service-role/AmazonSageMaker-ExecutionRole-20231103T203000
sagemaker session region: us-east-1
sagemaker default bucket: sagemaker-sigparser-caylent-mlops


### Load Data

Load the raw data file from S3 into a pandas DataFrame and preview the first rows.

In [3]:
# key prefix and file name of the raw input file inside the session's default bucket
s3_input_prefix = 'data/email-names/input/raw'
file_name = 'sp_llm_emailname-mar26.csv'

# date tag for this data snapshot; later cells prefix the output CSV names with it
data_timestamp = '2024-03-26'

# full S3 URI of the raw CSV: s3://<default bucket>/<prefix>/<file name>
s3_data_path = f"s3://{sess.default_bucket()}/{s3_input_prefix}/{file_name}"

In [5]:
import pandas as pd

# read the raw CSV from its S3 URI into a DataFrame
# NOTE(review): read_csv's default NA handling also converts literal values such
# as "NA" or "null" to NaN; confirm no genuine name token in this data is affected
data_df = pd.read_csv(s3_data_path)
# preview the first rows
data_df.head()

Unnamed: 0,Test Group,Email Address,Display Name,First Name,Middle Name,Last Name,Name Prefix,Name Suffix
0,HOLDOUT,a_anderson@allianca.com,Alex Anderson,Alex,,Anderson,,
1,TRAIN,a_anderson@alliant.com,Alex - Alliant Insurance Ltd. Anderson,Alex,,Anderson,,
2,HOLDOUT,a_bell@carpenterfarraday.com,ANNA L. BELL,Anna,L.,Bell,,
3,TRAIN,a_bodnar@orquest.com,"BODNAR, Akshay",Akshay,,Bodnar,,
4,HOLDOUT,a_brown@onesteamboatplace.com,BROWN Adam,Adam,,Brown,,


### Data Preprocess

#### NaN Values

In [6]:
# tally the missing (NaN) entries of every column
missing_mask = data_df.isna()
nan_count_per_column = missing_mask.sum()

print(f"NaN count per column: {nan_count_per_column}")

NaN count per column: Test Group           0
Email Address        0
Display Name         0
First Name           0
Middle Name       8803
Last Name            0
Name Prefix      10070
Name Suffix      10048
dtype: int64


#### Replace NaN with Empty String

In [7]:
# replace every NaN with an empty string; the result is assigned back rather
# than using fillna(..., inplace=True), so the frame object that earlier cells
# displayed is not mutated in place
data_df = data_df.fillna("")

# confirm that no missing values remain
print(f"NaN count per column after replacement: {data_df.isna().sum()}")

NaN count per column after replacement: Test Group       0
Email Address    0
Display Name     0
First Name       0
Middle Name      0
Last Name        0
Name Prefix      0
Name Suffix      0
dtype: int64


### Train / Test Split

In [8]:
# distinct labels found in the split column
test_group_series = data_df['Test Group']
test_group_series.unique()

array(['HOLDOUT', 'TRAIN'], dtype=object)

In [9]:
# splitting the data into train and test sets
train_df = data_df[data_df['Test Group'] == 'TRAIN']
test_df = data_df[data_df['Test Group'] == 'HOLDOUT']

In [10]:
# report (rows, columns) of each split, one line per split
print(
    f"train data shape: {train_df.shape}",
    f"test data shape: {test_df.shape}",
    sep="\n",
)

train data shape: (8038, 8)
test data shape: (2100, 8)


### Upload Data Splits to S3

In [11]:
# saving data splits to CSV files with timestamps
output_dir = '../data'
train_df.to_csv(f'{output_dir}/{data_timestamp}_train.csv', index=False)
test_df.to_csv(f'{output_dir}/{data_timestamp}_test.csv', index=False)

In [12]:
# method to upload data files to S3
# S3 client used by the upload helper defined in this cell
s3_client = boto3.client('s3')

def upload_to_s3(file_name, bucket, object_name=None):
    """Upload a local file to an S3 bucket.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Destination object key. When omitted, the base name of
            ``file_name`` is used, so local directory components (for example
            a leading ``../data/``) do not end up in the key.

    Returns:
        The value returned by ``s3_client.upload_file``, passed through
        unchanged. Any exception raised by the upload propagates to the caller.
    """
    if object_name is None:
        # default to the bare file name rather than the full local path
        object_name = os.path.basename(file_name)

    return s3_client.upload_file(file_name, bucket, object_name)

In [13]:
# upload the train and test CSV files to S3
s3_output_prefix = 'data/email-names/input/processed'

upload_to_s3(f'{output_dir}/{data_timestamp}_train.csv', sess.default_bucket(), f'{s3_output_prefix}/{data_timestamp}_train.csv')
upload_to_s3(f'{output_dir}/{data_timestamp}_test.csv', sess.default_bucket(), f'{s3_output_prefix}/{data_timestamp}_test.csv')