### Importing necessary packages and importing the dataset

In [2]:
#Importing necessary packages
import datetime
import sys

import boto3
import numpy as np
import pandas as pd
import regex as re

sys.path.append('..')
from utils.s3_helper import read_s3_csv_to_dataframe, upload_dataframe_to_s3, upload_data_to_s3
pd.options.display.max_colwidth = 500

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [3]:
# Snapshot date of the input file (DD-MM-YYYY); used as the prefix of the output CSV names.
data_timestamp = '28-03-2024'

# S3 location of the raw input data: bucket, then the object key inside it.
bucket_name = 'sagemaker-sigparser-caylent-mlops'
s3_file_key = 'data/email-type/input/raw/sp_llm_emailtype-mar28.csv'

### Cleaning dataset given on Mar 27

In [4]:
# Load the raw CSV from S3 through the project helper.
# NOTE(review): presumably returns a pandas DataFrame (the following cells call
# .head()/.rename() on it) - confirm against utils.s3_helper.
df = read_s3_csv_to_dataframe(bucket_name, s3_file_key)

In [5]:
# Preview the first rows of the loaded frame to check column names and sample values.
df.head()

Unnamed: 0,LLM Group,Email Address,Email Name,Email Display Name,Type
0,TRAINING,\t-ng@nationalgypsum.com,\t-ng,\t- NG EMAIL,Non-Person
1,TRAINING,\t+12134588429.30119168@resources.lync.com,12134588429,\t+12134588429 30119168,Non-Person
2,TRAINING,\t+12134588429.61498480@resources.lync.com,12134588430,\t+12134588429 61498480,Non-Person
3,TRAINING,\t+146238799022001@voicemail.com,146238799022001,146238799022001,Non-Person
4,TRAINING,!badlandsroom@acuitybrands.com,!badlandsroom,!JLS-Badlands Room,Non-Person


In [6]:
# Baseline data: the dataset as loaded, including records whose display names
# are non-English while the email addresses are in English.
# Column names are aligned with the previous dataset's naming.
column_renames = {
    'Email Name': 'Email Address Name',
    'Email Display Name': 'Email Address Display Name',
    'Type': 'Email Type',
}
df = df.rename(columns=column_renames)

In [7]:
# Preview the frame again to confirm the renamed column headers.
df.head()

Unnamed: 0,LLM Group,Email Address,Email Address Name,Email Address Display Name,Email Type
0,TRAINING,\t-ng@nationalgypsum.com,\t-ng,\t- NG EMAIL,Non-Person
1,TRAINING,\t+12134588429.30119168@resources.lync.com,12134588429,\t+12134588429 30119168,Non-Person
2,TRAINING,\t+12134588429.61498480@resources.lync.com,12134588430,\t+12134588429 61498480,Non-Person
3,TRAINING,\t+146238799022001@voicemail.com,146238799022001,146238799022001,Non-Person
4,TRAINING,!badlandsroom@acuitybrands.com,!badlandsroom,!JLS-Badlands Room,Non-Person


In [8]:
# Split the records into the train and test (holdout) sets using the
# 'LLM Group' label carried by each row.
llm_group = df['LLM Group']
holdout_df = df.loc[llm_group.eq('HOLDOUT')]
train_df = df.loc[llm_group.eq('TRAINING')]

In [9]:
# The split label has served its purpose; remove it from both splits.
holdout_df = holdout_df.drop(columns=['LLM Group'])
train_df = train_df.drop(columns=['LLM Group'])

In [10]:
# Save the train and test splits as separate local CSV files, prefixed with
# the data timestamp.
# The HOLDOUT rows are the test split: no `test_df` is defined anywhere in this
# notebook, so the test file is written from `holdout_df`.
train_df.to_csv(f'{data_timestamp}_train.csv', index=False)
holdout_df.to_csv(f'{data_timestamp}_test.csv', index=False)

In [12]:
# Report the 'Email Type' class distribution of each split.
for label, split_df in (('Holdout df: ', holdout_df), ('Training df: ', train_df)):
    print(label, split_df['Email Type'].value_counts().to_dict())

Holdout df:  {'Non-Person': 1519, 'Person': 810}
Training df:  {'Non-Person': 5770, 'Person': 2434}


In [15]:
#Dataset 2: Records where neither the email address name nor the display name contains any English letter are to be removed; the filter below collects those records into filtered_df
def contains_only_non_english(s):
    """Return True when `s` contains no English (ASCII a-z / A-Z) letter.

    Digits, punctuation, whitespace and non-Latin scripts all count as
    "non English", so an empty string also yields True.

    Args:
        s: A cell value. Normally a string; missing values (None / NaN / pd.NA)
            and other non-string scalars are accepted as well.

    Returns:
        bool: True if no ASCII letter is present, False otherwise.
    """
    if not isinstance(s, str):
        # A missing cell holds no letters at all. Checking it explicitly also
        # avoids the TypeError that re.search raises on a float NaN.
        if pd.isna(s):
            return True
        # Other scalars (e.g. numbers) are inspected through their text form.
        s = str(s)
    return re.search(r'[a-zA-Z]', s) is None

# Keep only the rows where BOTH the address name and the display name are
# free of English letters.
name_has_no_english = df['Email Address Name'].apply(contains_only_non_english)
display_has_no_english = df['Email Address Display Name'].apply(contains_only_non_english)
filtered_df = df[name_has_no_english & display_has_no_english]

In [16]:
# Display the records selected by the no-English-letters filter.
filtered_df

Unnamed: 0,LLM Group,Email Address,Email Address Name,Email Address Display Name,Email Type
1,TRAINING,\t+12134588429.30119168@resources.lync.com,12134588429,\t+12134588429 30119168,Non-Person
2,TRAINING,\t+12134588429.61498480@resources.lync.com,12134588430,\t+12134588429 61498480,Non-Person
3,TRAINING,\t+146238799022001@voicemail.com,146238799022001,146238799022001,Non-Person
10,TRAINING,001051@mfrm.com,1051,1051,Non-Person
26,TRAINING,10-0102@csusa.us,10-0102,10-0102,Non-Person
27,TRAINING,10-0103@csusa.us,10-0103,10-0103,Non-Person
50,TRAINING,13325.01@harrisgroup.com,13325.01,13325.01,Non-Person
57,TRAINING,163596@united.com,163596,163596,Non-Person
60,TRAINING,17135567099@send.myfax.com,17135567099,17135567099,Non-Person
67,HOLDOUT,182685@mcpsmd.net,182685,182685,Non-Person


### Upload Data Splits to S3

In [None]:
# Save the data splits to local CSV files, prefixed with the data timestamp,
# so they can be uploaded to S3.
# The HOLDOUT rows are the test split: no `test_df` is defined anywhere in this
# notebook, so the test file is written from `holdout_df`.
train_df.to_csv(f'{data_timestamp}_train.csv', index=False)
holdout_df.to_csv(f'{data_timestamp}_test.csv', index=False)

In [None]:
# Shared S3 client used by upload_to_s3 below.
s3_client = boto3.client('s3')

def upload_to_s3(file_name, bucket, object_name=None):
    """Upload a local file to an S3 bucket.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        object_name: Destination object key. Defaults to `file_name` when omitted.

    Returns:
        Whatever `s3_client.upload_file` returns (boto3 documents this as None);
        failures surface as exceptions raised by boto3.
    """
    if object_name is None:
        # No explicit key given: store the object under the local file name.
        object_name = file_name

    return s3_client.upload_file(file_name, bucket, object_name)

In [None]:
# Upload the train and test CSV files to S3 under the processed-data prefix.
s3_output_prefix = 'data/email-type/input/processed'

# No `sess` object is created anywhere in this notebook, so the destination is
# the bucket configured in `bucket_name`.
# NOTE(review): confirm that `bucket_name` (rather than a SageMaker session's
# default bucket) is the intended destination.
for split_name in ('train', 'test'):
    csv_name = f'{data_timestamp}_{split_name}.csv'
    upload_to_s3(csv_name, bucket_name, f'{s3_output_prefix}/{csv_name}')