## This notebook loads the raw email-type train and test CSV files from S3, cleans them, and saves distilBERT-ready datasets back to S3

### Importing necessary packages and functions

In [8]:
# Execute the utility script (a .py file, not a notebook) in this kernel before running any other cell.
# NOTE(review): presumably this is what defines the helpers used below (retrieve_from_s3, save_to_s3,
# nan_values, email_type_to_int), since none of them are defined in this notebook -- confirm.
%run distilBERT_utility.py

### Reading train and test files, and setting the data prep mode

In [13]:
# Raw training CSV, fetched from S3.
train_df = retrieve_from_s3(
    bucket_name='sagemaker-sigparser-caylent-mlops',
    file_key='data/email-type/input/raw/sp_emailtype_training - apr18 - sp_emailtype_training - apr18.csv',
)

# Raw test CSV, fetched from the same bucket.
test_df = retrieve_from_s3(
    bucket_name='sagemaker-sigparser-caylent-mlops',
    file_key='data/email-type/input/raw/batch_2_retro - test-apr18 - batch_2_retro - test-apr18.csv',
)

# Selects which preparation cell below does any work: "train" or "test".
# Any other value makes both cells skip silently.
mode = "test"

### Data Preparation for distilBERT

In [9]:
if mode == "train":
    print("Shape of train dataframe: ", train_df.shape)
    print("Value count of Email Type values in train dataset: ", train_df['Type'].value_counts())

    nan_values(train_df)

    # Build the training frame as a NEW object rather than overwriting train_df['Type'] and
    # adding train_df['combined'] in place. Overwriting 'Type' made this cell non-idempotent:
    # a second run would pass the already-converted values back through email_type_to_int.
    # With assign(), train_df keeps its raw columns, so the cell can be re-run safely and
    # always produces the same final_df.
    #
    # 'Type'     -> label converted by email_type_to_int
    # 'combined' -> "<Email Address>, <Email Name>, <Email Display Name>"
    # NOTE(review): a missing value in any of the three email columns makes 'combined'
    # missing for that row (string + NaN is NaN); whether nan_values() already handles
    # this is not visible from this notebook -- confirm.
    final_df = train_df.assign(
        Type=train_df['Type'].apply(email_type_to_int),
        combined=train_df['Email Address'] + ', ' + train_df['Email Name'] + ', ' + train_df['Email Display Name'],
    )[['Type', 'combined']]

    # Saving train data csv file to S3
    save_to_s3(
        final_df,
        bucket_name='sagemaker-sigparser-caylent-mlops',
        file_key='data/email-type/input/processed/distilbert/distilbert-train/distilbert-fine_tuning-23-04-2024/data.csv',
        mode="train",
    )

Shape of train dataframe:  (62509, 5)
Value count of Email Type values in train dataset:  Type
Person        39304
Non-Person    23204
Name: count, dtype: int64


In [15]:
if mode == "test":
    print("Shape of test dataframe: ", test_df.shape)

    nan_values(test_df)

    # Read the raw labels without overwriting test_df['Email Type']. The previous in-place
    # overwrite made this cell non-idempotent: on a second run every 'LABEL_0' no longer
    # equals 'Person' and was rewritten to 'LABEL_1', silently corrupting the ground truth.
    raw_types = test_df['Email Type']

    # Anything that is not exactly 'Person' (missing values, typos, different casing) is
    # labelled 'LABEL_1' by the mapping below. Surface how many such rows exist instead of
    # mislabelling them silently.
    # NOTE(review): 'Non-Person' is the other label shown in the training data's value
    # counts; presumably the test file uses the same spelling -- confirm.
    unrecognised = ~raw_types.isin(['Person', 'Non-Person'])
    if unrecognised.any():
        print("Warning: number of 'Email Type' values that are neither 'Person' nor 'Non-Person' (labelled 'LABEL_1'): ", int(unrecognised.sum()))

    # Ground truth uses the string labels 'LABEL_0' (Person) and 'LABEL_1' (everything else),
    # matching the label names the model emits in its outputs.
    gt_df_distilbert = raw_types.apply(lambda x: 'LABEL_0' if x == 'Person' else 'LABEL_1').to_frame(name='gt')

    # Model input: a single 'combined' column of
    # "<Email>, <Email Name>, <Latest Email Display Name>".
    test_df_distilbert = (
        test_df['Email'] + ', ' + test_df['Email Name'] + ', ' + test_df['Latest Email Display Name']
    ).to_frame(name='combined')

    save_to_s3(
        test_df_distilbert,
        bucket_name='sagemaker-sigparser-caylent-mlops',
        file_key='data/email-type/input/processed/distilbert/distilbert-test/distilbert-fine_tuning-23-04-2024/test_data.csv',
        mode="test",
    )
    save_to_s3(
        gt_df_distilbert,
        bucket_name='sagemaker-sigparser-caylent-mlops',
        file_key='data/email-type/input/processed/distilbert/distilbert-test/distilbert-fine_tuning-23-04-2024/distilbert_test_gt.csv',
        mode="test",
    )

Shape of test dataframe:  (53646, 4)
