#### Imports 

In [1]:
import pandas as pd
import sagemaker

#### Explore data

In [2]:
# Load the raw transcript dataset. The preview below shows a `specialty`
# label column and a free-text `transcription` column.
raw_csv_path = './data/medical_transcripts.csv'
df = pd.read_csv(raw_csv_path)
df.head(n=5)

Unnamed: 0,specialty,transcription
0,Allergy_Immunology,A 23-year-old white female presents with compl...
1,Bariatrics,Consult for laparoscopic gastric bypass. PAST ...
2,Bariatrics,Consult for laparoscopic gastric bypass. HISTO...
3,Cardiovascular_Pulmonary,"2-D M-Mode. Doppler. 2-D M-MODE: , ,1. Left a..."
4,Cardiovascular_Pulmonary,2-D Echocardiogram 1. The left ventricular ca...


In [3]:
# (rows, columns) of the raw dataset loaded above.
df.shape

(2799, 2)

In [4]:
# Peek at one full transcript by position. `.iloc` fetches the single value
# directly instead of first copying the entire column into a Python list.
df['transcription'].iloc[1500]

'Transesophageal echocardiogram due to vegetation and bacteremia. Normal left ventricular size and function.  Echodensity involving the aortic valve suggestive of endocarditis and vegetation. Doppler study as above most pronounced being moderate-to-severe aortic insufficiency. REASON FOR EXAM: , Vegetation and bacteremia.,PROCEDURE: , Transesophageal echocardiogram.,INTERPRETATION: , The procedure and its complications were explained to the patient in detail and formal consent was obtained.  The patient was brought to special procedure unit.  His throat was anesthetized with lidocaine spray.  Subsequently, 2 mg of IV Versed was given for sedation.  The patient was positioned.  Probe was introduced without any difficulty.  The patient tolerated the procedure very well.  Probe was taken out.  No complications were noted.  Findings are as mentioned below.,FINDINGS:,1.  Left ventricle has normal size and dimensions with normal function.  Ejection fraction of 60%.,2.  Left atrium and right-

In [5]:
# Distinct specialty labels present in the dataset.
set(df['specialty'])

{'Allergy_Immunology',
 'Bariatrics',
 'Cardiovascular_Pulmonary',
 'Dentistry',
 'General_Medicine',
 'Neurology',
 'Neurosurgery',
 'Obstetrics_Gynecology',
 'Office_Notes',
 'Ophthalmology',
 'Orthopedic',
 'Pain_Management',
 'Pediatrics_Neonatal',
 'Physical_Medicine_Rehab',
 'Podiatry',
 'Psychiatry_Psychology',
 'Radiology',
 'Rheumatology',
 'SOAP_Chart_Progress_Notes',
 'Sleep_Medicine',
 'Speech_Language',
 'Surgery',
 'Urology'}

#### Encode data to be JumpStart-ready

In [6]:
# Specialty name -> integer class id. Ids are assigned by position in the
# list below (0-based), so the order of this list defines the encoding.
label_map = {
    specialty: class_id
    for class_id, specialty in enumerate([
        'Allergy_Immunology',
        'Bariatrics',
        'Cardiovascular_Pulmonary',
        'Dentistry',
        'General_Medicine',
        'Neurology',
        'Neurosurgery',
        'Obstetrics_Gynecology',
        'Office_Notes',
        'Ophthalmology',
        'Orthopedic',
        'Pain_Management',
        'Pediatrics_Neonatal',
        'Physical_Medicine_Rehab',
        'Podiatry',
        'Psychiatry_Psychology',
        'Radiology',
        'Rheumatology',
        'SOAP_Chart_Progress_Notes',
        'Sleep_Medicine',
        'Speech_Language',
        'Surgery',
        'Urology',
    ])
}

In [7]:
# Integer class id -> specialty name, used to decode labels back to text.
# Ids are the 0-based positions in the list below.
inverse_label_map = dict(enumerate([
    'Allergy_Immunology',
    'Bariatrics',
    'Cardiovascular_Pulmonary',
    'Dentistry',
    'General_Medicine',
    'Neurology',
    'Neurosurgery',
    'Obstetrics_Gynecology',
    'Office_Notes',
    'Ophthalmology',
    'Orthopedic',
    'Pain_Management',
    'Pediatrics_Neonatal',
    'Physical_Medicine_Rehab',
    'Podiatry',
    'Psychiatry_Psychology',
    'Radiology',
    'Rheumatology',
    'SOAP_Chart_Progress_Notes',
    'Sleep_Medicine',
    'Speech_Language',
    'Surgery',
    'Urology',
]))

In [8]:
# Encode each specialty name as its integer class id from `label_map`.
# `Series.map` is used instead of `DataFrame.replace` so that a specialty
# missing from `label_map` surfaces as NaN and is rejected here, rather than
# passing through unchanged as a string into the exported training file.
specialty_codes = df['specialty'].map(label_map)

unmapped = sorted(set(df.loc[specialty_codes.isna(), 'specialty'].astype(str)))
assert not unmapped, f'Specialties missing from label_map: {unmapped}'

# Every row is mapped at this point, so the integer cast is safe. `assign`
# returns a new frame (same column order) and leaves the raw `df` untouched.
encoded_df = df.assign(specialty=specialty_codes.astype('int64'))
encoded_df.head()

Unnamed: 0,specialty,transcription
0,0,A 23-year-old white female presents with compl...
1,1,Consult for laparoscopic gastric bypass. PAST ...
2,1,Consult for laparoscopic gastric bypass. HISTO...
3,2,"2-D M-Mode. Doppler. 2-D M-MODE: , ,1. Left a..."
4,2,2-D Echocardiogram 1. The left ventricular ca...


In [9]:
# Persist the encoded frame as a bare CSV: no header row and no index
# column, so each record is just `<label id>,<transcript>`.
encoded_df.to_csv(
    './data/data.csv',
    index=False,
    header=False,
)

##### Check encoded CSV

In [10]:
# Read the exported file back to confirm it parses as (label, transcript).
# The file was written without a header row, so column names are supplied.
# NOTE: this rebinds `df`, which previously held the raw dataset.
df = pd.read_csv(
    './data/data.csv',
    header=None,
    names=['label', 'transcript'],
)
df.head(n=5)

Unnamed: 0,label,transcript
0,0,A 23-year-old white female presents with compl...
1,1,Consult for laparoscopic gastric bypass. PAST ...
2,1,Consult for laparoscopic gastric bypass. HISTO...
3,2,"2-D M-Mode. Doppler. 2-D M-MODE: , ,1. Left a..."
4,2,2-D Echocardiogram 1. The left ventricular ca...


In [11]:
# Decode the first row's integer label back to its specialty name.
first_label = df.loc[0, 'label']
inverse_label_map[first_label]

'Allergy_Immunology'

#### Copy dataset from local to S3

In [12]:
# Create a SageMaker session and look up the S3 bucket it uses by default.
# `default_bucket` is reused below as the destination of the dataset upload.
session = sagemaker.Session()
default_bucket = session.default_bucket()
print(f'Default S3 bucket = {default_bucket}')

Default S3 bucket = sagemaker-us-east-1-119174016168


Remove Jupyter checkpoint files, if any, from the data directory before uploading.

In [13]:
# Delete the `.ipynb_checkpoints` directory under ./data, if present
# (`-f` makes this a no-op when the directory does not exist).
!rm -rf ./data/.ipynb_checkpoints/

In [14]:
# Upload the encoded CSV to the default bucket under the `transcripts/` prefix.
# IPython substitutes the Python variable `default_bucket` into the command.
!aws s3 cp ./data/data.csv s3://{default_bucket}/transcripts/data.csv

upload: data/data.csv to s3://sagemaker-us-east-1-119174016168/transcripts/data.csv
