In [1]:
import io
import os
from io import StringIO

import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role


In [2]:
# SageMaker execution role and session handle for this notebook.
role = get_execution_role()
sess = sagemaker.Session()

In [3]:
# S3 locations: the full raw extracts plus the train / validation output prefixes.
prefix_full_data = 'rawdata/original_raw_files/full_data'
prefix_train = 'rawdata/original_raw_files/train'
prefix_valid = 'rawdata/original_raw_files/valid'
bucket = "humana-data"

conn = boto3.client('s3')

# A single list_objects call returns at most 1,000 keys and has no 'Contents'
# entry when nothing matches the prefix. Paginating with list_objects_v2 and
# defaulting to an empty list handles both cases.
paginator = conn.get_paginator('list_objects_v2')
contents = [
    s3_object
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix_full_data)
    for s3_object in page.get('Contents', [])
]

# Keep only the CSV objects under the full-data prefix.
rawfile_names = [s3_object['Key'] for s3_object in contents if s3_object['Key'].endswith('.csv')]
rawfile_names

['rawdata/original_raw_files/full_data/CMS.csv',
 'rawdata/original_raw_files/full_data/Condition.csv',
 'rawdata/original_raw_files/full_data/Credit.csv',
 'rawdata/original_raw_files/full_data/Demo.csv',
 'rawdata/original_raw_files/full_data/Lab.csv',
 'rawdata/original_raw_files/full_data/Medical Claims.csv',
 'rawdata/original_raw_files/full_data/Not sure.csv',
 'rawdata/original_raw_files/full_data/Others.csv',
 'rawdata/original_raw_files/full_data/Pharm.csv',
 'rawdata/original_raw_files/full_data/dependent.csv']

In [16]:
# Load the Condition table and use it to pick the train / validation
# person_id_syn sets that the other raw tables are filtered against.
RANDOM_STATE = 42  # fixed seed so the sampled split is the same after a kernel restart

response = conn.get_object(Bucket=bucket, Key='rawdata/original_raw_files/full_data/Condition.csv')
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status != 200:
    # Stop here rather than falling through to a NameError, or to a stale
    # condition_df left in the kernel by an earlier run.
    raise RuntimeError(f"Unsuccessful S3 get_object response. Status - {status}")
print(f"Successful S3 get_object response. Status - {status}")
condition_df = pd.read_csv(response.get("Body"))

condition_df.columns = condition_df.columns.str.lower()
condition_df = condition_df.set_index('person_id_syn')

# Training split: draw 70% of the rows, then keep 10,000 of those.
condition_train_df = condition_df.sample(frac=0.7, random_state=RANDOM_STATE)
condition_train_df = condition_train_df.sample(10000, random_state=RANDOM_STATE)
# Lookup from row position in the sampled frame to person_id_syn.
train_indexes = pd.DataFrame({
    'index_num': range(len(condition_train_df)),
    'person_id_syn': condition_train_df.index.to_numpy(),
})

# Validation split: 4,000 rows whose person_id_syn is not in the training split.
condition_valid_df = condition_df[~condition_df.index.isin(train_indexes['person_id_syn'])]
condition_valid_df = condition_valid_df.sample(4000, random_state=RANDOM_STATE)
valid_indexes = pd.DataFrame({
    'index_num': range(len(condition_valid_df)),
    'person_id_syn': condition_valid_df.index.to_numpy(),
})

# Upload both splits as CSV.
# NOTE(review): person_id_syn is the index at this point, so index=False leaves
# it out of the uploaded files; rows can only be tied back to a person through
# the row order recorded in train_indexes / valid_indexes. Left unchanged to
# keep the existing file layout -- confirm this is what downstream code expects.
s3_resource = boto3.resource('s3')
filename_condition = "condition.csv"

csv_buffer = StringIO()
condition_train_df.to_csv(csv_buffer, index=False)
fe_path = prefix_train + '/' + filename_condition
s3_resource.Object(bucket, fe_path).put(Body=csv_buffer.getvalue())

csv_buffer = StringIO()
condition_valid_df.to_csv(csv_buffer, index=False)
fe_path = prefix_valid + '/' + filename_condition
s3_resource.Object(bucket, fe_path).put(Body=csv_buffer.getvalue())

Successful S3 get_object response. Status - 200


{'ResponseMetadata': {'RequestId': 'MXR3MJ46VJRYS4KB',
  'HostId': '9I0kJQz/42i1tfrui7xzh+Y1GFR7sbLQHR/14BlLRmFIoFVIPWdYrbiSp7T6rnqid0wA1r/6O6E=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '9I0kJQz/42i1tfrui7xzh+Y1GFR7sbLQHR/14BlLRmFIoFVIPWdYrbiSp7T6rnqid0wA1r/6O6E=',
   'x-amz-request-id': 'MXR3MJ46VJRYS4KB',
   'date': 'Tue, 29 Nov 2022 20:07:15 GMT',
   'x-amz-version-id': 'nx2qigYqTl9DGuQbngoRdXD65xlPtvsh',
   'etag': '"9bb908fd62c9aa4ecd7b726b083f87ce"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"9bb908fd62c9aa4ecd7b726b083f87ce"',
 'VersionId': 'nx2qigYqTl9DGuQbngoRdXD65xlPtvsh'}

In [24]:
# Dependent
# Load dependent.csv and split it using the person_id_syn sets held in
# train_indexes / valid_indexes.
response = conn.get_object(Bucket=bucket, Key='rawdata/original_raw_files/full_data/dependent.csv')
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status != 200:
    # Stop here rather than falling through to a NameError, or to a stale
    # dependent_df left in the kernel by an earlier run.
    raise RuntimeError(f"Unsuccessful S3 get_object response. Status - {status}")
print(f"Successful S3 get_object response. Status - {status}")
dependent_df = pd.read_csv(response.get("Body"))

dependent_df.columns = dependent_df.columns.str.lower()
dependent_df = dependent_df.set_index('person_id_syn')

# Keep only the people selected for each split.
dependent_train_df = dependent_df[dependent_df.index.isin(train_indexes['person_id_syn'])]
dependent_valid_df = dependent_df[dependent_df.index.isin(valid_indexes['person_id_syn'])]

# Upload both splits as CSV.
# NOTE(review): person_id_syn is the index at this point, so index=False leaves
# it out of the uploaded files. The isin() filter also keeps the row order of
# dependent.csv, not the order recorded in train_indexes / valid_indexes, so
# rows in these files cannot be matched to a person by position. Left unchanged
# to keep the existing file layout -- confirm against downstream code.
s3_resource = boto3.resource('s3')
filename_dependent = "dependent.csv"

csv_buffer = StringIO()
dependent_train_df.to_csv(csv_buffer, index=False)
fe_path = prefix_train + '/' + filename_dependent
s3_resource.Object(bucket, fe_path).put(Body=csv_buffer.getvalue())

csv_buffer = StringIO()
dependent_valid_df.to_csv(csv_buffer, index=False)
fe_path = prefix_valid + '/' + filename_dependent
s3_resource.Object(bucket, fe_path).put(Body=csv_buffer.getvalue())

Successful S3 get_object response. Status - 200


{'ResponseMetadata': {'RequestId': 'QHPVQ612VYCQ50TN',
  'HostId': '8h9E/xVFpYJNWOEf+rgAggj7opln/YbWR9BfSx0x8FmHGQ31BfzZ0VbYTHYQedFywmD/2FsX1/s=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '8h9E/xVFpYJNWOEf+rgAggj7opln/YbWR9BfSx0x8FmHGQ31BfzZ0VbYTHYQedFywmD/2FsX1/s=',
   'x-amz-request-id': 'QHPVQ612VYCQ50TN',
   'date': 'Tue, 29 Nov 2022 20:10:33 GMT',
   'x-amz-version-id': '7wp2yNPIe9b8SyPNxkeaqlvhBOBzRKAd',
   'etag': '"609483fd6d4c52b1fc1eb4fe9e53ac59"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"609483fd6d4c52b1fc1eb4fe9e53ac59"',
 'VersionId': '7wp2yNPIe9b8SyPNxkeaqlvhBOBzRKAd'}

In [27]:
# Write the index_num / person_id_syn lookup frames for both splits to S3
# as gzip-compressed parquet.
filename = 'index_person_id_syn.parquet.gzip'
s3_train_url = f's3://{bucket}/{prefix_train}/{filename}'
s3_valid_url = f's3://{bucket}/{prefix_valid}/{filename}'

for split_indexes, split_url in ((train_indexes, s3_train_url), (valid_indexes, s3_valid_url)):
    split_indexes.to_parquet(split_url, compression='gzip')

In [7]:
# condition_valid_df.reset_index(drop=True)

In [28]:
# Credit
# Load Credit.csv and split it using the person_id_syn sets held in
# train_indexes / valid_indexes.
response = conn.get_object(Bucket=bucket, Key='rawdata/original_raw_files/full_data/Credit.csv')
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status != 200:
    # Stop here rather than falling through to a NameError, or to a stale
    # credit_df left in the kernel by an earlier run.
    raise RuntimeError(f"Unsuccessful S3 get_object response. Status - {status}")
print(f"Successful S3 get_object response. Status - {status}")
credit_df = pd.read_csv(response.get("Body"))

credit_df.columns = credit_df.columns.str.lower()
credit_df = credit_df.set_index('person_id_syn')

# Keep only the people selected for each split.
credit_train_df = credit_df[credit_df.index.isin(train_indexes['person_id_syn'])]
credit_valid_df = credit_df[credit_df.index.isin(valid_indexes['person_id_syn'])]

# Upload both splits as CSV.
# NOTE(review): person_id_syn is the index at this point, so index=False leaves
# it out of the uploaded files. The isin() filter also keeps the row order of
# Credit.csv, not the order recorded in train_indexes / valid_indexes, so rows
# in these files cannot be matched to a person by position. Left unchanged to
# keep the existing file layout -- confirm against downstream code.
s3_resource = boto3.resource('s3')
filename_credit = "credit.csv"

csv_buffer = StringIO()
credit_train_df.to_csv(csv_buffer, index=False)
fe_path = prefix_train + '/' + filename_credit
s3_resource.Object(bucket, fe_path).put(Body=csv_buffer.getvalue())

csv_buffer = StringIO()
credit_valid_df.to_csv(csv_buffer, index=False)
fe_path = prefix_valid + '/' + filename_credit
s3_resource.Object(bucket, fe_path).put(Body=csv_buffer.getvalue())



Successful S3 get_object response. Status - 200


{'ResponseMetadata': {'RequestId': '79H7KARMKZFSRPJJ',
  'HostId': 'OcKSIJx7XBBR2sMxjwfqHPWNH6764INmq0QKCYVjwvBTCMIqByZ5Fd3VG22/KxN50jjH9e3FQ1c=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'OcKSIJx7XBBR2sMxjwfqHPWNH6764INmq0QKCYVjwvBTCMIqByZ5Fd3VG22/KxN50jjH9e3FQ1c=',
   'x-amz-request-id': '79H7KARMKZFSRPJJ',
   'date': 'Tue, 29 Nov 2022 20:12:32 GMT',
   'x-amz-version-id': 'MFda9En29TskLn6BzfjYcUSFR3CgZnHR',
   'etag': '"7c67a0113e29e76e2b113ed3407b90a7"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"7c67a0113e29e76e2b113ed3407b90a7"',
 'VersionId': 'MFda9En29TskLn6BzfjYcUSFR3CgZnHR'}