In [9]:
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
import boto3
import os
import io


In [2]:
# SageMaker session and execution role for this notebook; the two lookups are
# independent of each other.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

In [3]:
# S3 locations: the full raw files plus the train/valid prefixes the splits are written to.
prefix_full_data = 'rawdata/original_raw_files/full_data'
prefix_train = 'rawdata/original_raw_files/train'
prefix_valid = 'rawdata/original_raw_files/valid'
bucket = "humana-data"

conn = boto3.client('s3')

# Page through the listing: a single list call returns at most 1000 keys, and a
# page has no 'Contents' entry at all when the prefix matches nothing.
pages = conn.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix_full_data)
contents = [obj for page in pages for obj in page.get('Contents', [])]

# Keep only the CSV objects under the full-data prefix.
rawfile_names = [obj['Key'] for obj in contents if obj['Key'].endswith('.csv')]
rawfile_names

['rawdata/original_raw_files/full_data/CMS.csv',
 'rawdata/original_raw_files/full_data/Condition.csv',
 'rawdata/original_raw_files/full_data/Credit.csv',
 'rawdata/original_raw_files/full_data/Demo.csv',
 'rawdata/original_raw_files/full_data/Lab.csv',
 'rawdata/original_raw_files/full_data/Medical Claims.csv',
 'rawdata/original_raw_files/full_data/Not sure.csv',
 'rawdata/original_raw_files/full_data/Others.csv',
 'rawdata/original_raw_files/full_data/Pharm.csv',
 'rawdata/original_raw_files/full_data/dependent.csv']

In [4]:
# Load the Condition file, split its rows 70/30 into train/valid, record which
# person_id_syn values landed in each part, and upload both parts to S3 as CSV.
response = conn.get_object(Bucket=bucket, Key='rawdata/original_raw_files/full_data/Condition.csv')
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

# Stop here on a failed download instead of hitting a NameError on condition_df below.
if status != 200:
    raise RuntimeError(f"Unsuccessful S3 get_object response. Status - {status}")
print(f"Successful S3 get_object response. Status - {status}")
condition_df = pd.read_csv(response.get("Body"))

condition_df.columns = condition_df.columns.str.lower()
condition_df = condition_df.set_index('person_id_syn')

# Fixed seed so that re-running this cell reproduces the same train/valid split.
SPLIT_SEED = 42
condition_train_df = condition_df.sample(frac=0.7, random_state=SPLIT_SEED)

# train_indexes drives the split of every other raw file, so it is built first.
train_indexes = pd.DataFrame({
    'index_num': range(len(condition_train_df)),
    'person_id_syn': condition_train_df.index.to_numpy(),
})

# Validation rows are those whose person_id_syn does not appear in the training sample.
condition_valid_df = condition_df[~condition_df.index.isin(train_indexes['person_id_syn'])]
valid_indexes = pd.DataFrame({
    'index_num': range(len(condition_valid_df)),
    'person_id_syn': condition_valid_df.index.to_numpy(),
})

s3_resource = boto3.resource('s3')
filename_condition = "condition.csv"

# to_csv() with no target returns the CSV text, so each upload gets its own
# payload. Writing both frames into one shared StringIO would make the
# validation object contain the training rows followed by the validation rows.
# NOTE(review): index=False leaves person_id_syn (the index) out of the uploaded
# CSV -- confirm that is intended.
train_key = prefix_train + '/' + filename_condition
s3_resource.Object(bucket, train_key).put(Body=condition_train_df.to_csv(index=False))

valid_key = prefix_valid + '/' + filename_condition
s3_resource.Object(bucket, valid_key).put(Body=condition_valid_df.to_csv(index=False))

Successful S3 get_object response. Status - 200


{'ResponseMetadata': {'RequestId': 'H7K7EY7TYCBE7E4C',
  'HostId': '8yLAUapMwH0bW+NZZzV4Ot2jkaDFq/5PyrYPGunnQq6JYgG+vci0/clX7jAqncMvLOAlbNF+vf0=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '8yLAUapMwH0bW+NZZzV4Ot2jkaDFq/5PyrYPGunnQq6JYgG+vci0/clX7jAqncMvLOAlbNF+vf0=',
   'x-amz-request-id': 'H7K7EY7TYCBE7E4C',
   'date': 'Wed, 16 Nov 2022 19:06:29 GMT',
   'x-amz-version-id': 'Uyu5wifroddd2lMBaLeF4GEqQ8KRBJVF',
   'etag': '"25ad293d12eed88a5a0885ddfa6abf57"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"25ad293d12eed88a5a0885ddfa6abf57"',
 'VersionId': 'Uyu5wifroddd2lMBaLeF4GEqQ8KRBJVF'}

In [5]:
# Dependent
# Load the dependent file, split it with the person_id_syn values in
# train_indexes, and upload the train/valid parts to S3 as CSV.
response = conn.get_object(Bucket=bucket, Key='rawdata/original_raw_files/full_data/dependent.csv')
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

# Stop here on a failed download instead of hitting a NameError on dependent_df below.
if status != 200:
    raise RuntimeError(f"Unsuccessful S3 get_object response. Status - {status}")
print(f"Successful S3 get_object response. Status - {status}")
dependent_df = pd.read_csv(response.get("Body"))

dependent_df.columns = dependent_df.columns.str.lower()
dependent_df = dependent_df.set_index('person_id_syn')

# Rows whose id is in train_indexes go to train; every other row goes to valid.
in_train = dependent_df.index.isin(train_indexes['person_id_syn'])
dependent_train_df = dependent_df[in_train]
dependent_valid_df = dependent_df[~in_train]

s3_resource = boto3.resource('s3')
filename_dependent = "dependent.csv"

# to_csv() with no target returns the CSV text, so each upload gets its own
# payload. Writing both frames into one shared StringIO would make the
# validation object contain the training rows followed by the validation rows.
# NOTE(review): index=False leaves person_id_syn (the index) out of the uploaded
# CSV -- confirm that is intended.
train_key = prefix_train + '/' + filename_dependent
s3_resource.Object(bucket, train_key).put(Body=dependent_train_df.to_csv(index=False))

valid_key = prefix_valid + '/' + filename_dependent
s3_resource.Object(bucket, valid_key).put(Body=dependent_valid_df.to_csv(index=False))

Successful S3 get_object response. Status - 200


{'ResponseMetadata': {'RequestId': 'KV62H82P6C05D2CY',
  'HostId': 'EtEpe0XPLVsp3Yr/Fq5RSfEhm4YzAa5dZGOP1NfvWlMV4yx0umHy2y+dTEAv3KeKWYIN+2tBFnk=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'EtEpe0XPLVsp3Yr/Fq5RSfEhm4YzAa5dZGOP1NfvWlMV4yx0umHy2y+dTEAv3KeKWYIN+2tBFnk=',
   'x-amz-request-id': 'KV62H82P6C05D2CY',
   'date': 'Wed, 16 Nov 2022 19:06:30 GMT',
   'x-amz-version-id': 'ZY1_Fi7zTVzNgHENeU7pGpFGBXTSo6FF',
   'etag': '"bd6ae78448a98df27669c7834bfd8d85"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"bd6ae78448a98df27669c7834bfd8d85"',
 'VersionId': 'ZY1_Fi7zTVzNgHENeU7pGpFGBXTSo6FF'}

In [6]:
# Write the train and valid index tables to S3 as gzip-compressed parquet files.
filename = 'index_person_id_syn.parquet.gzip'
s3_train_url = f's3://{bucket}/{prefix_train}/{filename}'
s3_valid_url = f's3://{bucket}/{prefix_valid}/{filename}'

for index_frame, s3_url in ((train_indexes, s3_train_url), (valid_indexes, s3_valid_url)):
    index_frame.to_parquet(s3_url, compression='gzip')

In [7]:
# condition_valid_df.reset_index(drop=True)

In [8]:
# Credit
# Load the Credit file, split it with the person_id_syn values in
# train_indexes, and upload the train/valid parts to S3 as CSV.
response = conn.get_object(Bucket=bucket, Key='rawdata/original_raw_files/full_data/Credit.csv')
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

# Stop here on a failed download instead of hitting a NameError on credit_df below.
if status != 200:
    raise RuntimeError(f"Unsuccessful S3 get_object response. Status - {status}")
print(f"Successful S3 get_object response. Status - {status}")
credit_df = pd.read_csv(response.get("Body"))

credit_df.columns = credit_df.columns.str.lower()
credit_df = credit_df.set_index('person_id_syn')

# Rows whose id is in train_indexes go to train; every other row goes to valid.
in_train = credit_df.index.isin(train_indexes['person_id_syn'])
credit_train_df = credit_df[in_train]
credit_valid_df = credit_df[~in_train]

s3_resource = boto3.resource('s3')
filename_credit = "credit.csv"

# to_csv() with no target returns the CSV text, so each upload gets its own
# payload. Writing both frames into one shared StringIO would make the
# validation object contain the training rows followed by the validation rows.
# NOTE(review): index=False leaves person_id_syn (the index) out of the uploaded
# CSV -- confirm that is intended.
train_key = prefix_train + '/' + filename_credit
s3_resource.Object(bucket, train_key).put(Body=credit_train_df.to_csv(index=False))

valid_key = prefix_valid + '/' + filename_credit
s3_resource.Object(bucket, valid_key).put(Body=credit_valid_df.to_csv(index=False))



Successful S3 get_object response. Status - 200


{'ResponseMetadata': {'RequestId': 'D8NK9S63TCKZH2KD',
  'HostId': '1umOdeg3XvUgmlt/ZZp2I9+8MGGIEAGhemOmqx/sNznjDD+ROlU1UlYKm4TBhD+gZ6LQMq3YAbY=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '1umOdeg3XvUgmlt/ZZp2I9+8MGGIEAGhemOmqx/sNznjDD+ROlU1UlYKm4TBhD+gZ6LQMq3YAbY=',
   'x-amz-request-id': 'D8NK9S63TCKZH2KD',
   'date': 'Wed, 16 Nov 2022 19:06:44 GMT',
   'x-amz-version-id': '2ddeOMj.1HlO3EIshliV2A8lo_aeZ6SY',
   'etag': '"05365e306463a8d14f695461d04fbcb5"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"05365e306463a8d14f695461d04fbcb5"',
 'VersionId': '2ddeOMj.1HlO3EIshliV2A8lo_aeZ6SY'}