In [1]:
import os

import pandas as pd
import psycopg2
import sqlalchemy
from tqdm import tqdm

# DSN (data source name) format for database connections:
# [protocol / database name]://[username]:[password]@[hostname / ip]:[port]/[database name here]
#
# Every setting can be overridden through an environment variable, so that
# credentials and host details do not have to be edited into the notebook.
# The fallbacks are the values this notebook originally hard-coded.

# user to connect as (on a local install, postgres has full administrative access)
db_user = os.environ.get('CMS_DB_USER', 'postgres')
# password for that user; leave empty if the server does not ask for one
db_password = os.environ.get('CMS_DB_PASSWORD', '')
# on your own computer, use localhost
db_host = os.environ.get('CMS_DB_HOST', 'localhost')
# the default port for postgres is 5432; kept as an int, as before
db_port = int(os.environ.get('CMS_DB_PORT', '5432'))
# name of the database the claims tables are loaded into
database = os.environ.get('CMS_DB_NAME', 'cms_claims')  # alternative: 'cms_medicare_claims'
# DSN for the postgres server itself, without specifying a database
conn_postgres = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}'

In [9]:
# DSN that targets `database` on the configured server
conn_str = f'postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database}'
# raw psycopg2 (DB-API) connection opened on that DSN
conn = psycopg2.connect(dsn=conn_str)

In [10]:
# SQLAlchemy engine on the same DSN; this is the object handed to pandas
# (read_sql / to_sql) in the cells below
engine = sqlalchemy.create_engine(conn_str)

### Defining file paths

In [11]:
# Root folder holding the downloaded CMS files. Set the CMS_DATA_DIR
# environment variable to point at a different location; otherwise the
# original hard-coded folder is used. os.path.join(..., '') guarantees the
# trailing separator that the `path + '...'` concatenations below rely on.
path = os.path.join(
    os.environ.get(
        'CMS_DATA_DIR',
        '/Users/abhi/Documents/Abhi/General Assembly/Immersive course/Capstone Project/CMS_Data/',
    ),
    '',
)

# Beneficiary summary files, one per year
Beneficiary_Summary_File_Sample_1_2008_file = path + 'Sample1/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv'
Beneficiary_Summary_File_Sample_1_2009_file = path + 'Sample1/DE1_0_2009_Beneficiary_Summary_File_Sample_1.csv'
Beneficiary_Summary_File_Sample_1_2010_file = path + 'Sample1/DE1_0_2010_Beneficiary_Summary_File_Sample_20.csv'
# TODO: check why number 20 for 2010
# NOTE(review): the 2010 file name ends in Sample_20 while the others end in
# Sample_1. It is left unchanged here - confirm it matches the file on disk.

# Inpatient claims file
Inpatient_Sample_1_file = path + 'Sample1/DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv'

# Outpatient claims file
Outpatient_Sample_1_file = path + 'Sample1/DE1_0_2008_to_2010_Outpatient_Claims_Sample_1.csv'

# Prescription drug events file
Prescription_events1_file = path + 'Sample1/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_1.csv'

# ICD-9 diagnosis and procedure lookup files
ICD9_Diagonsis_file = path + 'ICD-9/CMS32_DESC_LONG_SHORT_DX.xlsx'
ICD9_Procedure_file = path + 'ICD-9/CMS32_DESC_LONG_SHORT_SG.xlsx'

# HCPCS codes
HCPCS_file = path + 'HCPCS/HCPCS.xlsx'


### Reading files into DataFrames & converting date columns to datetime

In [12]:
# `datetime` is imported here and reused by the timing cells further down.
from datetime import datetime
# Wall-clock checkpoint taken before the data files are read.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 12, 33, 184073)

In [13]:
# Date columns of the beneficiary summary files; parsed below as YYYYMMDD.
BENEFICIARY_DATE_COLS = ['BENE_BIRTH_DT', 'BENE_DEATH_DT']


def _read_beneficiary_summary(csv_file):
    """Read one beneficiary summary CSV and parse its date columns.

    The date columns are read as text rather than left to type inference.
    A column containing blank cells would otherwise be inferred as float
    (e.g. 20080301.0), and newer pandas versions do not match such values
    against '%Y%m%d'; together with errors='coerce' every date in that
    column would then silently become NaT.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents with the BENEFICIARY_DATE_COLS columns converted
        to datetime; blank or unparseable dates are NaT.
    """
    frame = pd.read_csv(csv_file, dtype=dict.fromkeys(BENEFICIARY_DATE_COLS, str))
    for date_col in BENEFICIARY_DATE_COLS:
        frame[date_col] = pd.to_datetime(frame[date_col], format='%Y%m%d', errors='coerce')
    return frame


Beneficiary_2008DF = _read_beneficiary_summary(Beneficiary_Summary_File_Sample_1_2008_file)
Beneficiary_2009DF = _read_beneficiary_summary(Beneficiary_Summary_File_Sample_1_2009_file)
Beneficiary_2010DF = _read_beneficiary_summary(Beneficiary_Summary_File_Sample_1_2010_file)

In [14]:
# CLM_FROM_DT / CLM_THRU_DT are parsed below as YYYYMMDD. They are read as text
# because a column with blank cells would otherwise be inferred as float
# (e.g. 20080301.0), which newer pandas versions do not match against
# '%Y%m%d'; with errors='coerce' those dates would silently become NaT.
Inpatient_DF = pd.read_csv(
    Inpatient_Sample_1_file,
    dtype={'CLM_FROM_DT': str, 'CLM_THRU_DT': str},
)

# Converting to datetime format; blank or unparseable dates become NaT
Inpatient_DF['CLM_FROM_DT'] = pd.to_datetime(Inpatient_DF['CLM_FROM_DT'], format='%Y%m%d', errors='coerce')
Inpatient_DF['CLM_THRU_DT'] = pd.to_datetime(Inpatient_DF['CLM_THRU_DT'], format='%Y%m%d', errors='coerce')

In [15]:
# SRVC_DT is parsed below as YYYYMMDD. It is read as text so the parse does not
# depend on type inference: a column with blank cells would be inferred as
# float (e.g. 20080301.0), which newer pandas versions do not match against
# '%Y%m%d'; with errors='coerce' those dates would silently become NaT.
# NOTE(review): the saved output under this cell looks like the tail of a
# warning - confirm whether read_csv reported mixed-type columns here.
Prescription_eventsDF = pd.read_csv(Prescription_events1_file, dtype={'SRVC_DT': str})

# Converting to datetime format; blank or unparseable dates become NaT
Prescription_eventsDF['SRVC_DT'] = pd.to_datetime(Prescription_eventsDF['SRVC_DT'], format='%Y%m%d', errors='coerce')

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
# CLM_FROM_DT / CLM_THRU_DT are parsed below as YYYYMMDD. They are read as text
# because a column with blank cells would otherwise be inferred as float
# (e.g. 20080301.0), which newer pandas versions do not match against
# '%Y%m%d'; with errors='coerce' those dates would silently become NaT.
# NOTE(review): the saved output under this cell looks like the tail of a
# warning - confirm whether read_csv reported mixed-type columns here.
Outpatient_DF = pd.read_csv(
    Outpatient_Sample_1_file,
    dtype={'CLM_FROM_DT': str, 'CLM_THRU_DT': str},
)

# Converting to datetime format; blank or unparseable dates become NaT
Outpatient_DF['CLM_FROM_DT'] = pd.to_datetime(Outpatient_DF['CLM_FROM_DT'], format='%Y%m%d', errors='coerce')
Outpatient_DF['CLM_THRU_DT'] = pd.to_datetime(Outpatient_DF['CLM_THRU_DT'], format='%Y%m%d', errors='coerce')

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
# Load the two ICD-9 lookup workbooks (diagnosis first, then procedure)
ICD9_DiagonsisDF, ICD9_ProcedureDF = (
    pd.read_excel(workbook)
    for workbook in (ICD9_Diagonsis_file, ICD9_Procedure_file)
)

In [18]:
# Wall-clock checkpoint after the CSV and ICD-9 files are read, before the HCPCS workbook.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 12, 51, 612150)

In [19]:
# Load the HCPCS code workbook into a DataFrame.
HCPCS_DF = pd.read_excel(HCPCS_file)

In [20]:
#conn.commit()  

In [22]:
# Wall-clock checkpoint once all input files have been read.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 12, 52, 149226)

#### Checking tables in the database before creating new tables

In [23]:
# List the tables in the `public` schema before anything is loaded
pd.read_sql(
    sql="SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname='public'",
    con=engine,
)

Unnamed: 0,tablename


### Creating tables and loading DataFrames to the Database

In [24]:
# Wall-clock checkpoint before the first table is written to the database.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 12, 52, 231611)

In [25]:
# ICD-9 lookup tables: these only need to be loaded once.
# if_exists='replace' overwrites the table when the cell is run again.

ICD9_DiagonsisDF.to_sql(
    name='icd9_diagonsis',
    con=engine,
    if_exists='replace',
    index=False,
)
ICD9_ProcedureDF.to_sql(
    name='icd9_procedures',
    con=engine,
    if_exists='replace',
    index=False,
)

In [26]:
# HCPCS lookup table; replaced if it already exists
HCPCS_DF.to_sql(
    name='hcpcs',
    con=engine,
    if_exists='replace',
    index=False,
)

In [27]:
# Wall-clock checkpoint after the lookup tables are written, before the beneficiary tables.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 12, 55, 275667)

In [28]:
# One table per beneficiary summary year; each is replaced if it already exists
Beneficiary_2008DF.to_sql(
    name='beneficiary2008',
    con=engine,
    if_exists='replace',
    index=False,
)
Beneficiary_2009DF.to_sql(
    name='beneficiary2009',
    con=engine,
    if_exists='replace',
    index=False,
)
Beneficiary_2010DF.to_sql(
    name='beneficiary2010',
    con=engine,
    if_exists='replace',
    index=False,
)

In [29]:
# Wall-clock checkpoint after the beneficiary tables are written, before the inpatient claims.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 14, 51, 718984)

In [30]:
# Inpatient claims table; replaced if it already exists
Inpatient_DF.to_sql(
    name='inpatient_claims',
    con=engine,
    if_exists='replace',
    index=False,
)

In [31]:
# Wall-clock checkpoint after the inpatient claims are written, before the outpatient claims.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 15, 30, 632196)

In [32]:
# Fairly large file (165MB) - took about 7 mins in the recorded run.
# chunksize makes pandas write the rows in batches instead of all at once,
# which keeps the peak memory of the insert bounded for a frame this size.
Outpatient_DF.to_sql(
    name='outpatient_claims',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10_000,
)

In [33]:
# Wall-clock checkpoint after the outpatient claims are written, before the prescription drug events.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 22, 35, 302872)

In [34]:
# Caution: large file (0.5GB) - it took approx 16 mins for this file alone.
# chunksize makes pandas write the rows in batches instead of all at once,
# which keeps the peak memory of the insert bounded for a frame this size.
Prescription_eventsDF.to_sql(
    name='prescription_drug_events',
    con=engine,
    if_exists='replace',
    index=False,
    chunksize=10_000,
)

In [35]:
# Wall-clock checkpoint after the prescription drug events are written.
datetime.now()

datetime.datetime(2020, 5, 10, 17, 38, 30, 844834)

In [40]:
# Commit on the psycopg2 connection `conn`. The to_sql calls above were given
# `engine`, not `conn`.
# NOTE(review): nothing visible in this notebook writes through `conn`, so this
# commit may be a no-op - confirm before relying on it.
conn.commit()

#### Checking tables in the database after creating new tables

In [41]:
# List the tables in the `public` schema again, now that the loads have run
pd.read_sql(
    sql="SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname='public'",
    con=engine,
)

Unnamed: 0,tablename
0,icd9_diagonsis
1,icd9_procedures
2,hcpcs
3,beneficiary2008
4,beneficiary2009
5,beneficiary2010
6,inpatient_claims
7,outpatient_claims
8,prescription_drug_events
