In [2]:
#ETL tools
import psycopg2
import pygrametl
from pygrametl.datasources import SQLSource, CSVSource, PandasSource
from pygrametl.tables import Dimension, FactTable, CachedDimension, BulkFactTable

#stats tools
import pandas as pd
import pandas_profiling

# Ingest the CSV input file and normalise the data types of the key columns.
# The file's own header row is skipped (skiprows=1) and replaced by `names`.
filename = '../../../Applications/Assessment/loans.csv'
names = [
    'End of Period', 'loannumber', 'region', 'countrycode', 'country',
    'Borrower', 'Guarantor Country Code', 'Guarantor', 'Loan Type',
    'Loan Status', 'Interest Rate', 'Currency of Commitment',
    'projectidsrc', 'projectname',
    'orig_principal_amt', 'cancelled_amt', 'undisbursed_amt',
    'disbursed_amt', 'repaid_to_ibrd',
    'Due to IBRD', 'Exchange Adjustment', 'Borrower\'s Obligation',
    'Sold 3rd Party', 'Repaid 3rd Party', 'Due 3rd Party', 'Loans Held',
    'First Repayment Date', 'Last Repayment Date', 'Agreement Signing Date',
    'Board Approval Date', 'Effective Date', 'Closed Date',
    'Last Disbursement Date',
]
data = pd.read_csv(filename, names=names, skiprows=1)

# Source identifiers are treated as text, not numbers.
# NOTE(review): astype(str) turns missing values into the string 'nan'
# ('NAN' after upper-casing) -- confirm that is acceptable for these columns.
data['projectidsrc'] = data['projectidsrc'].astype(str)
# A Series has no .upper() method; element-wise string operations live on the
# .str accessor. Calling .upper() directly raises AttributeError.
data['countrycode'] = data['countrycode'].astype(str).str.upper()

# Disabled experiments (they reference `df`, which is not defined here):
# df['End of Period'] = pd.to_numeric(df['End of Period'])
# df['Due 3rd Party'] = pd.to_datetime(df['Due 3rd Party'])

ModuleNotFoundError: No module named 'pandas_profiling'

In [None]:
#Explore data
#peek = data.head(20)
#print(peek)
#print(data.shape)
#print(peek.shape)
#types = data.dtypes
#print(types)
#types = data.dtypes
#print(types)
#pd.set_option( 'display.width' , 100)
#pd.set_option( 'precision' , 3)
#description = data.describe()
#print(description)
#class_counts = data.groupby( 'projectname' ).size()
#print(class_counts)

In [None]:
# Profile the data: build a pandas_profiling report over the whole `data` frame.
# The report object is the last expression of the cell, so it is the cell's output.
pandas_profiling.ProfileReport(data)

In [None]:

# Columns that the warehouse load below never reads.
unused_columns = [
    'Borrower', 'Guarantor Country Code', 'Guarantor', 'Loan Type',
    'Loan Status', 'Interest Rate', 'Currency of Commitment',
    'Due to IBRD', 'Exchange Adjustment', 'Borrower\'s Obligation',
    'Sold 3rd Party', 'Repaid 3rd Party', 'Due 3rd Party', 'Loans Held',
    'First Repayment Date', 'Last Repayment Date', 'Agreement Signing Date',
    'Board Approval Date', 'Effective Date', 'Closed Date',
    'Last Disbursement Date',
]
# DataFrame.drop returns a new frame and leaves `data` untouched, so the result
# has to be captured; calling it as a bare statement discards the trimmed frame.
# A separate name is used so `data` keeps every column for the inspection cells.
data_trimmed = data.drop(columns=unused_columns)
dataPS = PandasSource(data_trimmed)

In [None]:
# Data warehouse connection: open a psycopg2 connection and wrap it for pygrametl.
# NOTE(review): the DSN below embeds the database user and password in the
# notebook; consider loading them from the environment before sharing this file.
dw_string = "host='localhost' dbname='dw1' user='dw' password='dwhouse'"
dw_pgconn = psycopg2.connect(dsn=dw_string)
dw_conn_wrapper = pygrametl.ConnectionWrapper(dw_pgconn)

#bulk insert method
def bulkloader(name, attributes, fieldsep, rowsep, nullval, filehandle):
    """Bulk-load a delimited file into a table via the cursor's ``copy_from``.

    Passed as the ``bulkloader`` callback of the ``BulkFactTable`` defined in
    this notebook.

    Args:
        name: target table name.
        attributes: column names, in the order the fields appear in the file.
        fieldsep: field separator used in the file.
        rowsep: row separator; accepted for the callback signature but not
            used, because ``copy_from`` is not given a row separator here.
        nullval: value that represents NULL in the file.
        filehandle: file-like object holding the rows to load.
    """
    cursor = dw_conn_wrapper.cursor()
    # Pass the caller's NULL marker, not the literal text "nullval".
    # str() is applied because the marker is compared against file text.
    cursor.copy_from(file=filehandle, table=name, sep=fieldsep,
                     null=str(nullval), columns=attributes)

In [None]:
# Dimension and fact table objects for the loan warehouse schema.
def _prefilled_dimension(name, key, attributes, lookupatts):
    """Build a CachedDimension with prefill=True for the given table layout."""
    return CachedDimension(
        name=name,
        key=key,
        attributes=attributes,
        lookupatts=lookupatts,
        prefill=True)


# One row per loan number.
loan_dimension = _prefilled_dimension(
    'loandim', 'loanid',
    attributes=['loannumber'],
    lookupatts=['loannumber'])

# End-of-period date, stored as separate day / month / year parts.
eop_dimension = _prefilled_dimension(
    'eopdim', 'eopid',
    attributes=['day', 'month', 'year'],
    lookupatts=['day', 'month', 'year'])

# Countries are looked up by name; the code is stored as an extra attribute.
country_dimension = _prefilled_dimension(
    'countrydim', 'countryid',
    attributes=['countrycode', 'country'],
    lookupatts=['country'])

region_dimension = _prefilled_dimension(
    'regiondim', 'regionid',
    attributes=['region'],
    lookupatts=['region'])

# Projects are looked up by their source-system id.
project_dimension = _prefilled_dimension(
    'projectdim', 'projectid',
    attributes=['projectidsrc', 'projectname'],
    lookupatts=['projectidsrc'])

# Fact rows reference all five dimensions and are loaded through `bulkloader`.
fact_table = BulkFactTable(
    name='loanfact',
    keyrefs=['loanid', 'eopid', 'countryid', 'regionid', 'projectid'],
    measures=[
        'orig_principal_amt',
        'cancelled_amt',
        'undisbursed_amt',
        'disbursed_amt',
        'repaid_to_ibrd',
    ],
    bulkloader=bulkloader,
    bulksize=1000000)


In [None]:
# Plain helper that breaks the end-of-period timestamp into date parts.
def split_timestamp(row):
    """Add 'year', 'month' and 'day' entries to ``row`` in place.

    The first ten characters of ``row['End of Period']`` are split on '-';
    the three pieces are stored, as strings, under 'year', 'month' and 'day'
    in that order. Nothing is returned.
    """
    date_parts = row['End of Period'][:10].split('-')
    for position, field in enumerate(('year', 'month', 'day')):
        row[field] = date_parts[position]


In [None]:
# Save data to the database: resolve every dimension key, then insert the fact.
# Each entry is (fact key column, dimension object, namemapping for ensure()).
key_sources = (
    ('loanid', loan_dimension, {}),
    ('eopid', eop_dimension, {}),
    ('countryid', country_dimension, {'countrycode': 'countrycode'}),
    ('regionid', region_dimension, {}),
    ('projectid', project_dimension, {}),
)

for record in dataPS:
    # Adds the day / month / year fields that eop_dimension looks up.
    split_timestamp(record)
    for key_name, dimension, mapping in key_sources:
        record[key_name] = dimension.ensure(record, namemapping=mapping)
    # Insert the fact row.
    fact_table.insert(record)

dw_conn_wrapper.commit()
dw_conn_wrapper.close()

In [None]:
# Show the column labels of `data` as the cell output.
data.columns

In [None]:
# Preview the first three rows. A bare last expression gets the notebook's
# rich table display, whereas print() would emit wrapped plain text.
data.head(3)

In [None]:
# Count the missing values in each column of `data`.
data.isnull().sum()