In [1]:
import os

import pandas as pd
import psycopg2
import pygrametl
from pygrametl.datasources import SQLSource, CSVSource, PandasSource
from pygrametl.tables import Dimension, FactTable

# Location of the loans extract, relative to the notebook.
filename = '../../Applications/Assessment/loans.csv'

# Column labels applied to the CSV; the file's own header row is skipped below.
names = [
    'End of Period', 'loannumber', 'region', 'countrycode', 'country',
    'Borrower', 'Guarantor Country Code', 'Guarantor', 'Loan Type',
    'Loan Status', 'Interest Rate', 'Currency of Commitment',
    'projectidsrc', 'projectname',
    'orig_principal_amt', 'cancelled_amt', 'undisbursed_amt',
    'disbursed_amt', 'repaid_to_ibrd',
    'Due to IBRD', 'Exchange Adjustment', 'Borrower\'s Obligation',
    'Sold 3rd Party', 'Repaid 3rd Party', 'Due 3rd Party', 'Loans Held',
    'First Repayment Date', 'Last Repayment Date', 'Agreement Signing Date',
    'Board Approval Date', 'Effective Date', 'Closed Date',
    'Last Disbursement Date',
]

# Parse the identifier columns as text at read time. Letting pandas infer
# their type chunk by chunk and stringifying afterwards can rewrite codes
# that look numeric (e.g. losing leading zeros or gaining a ".0").
# NOTE(review): presumably this also silences the mixed-type warning the
# original read emitted -- confirm on the real file.
id_dtypes = {'projectidsrc': str, 'countrycode': str}
data = pd.read_csv(filename, names=names, skiprows=1, dtype=id_dtypes)

# Missing values are still NaN after the read; astype(str) turns them into
# the literal text 'nan', exactly as before, so every value is a str.
data['projectidsrc'] = data['projectidsrc'].astype(str)
data['countrycode'] = data['countrycode'].astype(str)

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Preview the first 20 rows of the loaded frame as plain text.
peek = data.head(20)
print(peek)

              End of Period loannumber                   region countrycode  \
0   2011-04-30T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
1   2011-05-31T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
2   2011-06-30T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
3   2011-07-31T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
4   2011-08-31T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
5   2011-09-30T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
6   2011-10-31T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
7   2011-11-30T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
8   2011-12-31T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
9   2012-01-31T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
10  2012-02-29T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR   
11  2012-03-31T00:00:00.000  IBRD00010  EUROPE AND C

In [3]:
# Show the (rows, columns) shape of the full frame.
print(data.shape)

(854942, 33)


In [4]:
# Show the (rows, columns) shape of the `peek` preview frame.
print(peek.shape)

(20, 33)


In [5]:
# List the dtype pandas assigned to each column.
types = data.dtypes
print(types)

End of Period              object
loannumber                 object
region                     object
countrycode                object
country                    object
Borrower                   object
Guarantor Country Code     object
Guarantor                  object
Loan Type                  object
Loan Status                object
Interest Rate             float64
Currency of Commitment    float64
projectidsrc               object
projectname                object
orig_principal_amt        float64
cancelled_amt             float64
undisbursed_amt           float64
disbursed_amt             float64
repaid_to_ibrd            float64
Due to IBRD               float64
Exchange Adjustment       float64
Borrower's Obligation     float64
Sold 3rd Party            float64
Repaid 3rd Party          float64
Due 3rd Party             float64
Loans Held                float64
First Repayment Date       object
Last Repayment Date        object
Agreement Signing Date     object
Board Approval

In [6]:
# Display settings for the summary below: wrap printed frames at 100
# characters and show floats with 3 digits of precision.
pd.set_option('display.width', 100)
# Use the fully qualified key: the bare 'precision' pattern is ambiguous on
# pandas versions that also define 'styler.format.precision'.
pd.set_option('display.precision', 3)

# Summary statistics for the numeric columns.
description = data.describe()
print(description)


       Interest Rate  Currency of Commitment  orig_principal_amt  cancelled_amt  undisbursed_amt  \
count     827914.000                     0.0           8.549e+05      8.549e+05        8.549e+05   
mean           4.706                     NaN           7.724e+07      9.715e+06        7.680e+06   
std            3.384                     NaN           1.491e+08      4.478e+07        5.392e+07   
min            0.000                     NaN           0.000e+00      0.000e+00       -1.000e-01   
25%            0.830                     NaN           1.000e+07      0.000e+00        0.000e+00   
50%            5.580                     NaN           3.000e+07      2.552e+02        0.000e+00   
75%            7.350                     NaN           8.522e+07      3.518e+06        0.000e+00   
max           17.000                     NaN           3.750e+09      1.995e+09        3.379e+09   

       disbursed_amt  repaid_to_ibrd  Due to IBRD  Exchange Adjustment  Borrower's Obligation  \
co

In [7]:
# Count how many rows fall under each distinct `projectname` value.
class_counts = data.groupby( 'projectname' ).size()
print(class_counts)

projectname
  COAL IAP                                   66
 AR 2nd Norte Grande Water Infrastr          32
 BR Manaus Service Delivery & Fisca          10
 BR Manaus Service Delivery & Fiscal Mgm      8
 CN-Beijing Rooftop Solar PV Scale-          36
                                           ... 
Zhuzhou Brownfield Remediation Proj           7
Zhuzhou Brownfield Remediation Project       17
Zimbabwe:FOREST RESOURCE MGT&                83
eGabon                                       21
fOC TRNG SCTR                               249
Length: 6346, dtype: int64


In [8]:
# Columns that none of the dimensions or the fact table read.
unused_columns = [
    'Borrower', 'Guarantor Country Code', 'Guarantor', 'Loan Type',
    'Loan Status', 'Interest Rate', 'Currency of Commitment',
    'Due to IBRD', 'Exchange Adjustment', 'Borrower\'s Obligation',
    'Sold 3rd Party', 'Repaid 3rd Party', 'Due 3rd Party', 'Loans Held',
    'First Repayment Date', 'Last Repayment Date', 'Agreement Signing Date',
    'Board Approval Date', 'Effective Date', 'Closed Date',
    'Last Disbursement Date',
]

# DataFrame.drop returns a new frame rather than modifying `data`, so the
# result must be bound to a name for the drop to have any effect. `data`
# itself is left untouched for the inspection cells.
data_dw = data.drop(columns=unused_columns)

# Row source for the load: iterates over the trimmed frame.
dataPS = PandasSource(data_dw)

In [9]:
# Data-warehouse connection.
# Each setting can be overridden through an environment variable so real
# credentials need not live in the notebook; the fallbacks reproduce the
# original local-development values.
dw_settings = {
    'host': os.environ.get('DW_HOST', 'localhost'),
    'dbname': os.environ.get('DW_NAME', 'dw'),
    'user': os.environ.get('DW_USER', 'dw'),
    'password': os.environ.get('DW_PASSWORD', 'dwhouse'),
}
# make_dsn builds the connection string and quotes values as needed, which
# hand-written quoting would get wrong for values containing quotes/spaces.
dw_string = psycopg2.extensions.make_dsn(**dw_settings)
dw_pgconn = psycopg2.connect(dw_string)
# Wrap the raw connection for use by pygrametl.
dw_conn_wrapper = pygrametl.ConnectionWrapper(connection=dw_pgconn)

In [10]:
# Warehouse table handles: five dimensions plus the fact table that
# references them.

# Loan dimension, described by the loan number.
loan_dimension = Dimension(name='loandim', key='loanid', attributes=['loannumber'])

# End-of-period dimension, described by day / month / year.
eop_dimension = Dimension(name='eopdim', key='eopid', attributes=['day', 'month', 'year'])

# Country dimension, described by code and name.
country_dimension = Dimension(
    name='countrydim', key='countryid', attributes=['countrycode', 'country']
)

# Region dimension, described by the region name.
region_dimension = Dimension(name='regiondim', key='regionid', attributes=['region'])

# Project dimension; members are looked up by the source project id only.
project_dimension = Dimension(
    name='projectdim',
    key='projectid',
    attributes=['projectidsrc', 'projectname'],
    lookupatts=['projectidsrc'],
)

# Fact table: one key reference per dimension above plus the loan amounts.
fact_table = FactTable(
    name='loanfact',
    keyrefs=['loanid', 'eopid', 'countryid', 'regionid', 'projectid'],
    measures=[
        'orig_principal_amt',
        'cancelled_amt',
        'undisbursed_amt',
        'disbursed_amt',
        'repaid_to_ibrd',
    ],
)


In [11]:
# Plain helper that breaks the period timestamp into date parts
def split_timestamp(row):
    """Add 'year', 'month' and 'day' entries to ``row`` in place.

    The first 10 characters of ``row['End of Period']`` are split on '-'
    and the first three pieces are stored, as strings, under 'year',
    'month' and 'day' (in that order). Nothing is returned.
    """
    date_parts = row['End of Period'][:10].split('-')
    for position, field in enumerate(('year', 'month', 'day')):
        row[field] = date_parts[position]

In [12]:
# Save data to the database: resolve (or create) each dimension member for
# the row, then insert the fact that references those keys.
try:
    for row in dataPS:
        split_timestamp(row)  # adds 'year', 'month', 'day' to the row
        row['loanid'] = loan_dimension.ensure(row)
        row['eopid'] = eop_dimension.ensure(row)
        # No name mapping needed: the row already uses the dimension's
        # attribute names.
        row['countryid'] = country_dimension.ensure(row)
        row['regionid'] = region_dimension.ensure(row)
        row['projectid'] = project_dimension.ensure(row)
        # insert fact
        fact_table.insert(row)
    # Commit only once every row has been processed.
    dw_conn_wrapper.commit()
except Exception:
    # Undo the partial load so a failed run leaves no half-written data.
    dw_conn_wrapper.rollback()
    raise
finally:
    # Release the connection whether the load succeeded or failed.
    dw_conn_wrapper.close()

In [13]:
# List the column labels currently on `data`.
data.columns

Index(['End of Period', 'loannumber', 'region', 'countrycode', 'country', 'Borrower',
       'Guarantor Country Code', 'Guarantor', 'Loan Type', 'Loan Status', 'Interest Rate',
       'Currency of Commitment', 'projectidsrc', 'projectname', 'orig_principal_amt',
       'cancelled_amt', 'undisbursed_amt', 'disbursed_amt', 'repaid_to_ibrd', 'Due to IBRD',
       'Exchange Adjustment', 'Borrower's Obligation', 'Sold 3rd Party', 'Repaid 3rd Party',
       'Due 3rd Party', 'Loans Held', 'First Repayment Date', 'Last Repayment Date',
       'Agreement Signing Date', 'Board Approval Date', 'Effective Date', 'Closed Date',
       'Last Disbursement Date'],
      dtype='object')

In [14]:
# Print the first three rows of `data`.
print(data.head(3))

             End of Period loannumber                   region countrycode country  \
0  2011-04-30T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR  France   
1  2011-05-31T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR  France   
2  2011-06-30T00:00:00.000  IBRD00010  EUROPE AND CENTRAL ASIA          FR  France   

          Borrower Guarantor Country Code Guarantor Loan Type Loan Status  ...  Repaid 3rd Party  \
0  CREDIT NATIONAL                     FR    France  NON POOL      Repaid  ...         2.500e+08   
1  CREDIT NATIONAL                     FR    France  NON POOL      Repaid  ...         2.500e+08   
2  CREDIT NATIONAL                     FR    France  NON POOL      Repaid  ...         2.500e+08   

   Due 3rd Party Loans Held     First Repayment Date      Last Repayment Date  \
0            0.0        0.0  1952-11-01T00:00:00.000  1977-05-01T00:00:00.000   
1            0.0        0.0  1952-11-01T00:00:00.000  1977-05-01T00:00:00.000   
2          