In [53]:
#import libraries
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

pd.options.display.max_columns = None # no limits to columns display

In [3]:
#import datasets
path = "data/"
country_stats =pd.read_csv(path + "country_stats.csv")
loans_lenders = pd.read_csv(path+"loans_lenders.csv", nrows=100000)
lenders = pd.read_csv(path+"lenders.csv")
loans = pd.read_csv(path+"loans.csv", nrows=100000)

In [3]:
# 1) Normalize the loan_lenders table. In the normalized table, each row must have one loan_id and one lenders
#FUNZIONA

loans_lenders = pd.concat([pd.Series(row['loan_id'], row['lenders'].split(',')) for _,row in loans_lenders.iterrows()]).reset_index()
loans_lenders.columns = ['permanent_name', 'loan_id']

In [4]:
# 2) For each loan, add a column duration corresponding to the number of days between 
#the disburse time and the planned expiration time. 
#FUNZIONA
def time_delta(y,x): 
    end = pd.to_datetime(y)
    start = pd.to_datetime(x)
    delta = end-start
    return delta

loans["duration"] =  time_delta(loans.planned_expiration_time, loans.disburse_time)


In [6]:
# 3) find the "lenders" that have funded at least twice.
# FUNZIONA
not_unique_lenders = loans_lenders.groupby("permanent_name").size()
not_unique_lenders = pd.DataFrame(not_unique_lenders[not_unique_lenders  > 1])
not_unique_lenders.columns = ["number_of_loans"]
not_unique_lenders.head()

Unnamed: 0_level_0,number_of_loans
permanent_name,Unnamed: 1_level_1
000,3
00000,5
0002,3
0101craign0101,4
0326lsw,3


In [7]:
#4) For each country, compute how many loans have involved that country as borrowers.
#5) For each country, compute the overall amount of money borrowed.
# FUNZIONA
aggregate_state = pd.concat([
    loans.groupby(['country_code']).count()['loan_id'],
    loans.groupby(['country_code']).sum()['funded_amount']], axis = 1)
aggregate_state.columns = ('count_loans', 'amount_loans')

In [8]:
# 6) Like the previous point, but expressed as a percentage of the overall amount lent.
# FUNZIONA
merged = loans.merge(loans_lenders,on='loan_id').merge(lenders,on='permanent_name')
lent = merged.groupby(['country_code_y'])['funded_amount'].sum()
aggregate_state = pd.concat([aggregate_state,lent],axis=1,ignore_index=True, sort=False).fillna(0)
aggregate_state.columns = ('count_loans', 'amount_loans', 'lent_amount')
aggregate_state["percentage"] = aggregate_state["lent_amount"] / aggregate_state["amount_loans"] *100

In [9]:
aggregate_state

Unnamed: 0,count_loans,amount_loans,lent_amount,percentage
AF,98.0,86125.0,700.0,0.812772
AL,241.0,314775.0,1400.0,0.444762
AM,971.0,1429575.0,0.0,0.000000
AZ,661.0,1043300.0,475.0,0.045529
BA,29.0,20475.0,1725.0,8.424908
BF,245.0,263450.0,0.0,0.000000
BG,3.0,2950.0,8275.0,280.508475
BI,147.0,414775.0,0.0,0.000000
BJ,425.0,291650.0,0.0,0.000000
BO,1735.0,2904850.0,600.0,0.020655


In [7]:
# 7) Like the three previous points, but split for each year (with respect to disburse time).
# FUNZIONA
loans['disburse_time'] = pd.to_datetime(loans['disburse_time'])
loans['year'] = loans['disburse_time'].dt.year
aggregate_state_years = pd.concat([
    loans.groupby(['country_code', 'year']).count()['loan_id'],
    loans.groupby(['country_code', 'year']).sum()['funded_amount']], axis = 1)
aggregate_state_years.columns = ('count_loans', 'amount_loans')

In [11]:
aggregate_state_years

Unnamed: 0_level_0,Unnamed: 1_level_0,count_loans,amount_loans
country_code,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AF,2007.0,10,5750.0
AF,2008.0,6,5875.0
AF,2009.0,51,44950.0
AF,2010.0,25,23450.0
AF,2011.0,6,6100.0
AL,2012.0,36,35775.0
AL,2013.0,64,88025.0
AL,2014.0,54,86250.0
AL,2015.0,27,34950.0
AL,2016.0,28,32525.0


In [12]:
# 8) For each lender, compute the overall amount of money lent.
# FUNZIONA
merged['single_loan_amount']= merged['funded_amount'] / merged['num_lenders_total']
lender_amount = merged.groupby(['permanent_name']).sum()['single_loan_amount']

In [13]:
# 9) For each country, compute the difference between the overall amount of money lent and the overall amount of money borrowed.
aggregate_state["difference"] = aggregate_state["lent_amount"] - aggregate_state["amount_loans"]

In [15]:
aggregate_state.reset_index(inplace = True)
aggregate_state=aggregate_state.rename(columns = {'index':'country_code'})
aggregate_state_extended = aggregate_state.merge(country_stats, on = "country_code")

In [16]:
# 10) Which country has the highest ratio between the difference computed at the previous point and the population?
# FUNZIONA
aggregate_state_extended['ratio'] = aggregate_state_extended['difference'] / aggregate_state_extended['population']
aggregate_state_extended.iloc[aggregate_state_extended['ratio'].idxmax()]

country_code                                  NO
count_loans                                    0
amount_loans                                   0
lent_amount                               509325
percentage                                   inf
difference                                509325
country_name                              Norway
country_code3                                NOR
continent                                 Europe
region                           Northern Europe
population                               5305383
population_below_poverty_line                NaN
hdi                                     0.949423
life_expectancy                           81.711
expected_years_of_schooling              17.6719
mean_years_of_schooling                  12.7464
gni                                      67614.4
kiva_country_name                         Norway
ratio                                  0.0960016
Name: 125, dtype: object

In [17]:
#11) Which country has the highest ratio between the difference computed at point 9 and the population that is not below the poverty line?
aggregate_state_extended['ratio_not_poverty'] = aggregate_state_extended['difference'] / (aggregate_state_extended['population'] * (100 - aggregate_state_extended['population_below_poverty_line'] ))
aggregate_state_extended.iloc[aggregate_state_extended['ratio_not_poverty'].idxmax()]

country_code                                   CA
count_loans                                     0
amount_loans                                    0
lent_amount                           3.22205e+06
percentage                                    inf
difference                            3.22205e+06
country_name                               Canada
country_code3                                 CAN
continent                                Americas
region                           Northern America
population                               36624199
population_below_poverty_line                 9.4
hdi                                      0.920284
life_expectancy                            82.224
expected_years_of_schooling                16.325
mean_years_of_schooling                   13.1051
gni                                       42581.9
kiva_country_name                          Canada
ratio                                    0.087976
ratio_not_poverty                     0.000971037


In [60]:
# 12 For each year, compute the total amount of loans. 
#Each loan that has planned expiration time and disburse time 
#in different years must have its amount distributed proportionally to the number of days 
#in each year. For example, a loan with disburse time December 1st, 2016, planned expiration 
#time January 30th 2018, and amount 5000USD has an amount of 5000USD * 31 / (31+365+30) = 363.85
#for 2016, 5000USD * 365 / (31+365+30) = 4284.04 for 2017, and 5000USD * 30 / (31+365+30) = 352.11 for 2018.
loans['planned_expiration_time'] = pd.to_datetime(loans['planned_expiration_time'])
def delta_date(start, stop):
    delta = stop - start
    
    if delta.days < 0 or pd.isnull(start) or pd.isnull(stop): #check error
        return {}
    years = {}
    for i in range(delta.days + 1):
        day = start + timedelta(days=i)
        if not day.year in years:
            years[day.year] = 1;
        else:
            years[day.year] += 1
    return years



Unnamed: 0_level_0,amount
year,Unnamed: 1_level_1


In [71]:
group_years = pd.DataFrame(columns = ["year","amount"])
group_years.set_index('year', inplace=True)
for _, row in loans.iterrows():
    years = delta_date(row["disburse_time"], row["planned_expiration_time"])
    if (row["planned_expiration_time"] - row["disburse_time"]).days == 0:
        continue
    per_day = row["loan_amount"] / (row["planned_expiration_time"] - row["disburse_time"]).days 
    for key, value in years.items():
        if key not in group_years.index:
            group_years.loc[key] = per_day * value
        else:
           group_years["amount"][key] += per_day * value 

In [73]:
group_years

Unnamed: 0_level_0,amount
year,Unnamed: 1_level_1
2013,14215980.0
2014,14741270.0
2015,8377948.0
2012,9098303.0
2016,9028823.0
2017,11674080.0
2018,235364.8
2011,22182.74
