In [1]:
#import libraries
import pandas as pd
from datetime import datetime
pd.options.display.max_columns = None # no limits to columns display

In [2]:
#import datasets
loans_lenders = pd.read_csv("data/loans_lenders.csv")
lenders = pd.read_csv("data/lenders.csv")
loans = pd.read_csv("data/loans.csv")

In [5]:
# 1) Normalize the loan_lenders table. In the normalized table, each row must have one loan_id and one lenders
#FUNZIONA
loans_lenders = loans_lenders.astype(str).applymap(lambda x: x.split(',')[0])
loans_lenders.columns = ["loan_id", "permanent_name"]
loans_lenders["loan_id"] = loans_lenders["loan_id"].astype(int)

In [56]:
# 2) For each loan, add a column duration corresponding to the number of days between 
#the disburse time and the planned expiration time. FUNZIONA
def time_delta(y,x): 
    end = pd.to_datetime(y)
    start = pd.to_datetime(x)
    delta = end-start
    return delta

loans["duration"] =  time_delta(loans.planned_expiration_time, loans.disburse_time)

In [37]:
# 3) find the borrowers that have funded at least twice.
not_unique_borrowers = loans.groupby("loan_name").size()
not_unique_borrowers = pd.DataFrame(not_unique_borrowers[not_unique_borrowers  > 1])
not_unique_borrowers.columns = ["number_of_loans"]

In [229]:
#4) For each country, compute how many loans have involved that country as borrowers.
#5) For each country, compute the overall amount of money borrowed.
aggregate_state = pd.concat([
    loans.groupby(['country_code']).count()['loan_id'],
    loans.groupby(['country_code']).sum()['funded_amount']], axis = 1)
aggregate_state.columns = ('count_loans', 'amount_loans')

In [59]:
# 6) Like the previous point, but expressed as a percentage of the overall amount lent.
merged = loans.merge(loans_lenders,on='loan_id').merge(lenders,on='permanent_name')


In [230]:
lent = merged.groupby(['country_code_y'])['funded_amount'].sum()
aggregate_state = pd.concat([aggregate_state,lent],axis=1,ignore_index=True).fillna(0)
aggregate_state.columns = ('count_loans', 'amount_loans', 'lent_amount')
aggregate_state["percentage"] = aggregate_state["lent_amount"] / aggregate_state["amount_loans"] *100

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [None]:
# 7) Like the three previous points, but split for each year (with respect to disburse time).

In [204]:
# 8) For each lender, compute the overall amount of money lent.
aggregate_lenders = pd.DataFrame(merged.groupby("permanent_name").sum()["funded_amount"])

In [231]:
# 9) For each country, compute the difference between the overall amount of money lent and the overall amount of money borrowed.
aggregate_state["difference"] = aggregate_state["lent_amount"] - aggregate_state["amount_loans"]

In [232]:
country_stats =pd.read_csv("data/country_stats.csv")
aggregate_state.reset_index(inplace = True)
aggregate_state=aggregate_state.rename(columns = {'index':'country_code'})
aggregate_state_extended = aggregate_state.merge(country_stats, on = "country_code")

In [239]:
# 10) Which country has the highest ratio between the difference computed at the previous point and the population?
aggregate_state_extended['ratio'] = aggregate_state_extended['difference'] / aggregate_state_extended['population']
aggregate_state_extended.iloc[aggregate_state_extended['ratio'].argmax()]

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  This is separate from the ipykernel package so we can avoid doing imports until


country_code                                  NO
count_loans                                    0
amount_loans                                   0
lent_amount                          1.25222e+07
percentage                                   inf
difference                           1.25222e+07
country_name                              Norway
country_code3                                NOR
continent                                 Europe
region                           Northern Europe
population                               5305383
population_below_poverty_line                NaN
hdi                                     0.949423
life_expectancy                           81.711
expected_years_of_schooling              17.6719
mean_years_of_schooling                  12.7464
gni                                      67614.4
kiva_country_name                         Norway
ratio                                    2.36027
Name: 109, dtype: object

In [244]:
#11) Which country has the highest ratio between the difference computed at point 9 and the population that is not below the poverty line?
aggregate_state_extended['ratio_not_poverty'] = aggregate_state_extended['difference'] / (aggregate_state_extended['population'] * (100 - aggregate_state_extended['population_below_poverty_line'] ))
aggregate_state_extended.iloc[aggregate_state_extended['ratio_not_poverty'].argmax()]

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  


country_code                                   CA
count_loans                                     1
amount_loans                                50000
lent_amount                           7.75041e+07
percentage                                 155008
difference                            7.74541e+07
country_name                               Canada
country_code3                                 CAN
continent                                Americas
region                           Northern America
population                               36624199
population_below_poverty_line                 9.4
hdi                                      0.920284
life_expectancy                            82.224
expected_years_of_schooling                16.325
mean_years_of_schooling                   13.1051
gni                                       42581.9
kiva_country_name                          Canada
ratio                                     2.11483
ratio_not_poverty                       0.0233425
