# Pre-processing

In [3]:
import os
import pandas as pd
from datetime import datetime, timedelta
from collections import Counter

# Load the raw Kiva loans export.
file = os.path.join('raw_data', 'kiva_loans.csv')
df = pd.read_csv(file)

## Convert date time types
# Assign with df[cols] rather than df.loc[:, cols]: since pandas 2.0 a
# full-column .loc assignment first tries to write into the existing columns
# in place, which keeps their original (non-datetime) dtype. Plain column
# assignment replaces the columns, so they really become datetime64.
time_columns = ['posted_time', 'disbursed_time', 'funded_time']
df[time_columns] = df[time_columns].apply(pd.to_datetime)

## Clean up gender

# Rule: a single gender is kept as-is; when several genders are listed, take the majority.

def normalize_gender(borrower_genders):
    """Collapse a comma-separated list of borrower genders into one label.

    Parameters
    ----------
    borrower_genders : str or any
        A string such as ``'female'`` or ``'female, male, female'``.
        Anything that is not a ``str`` (e.g. NaN for a missing entry) is
        passed through untouched.

    Returns
    -------
    str or any
        The most frequent label in the list; on a tie, the label that
        appears first wins. A string that contains no label at all yields
        ``''``. Non-string input is returned unchanged.
    """
    if not isinstance(borrower_genders, str):
        return borrower_genders

    # Split on bare commas and drop spaces so 'a,b' and 'a, b' are treated
    # alike; skip empty entries so a stray comma is never counted as a gender.
    labels = [
        label
        for label in (part.replace(' ', '') for part in borrower_genders.split(','))
        if label
    ]
    if not labels:
        return ''
    return Counter(labels).most_common(1)[0][0]

# Derive one `gender` label per row from the raw `borrower_genders` value.
df['gender'] = df['borrower_genders'].map(normalize_gender)

In [16]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,funded_amount,loan_amount,activity,sector,use,country_code,country,region,currency,...,posted_time,disbursed_time,funded_time,term_in_months,lender_count,tags,borrower_genders,repayment_interval,date,gender
0,653051,300.0,300.0,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",PK,Pakistan,Lahore,PKR,...,2014-01-01 06:12:39,2013-12-17 08:00:00,2014-01-02 10:06:32,12.0,12,,female,irregular,2014-01-01,female
1,653053,575.0,575.0,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,PK,Pakistan,Lahore,PKR,...,2014-01-01 06:51:08,2013-12-17 08:00:00,2014-01-02 09:17:23,11.0,14,,"female, female",irregular,2014-01-01,female
2,653068,150.0,150.0,Transportation,Transportation,To repair their old cycle-van and buy another ...,IN,India,Maynaguri,INR,...,2014-01-01 09:58:07,2013-12-17 08:00:00,2014-01-01 16:01:36,43.0,6,"user_favorite, user_favorite",female,bullet,2014-01-01,female
3,653063,200.0,200.0,Embroidery,Arts,to purchase an embroidery machine and a variet...,PK,Pakistan,Lahore,PKR,...,2014-01-01 08:03:11,2013-12-24 08:00:00,2014-01-01 13:00:00,11.0,8,,female,irregular,2014-01-01,female
4,653084,400.0,400.0,Milk Sales,Food,to purchase one buffalo.,PK,Pakistan,Abdul Hakeem,PKR,...,2014-01-01 11:53:19,2013-12-17 08:00:00,2014-01-01 19:18:51,14.0,16,,female,monthly,2014-01-01,female


In [17]:
# Distinct repayment schedules present in the data.
df['repayment_interval'].unique()

array(['irregular', 'bullet', 'monthly', 'weekly'], dtype=object)

In [4]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,id,funded_amount,loan_amount,partner_id,term_in_months,lender_count
count,671205.0,671205.0,671205.0,657698.0,671205.0,671205.0
mean,993248.6,785.995061,842.397107,178.199616,13.739022,20.590922
std,196611.3,1130.398941,1198.660073,94.247581,8.598919,28.459551
min,653047.0,0.0,25.0,9.0,1.0,0.0
25%,823072.0,250.0,275.0,126.0,8.0,7.0
50%,992780.0,450.0,500.0,145.0,13.0,13.0
75%,1163653.0,900.0,1000.0,204.0,14.0,24.0
max,1340339.0,100000.0,100000.0,536.0,158.0,2986.0


In [5]:
# List the column names, including the derived `gender` column.
df.columns

Index(['id', 'funded_amount', 'loan_amount', 'activity', 'sector', 'use',
       'country_code', 'country', 'region', 'currency', 'partner_id',
       'posted_time', 'disbursed_time', 'funded_time', 'term_in_months',
       'lender_count', 'tags', 'borrower_genders', 'repayment_interval',
       'date', 'gender'],
      dtype='object')

In [7]:
# For each listed column: min, max, non-null count, and number of unique values.
(
    df.loc[:, [
        'activity', 'sector', 'use', 'country_code', 'country', 'region',
        'currency', 'partner_id', 'posted_time', 'disbursed_time',
        'funded_time', 'tags', 'borrower_genders', 'repayment_interval',
        'date', 'gender',
    ]]
    .agg(['min', 'max', 'count', lambda col: len(col.unique())])
)

Unnamed: 0,activity,sector,use,country_code,country,region,currency,partner_id,posted_time,disbursed_time,funded_time,tags,borrower_genders,repayment_interval,date,gender
<lambda>,163,15,424914.0,87.0,87,12696.0,67,367.0,667399,5720,498008,86720.0,11299.0,4,1298,3.0
count,671205,671205,666977.0,671197.0,671205,614405.0,671205,657698.0,671205,668809,622874,499789.0,666984.0,671205,671205,666984.0
max,Wholesale,Wholesale,,,Zimbabwe,,ZWD,536.0,2017-07-26 06:31:46,2017-09-30 07:00:00,2017-07-26 02:09:43,,,weekly,2017-07-26,
min,Adult Care,Agriculture,,,Afghanistan,,ALL,9.0,2014-01-01 04:49:26,2013-12-02 08:00:00,2014-01-01 12:18:55,,,bullet,2014-01-01,


In [12]:
# Number of rows per normalized gender label.
df['gender'].value_counts()

female    517592
male      149392
Name: gender, dtype: int64

In [15]:
# Distinct countries, in order of first appearance.
df['country'].unique()

array(['Pakistan', 'India', 'Kenya', 'Nicaragua', 'El Salvador',
       'Tanzania', 'Philippines', 'Peru', 'Senegal', 'Cambodia', 'Liberia',
       'Vietnam', 'Iraq', 'Honduras', 'Palestine', 'Mongolia',
       'United States', 'Mali', 'Colombia', 'Tajikistan', 'Guatemala',
       'Ecuador', 'Bolivia', 'Yemen', 'Ghana', 'Sierra Leone', 'Haiti',
       'Chile', 'Jordan', 'Uganda', 'Burundi', 'Burkina Faso',
       'Timor-Leste', 'Indonesia', 'Georgia', 'Ukraine', 'Kosovo',
       'Albania', 'The Democratic Republic of the Congo', 'Costa Rica',
       'Somalia', 'Zimbabwe', 'Cameroon', 'Turkey', 'Azerbaijan',
       'Dominican Republic', 'Brazil', 'Mexico', 'Kyrgyzstan', 'Armenia',
       'Paraguay', 'Lebanon', 'Samoa', 'Israel', 'Rwanda', 'Zambia',
       'Nepal', 'Congo', 'Mozambique', 'South Africa', 'Togo', 'Benin',
       'Belize', 'Suriname', 'Thailand', 'Nigeria', 'Mauritania',
       'Vanuatu', 'Panama', 'Virgin Islands',
       'Saint Vincent and the Grenadines',
       "Lao Peo

In [14]:
# Number of rows per country, most frequent first.
df['country'].value_counts()

Philippines                         160441
Kenya                                75825
El Salvador                          39875
Cambodia                             34836
Pakistan                             26857
Peru                                 22233
Colombia                             21995
Uganda                               20601
Tajikistan                           19580
Ecuador                              13521
Paraguay                             11903
Nicaragua                            11781
India                                11237
Vietnam                              10843
Nigeria                              10136
Bolivia                               8806
Lebanon                               8792
Armenia                               8631
Palestine                             8167
Samoa                                 7396
Guatemala                             7310
Rwanda                                6735
Mali                                  6639
Honduras   