# Data Understanding

## Load Data

In [83]:
import pandas as pd

In [84]:
# Load the three pickled frames from the exports/ directory in one unpacking
# assignment. Pickle files can execute code on load, so only read trusted files.
merged_df, disbursements_df, repayments_df = (
    pd.read_pickle(path)
    for path in (
        "exports/merged.pkl",
        "exports/disbursements_clean.pkl",
        "exports/repayments_clean.pkl",
    )
)

## Analysis

In [85]:
# Total repaid amount per customer; as_index=False keeps `customer` as a
# regular column, so the result is a two-column frame with a fresh index.
repayments_summary = repayments_df.groupby("customer", as_index=False)["amount"].sum()
repayments_summary

Unnamed: 0,customer,amount
0,000514554c34603e8a7551050e988732cf11a22de40fa6...,31100.00
1,00065b32dfbc80c249efb220420bd4842c778d29ace681...,28281.06
2,000e57e83f161e4ba6458b3e32c00815405c5a005e652b...,10695.02
3,0024e99c9aa8106f28ba1980730ef6ab1649591a43ac83...,3200.10
4,002504d81a9c6d7a83aaba6820f6b20cbd2b917978cb07...,5133.00
...,...,...
2990,ff79919ba4789196e7afac6e1cebe9b20a308939b70624...,1105.01
2991,ff9951cd6541f36eef9e5f5b663f9772d4a492b33eba71...,6437.24
2992,ffbe3f5dfe9f103d05ba680350a6c0a148b03a12d10faf...,19484.00
2993,ffcbac0227255329a9b137f7394fcbe0fc052567ff31f8...,16048.40


### Understanding the data

In [86]:
# Number of distinct customers that appear in the disbursements.
disbursements_df["customer"].nunique()

2996

In [87]:
# All loans of one sample customer, oldest first. A single .loc call selects
# the rows and the columns together instead of chaining two [] lookups.
(
    disbursements_df
    .loc[
        disbursements_df["customer"] == "91810ca1aa097db79f050f38e9946fa5482b4e28c925e281a700eb36ee782565",
        ["date", "account", "tenure", "amount", "due"],
    ]
    .sort_values("date")
)

Unnamed: 0,date,account,tenure,amount,due
4592,2024-01-22,K61YSYRS6S0JJXBSQEYNS0EC9RLAFMOM,7,170,187.0
19893,2024-02-10,6CWZP5NXVFPHCCPISBWQWN3Y5V4O3C5P,14,340,380.8
1910,2024-03-02,C107RB9WI5E57FISK9R7JHBZYGWHKHOA,7,340,374.0
2370,2024-03-13,RRD74NPK12D6B04F9MMAAOX1HZ6D4Y8Y,7,360,396.0
0,2024-03-19,3O66YENWELA6E2H1R9YLX0LDZNOMNHD4,14,360,403.2
23706,2024-04-01,YNRY75JI87A5IB2NUXBCRUHM38TDNXVG,14,360,403.2
25349,2024-04-24,5IGQXUQ0HQFRTO0XUFP8LPC8LT06GXVD,14,380,425.6
9603,2024-05-08,W111ITPQMLG7WYQ3RAGOL0JFIAZ2IJ7H,7,200,220.0
8214,2024-05-12,WCSYBN23FNBK9S2U4DKZM8L9MICJPJVR,14,380,425.6
8934,2024-05-26,KONUH6IVMZAIXTSOZGUUJ4W5UR3FI53W,14,380,425.6


It appears that every loan has its own separate account: this customer's ten loans all carry distinct account IDs.

In [88]:
# All repayments of the same sample customer, oldest first. A single .loc call
# selects the rows and the columns together instead of chaining two [] lookups.
(
    repayments_df
    .loc[
        repayments_df["customer"] == "91810ca1aa097db79f050f38e9946fa5482b4e28c925e281a700eb36ee782565",
        ["date", "amount", "type"],
    ]
    .sort_values("date")
)

Unnamed: 0,date,amount,type
27168,2024-01-29,66.67,automatic
27178,2024-01-29,10.95,automatic
27169,2024-01-29,109.38,manual
58792,2024-02-24,5.95,automatic
59315,2024-02-26,66.0,manual
59327,2024-02-26,66.67,automatic
58842,2024-02-29,153.07,manual
58843,2024-02-29,89.11,automatic
17041,2024-03-03,374.0,manual
17396,2024-03-19,396.0,manual


#### Matching repayments to their relevant loans

In [89]:
# Print the column dtypes and non-null counts of the disbursements frame.
disbursements_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26542 entries, 0 to 26574
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   customer  26542 non-null  object        
 1   date      26542 non-null  datetime64[ns]
 2   tenure    26542 non-null  int64         
 3   account   26542 non-null  object        
 4   amount    26542 non-null  int64         
 5   fee       26542 non-null  float64       
 6   due       26542 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 1.6+ MB


In [90]:
# Print the column dtypes and non-null counts of the repayments frame.
repayments_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 65905 entries, 0 to 66004
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      65905 non-null  datetime64[ns]
 1   customer  65905 non-null  object        
 2   amount    65905 non-null  float64       
 3   type      65905 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 2.5+ MB


In [91]:
def assign_loan_accounts(repayments, disbursements, allow_same_day=True):
    """Attach to every repayment the account of the customer's latest prior loan.

    For each repayment row, the matching loan is the same customer's
    disbursement with the most recent ``date`` that is on or before the
    repayment date. Repayments with no such loan get ``None``.

    This is an as-of join (``pd.merge_asof``) instead of a Python loop that
    re-filters the whole disbursements frame for every single repayment.

    Parameters
    ----------
    repayments : pd.DataFrame
        Must contain ``customer`` and ``date`` columns.
    disbursements : pd.DataFrame
        Must contain ``customer``, ``date`` and ``account`` columns.
    allow_same_day : bool, default True
        When True, a loan disbursed on the repayment date itself is eligible
        (``loan date <= payment date``). When False, only strictly earlier
        loans are considered.

    Returns
    -------
    pd.DataFrame
        A copy of ``repayments`` (same row order and index) with an
        ``account`` column added or replaced.
    """
    # merge_asof requires both sides sorted by the `on` key. A stable sort keeps
    # the incoming relative order of rows that share a date, so ties between
    # same-day loans resolve to the last one in the caller's ordering.
    loans = disbursements[["customer", "date", "account"]].sort_values("date", kind="mergesort")

    # Remember each repayment's position so the result can be put back in the
    # caller's row order regardless of what the index looks like.
    payments = (
        repayments[["customer", "date"]]
        .reset_index(drop=True)
        .assign(_pos=range(len(repayments)))
        .sort_values("date", kind="mergesort")
    )

    matched = pd.merge_asof(
        payments,
        loans,
        on="date",
        by="customer",
        direction="backward",
        allow_exact_matches=allow_same_day,
    )

    accounts = matched.sort_values("_pos")["account"].astype(object)
    # Unmatched rows come back as NaN; store None, as a plain object column would.
    accounts = accounts.where(accounts.notna(), None)

    result = repayments.copy()
    result["account"] = accounts.to_numpy()
    return result


disbursements_df = disbursements_df.sort_values(["customer", "date"])
repayments_df = repayments_df.sort_values(["customer", "date"])

# NOTE(review): with the default rule (loan date <= payment date) a repayment
# made on the day a new loan is disbursed is attributed to that NEW loan. For
# the sample customer the 396.0 payment on 2024-03-19 equals the amount due on
# the loan from 2024-03-13, yet lands on the loan disbursed on 2024-03-19.
# Confirm the intended rule; pass allow_same_day=False to require an earlier loan.
repayments_df = assign_loan_accounts(repayments_df, disbursements_df)

In [92]:
# Display the repayments frame, which now carries the `account` column.
repayments_df

Unnamed: 0,date,customer,amount,type,account
63411,2024-02-13,000514554c34603e8a7551050e988732cf11a22de40fa6...,2038.08,automatic,X2JUBL7TGUNCKG5184U9AEN70I9G60A4
63261,2024-02-14,000514554c34603e8a7551050e988732cf11a22de40fa6...,100.00,automatic,X2JUBL7TGUNCKG5184U9AEN70I9G60A4
63007,2024-02-15,000514554c34603e8a7551050e988732cf11a22de40fa6...,195.33,automatic,X2JUBL7TGUNCKG5184U9AEN70I9G60A4
62832,2024-02-20,000514554c34603e8a7551050e988732cf11a22de40fa6...,31.78,automatic,X2JUBL7TGUNCKG5184U9AEN70I9G60A4
62833,2024-02-20,000514554c34603e8a7551050e988732cf11a22de40fa6...,66.89,automatic,X2JUBL7TGUNCKG5184U9AEN70I9G60A4
...,...,...,...,...,...
23062,2024-08-19,fff1eaa909563ca8116ac27992ffb7b8975f2493e2f0bc...,36.67,automatic,9IGIYHJDPJ2A0S8O9SBJYPGQ2GX8TRYN
23563,2024-08-20,fff1eaa909563ca8116ac27992ffb7b8975f2493e2f0bc...,39.44,automatic,9IGIYHJDPJ2A0S8O9SBJYPGQ2GX8TRYN
23447,2024-08-21,fff1eaa909563ca8116ac27992ffb7b8975f2493e2f0bc...,198.64,automatic,9IGIYHJDPJ2A0S8O9SBJYPGQ2GX8TRYN
23466,2024-08-21,fff1eaa909563ca8116ac27992ffb7b8975f2493e2f0bc...,47.96,automatic,9IGIYHJDPJ2A0S8O9SBJYPGQ2GX8TRYN


In [93]:
# Repayments of the sample customer with the account each one was assigned to,
# oldest first. A single .loc call selects the rows and the columns together.
(
    repayments_df
    .loc[
        repayments_df["customer"] == "91810ca1aa097db79f050f38e9946fa5482b4e28c925e281a700eb36ee782565",
        ["date", "amount", "type", "account"],
    ]
    .sort_values("date")
)

Unnamed: 0,date,amount,type,account
27168,2024-01-29,66.67,automatic,K61YSYRS6S0JJXBSQEYNS0EC9RLAFMOM
27169,2024-01-29,109.38,manual,K61YSYRS6S0JJXBSQEYNS0EC9RLAFMOM
27178,2024-01-29,10.95,automatic,K61YSYRS6S0JJXBSQEYNS0EC9RLAFMOM
58792,2024-02-24,5.95,automatic,6CWZP5NXVFPHCCPISBWQWN3Y5V4O3C5P
59315,2024-02-26,66.0,manual,6CWZP5NXVFPHCCPISBWQWN3Y5V4O3C5P
59327,2024-02-26,66.67,automatic,6CWZP5NXVFPHCCPISBWQWN3Y5V4O3C5P
58842,2024-02-29,153.07,manual,6CWZP5NXVFPHCCPISBWQWN3Y5V4O3C5P
58843,2024-02-29,89.11,automatic,6CWZP5NXVFPHCCPISBWQWN3Y5V4O3C5P
17041,2024-03-03,374.0,manual,C107RB9WI5E57FISK9R7JHBZYGWHKHOA
17396,2024-03-19,396.0,manual,3O66YENWELA6E2H1R9YLX0LDZNOMNHD4


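Checking the assignment for this customer: the repayments attributed to each of the first three loans sum exactly to that loan's `due` amount (187.0, 380.8 and 374.0). Caveat: the 396.0 payment on 2024-03-19 is attributed to the loan disbursed that same day, although it equals the amount due on the previous loan (disbursed 2024-03-13), so repayments made on a disbursement date may be misattributed. Write the new dataframe to file.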

In [94]:
# Save the repayments frame, including the `account` column, as a pickle file.
repayments_df.to_pickle("exports/repayments_accounts.pkl")