In [471]:
import pandas as pd
import numpy as np
from statistics import mean

import cleaning

In [544]:
test= True

accounts = cleaning.clean_account()
cards = cleaning.clean_card(test)
clients = cleaning.clean_client()
districts = cleaning.clean_district()
loans = cleaning.clean_loans(test)
trans = cleaning.clean_trans(test)
disp = pd.read_csv('../../project/banking_data/disp.csv', sep=';')


## Creation of a new Transactions Dataframe

The new transactions consists on a dataframe where the transactions from the same account are all summarized in 1 row: `min no. trans`, `max no. trans`, `avg no. trans`, `min_balance`, `max_balance` and `avg_balance`.

In [545]:
# Sort values by account ID to make the job easier
trans = trans.sort_values(by="account_id")

aux = 0

# Create new dataframe with the columns needed
trans_final = pd.DataFrame(columns=["account_id", "no. movements", "min no. trans", "max no. trans", "avg no. trans", "min balance", "max balance", "avg balance"])

for i in loans["account_id"]:
    if i != aux:
        trans_test = trans.loc[trans['account_id'] == i]

        # Get the values for each column of the new dataframe
        no_movements = trans_test.shape[0]
        min_no_trans = min(trans_test["amount"])
        max_no_trans = max(trans_test["amount"])
        avg_no_trans = mean(trans_test["amount"])
        min_balance = min(trans_test["balance"])
        max_balance = max(trans_test["balance"])
        avg_balance = mean(trans_test["balance"])
        
        # Append to the new dataframe
        trans_final = trans_final.append({"account_id": i, "no. movements": no_movements, "min no. trans": min_no_trans, "max no. trans": max_no_trans, "avg no. trans": avg_no_trans, "min balance": min_balance, "max balance": max_balance, "avg balance": avg_balance}, ignore_index=True)
        
        aux = i

## Client-District Merge

Merge the clients with the districts using the `district_id` information.

In [546]:
# Add district columns to the right side of the clients columns 
client_district = clients.merge(districts, left_on="district_id", right_on="code ")

# Drop irrelevant columns
client_district = client_district.drop(columns=["name ", "district_id", "region"])

# Rename necessary columns so we don't have merge conflicts later
client_district = client_district.rename(columns={col:("client_district "+ col) for col in client_district.columns[4:]})

## Account-District Merge

In [547]:
# Add district columns to the right side of the accounts columns 
account_district = accounts.merge(districts, left_on="district_id", right_on="code ")

# Drop irrelevant columns
account_district = account_district.drop(columns=["name ", "district_id", "code ", "region"])

# Rename necessary columns so we don't have merge conflicts later
account_district = account_district.rename(columns={col:("account_district "+ col) for col in account_district.columns[4:]})

## Card-Client Merge

### Useful Renames

In [548]:
# Renamed so it doesn't get any problems while merging with the dispositions
cards = cards.rename(columns={"type":"card_type", "issued":"card_issued"})

### Merging

#### 1. Card-Disposition Merge

In [549]:
# Add disposition columns to the right side of the cards columns 
card_client = cards.merge(disp)

# Drop irrelevant columns
card_client = card_client.drop(columns=["card_id", "disp_id", "client_id", "type"])

# Get Clients that have Loans but don't have a Card
loan_client_no_card = []
for i in loans["account_id"]:
    if not i in list(card_client["account_id"]):
        loan_client_no_card.append(i)

# Get Clients that have a Card but don't have Loans
card_client_no_loan = []
for i in card_client["account_id"]:
    if not i in list(loans["account_id"]):
        card_client_no_loan.append(i)

# Erase account that have cards but don't have a loan
card_client = card_client.drop(index=card_client[card_client["account_id"].isin(card_client_no_loan)].index)

# Add accounts that have loans but don't have a card
loan_client_no_card = np.array([loan_client_no_card, ["None"] * len(loan_client_no_card), [0] * len(loan_client_no_card)]).transpose()
loan_client_no_card = pd.DataFrame(loan_client_no_card, columns=["account_id", "card_type", "card_issued"])

# Add accounts that have loans but don't have a card to the accounts that have loans and have a card
card_client = card_client.append(loan_client_no_card)
card_client["account_id"] = card_client["account_id"].astype(int)

## Account-Client Merge

In [550]:
# Merge the disposition columns with the accounts (with district information) columns
account_client = disp.merge(account_district)

# Merge the previous table with the clients (with the district information) columns
account_client = account_client.merge(client_district, on="client_id")

# The only loans that are relevant are the ones that belong to the owner
account_client = account_client[account_client["type"] == "OWNER"]

# Drop irrelevant columns: client id and type (it's always OWNER)
account_client = account_client.drop(columns=["client_id", "type"])

#client_district.sort_values(by="client_id")
account_client

Unnamed: 0,disp_id,account_id,frequency,date,no. of inhabitants,account_district no. of municipalities with inhabitants < 499,account_district no. of municipalities with inhabitants 500-1999,account_district no. of municipalities with inhabitants 2000-9999,account_district no. of municipalities with inhabitants >10000,account_district no. of cities,...,client_district ratio of urban inhabitants,client_district average salary,client_district unemploymant rate '95,client_district unemploymant rate '96,client_district unemploymant_growth,client_district no. of enterpreneurs per 1000 inhabitants,client_district no. of commited crimes '95,client_district no. of commited crimes '96,client_district crime_growth,client_district total_crime
0,1,1,monthly issuance,24-03-1995,70699,60,13,2,1,4,...,65.3,8968,2.83,3.35,0.52,131,1740.0,1910,170.0,3650.0
1,2,2,monthly issuance,26-02-1993,1204953,0,0,0,1,1,...,100.0,12541,0.29,0.43,0.14,167,85677.0,99107,13430.0,184784.0
3,4,3,monthly issuance,07-07-1997,95616,65,30,4,1,6,...,51.4,9307,3.85,4.43,0.58,118,2616.0,3040,424.0,5656.0
5,6,4,monthly issuance,21-02-1996,107870,84,29,6,1,6,...,58.0,8754,3.83,4.31,0.48,137,3804.0,3868,64.0,7672.0
6,7,5,monthly issuance,30-05-1997,58796,22,16,7,1,5,...,51.9,9045,3.13,3.60,0.47,124,1845.0,1879,34.0,3724.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5363,13623,11333,monthly issuance,26-05-1994,112065,95,19,7,1,8,...,69.4,11277,1.25,1.44,0.19,127,5179.0,4987,-192.0,10166.0
5364,13647,11349,weekly issuance,26-05-1995,1204953,0,0,0,1,1,...,100.0,12541,0.29,0.43,0.14,167,85677.0,99107,13430.0,184784.0
5366,13660,11359,monthly issuance,01-10-1994,117897,139,28,5,1,6,...,53.8,8814,4.76,5.74,0.98,107,2112.0,2059,-53.0,4171.0
5367,13663,11362,monthly issuance,14-10-1995,106054,38,25,6,2,6,...,63.1,8110,5.77,6.55,0.78,109,3244.0,3079,-165.0,6323.0


## Final Merge: Loan

### Useful Rename

In [551]:
loans = loans.rename(columns={"date":"loan_date", "amount":"loan_amount", "duration":"loan_duration"})
trans = trans.rename(columns={"amount":"no. transactions"})
account_client = account_client.rename(columns={"frequency":"account_frequency", "date": "account_creation"})


### Merging

In [553]:
# Merge accounts informations into loans columns
loan_final = loans.merge(account_client, on="account_id")

# Merge cards with clients informations into loans columns
loan_final = loan_final.merge(card_client, on="account_id")

# Merge transactions informations into loans colums
loan_final = loan_final.merge(trans_final, on="account_id")

# Drop irrelevant columns
loan_final = loan_final.drop(columns=["account_id", "disp_id"])


## Add to CSV File

In [554]:
loan_final.to_csv("../../csvs/loan_united_"+("test" if test else "train") + ".csv", index=False)
loan_final

Unnamed: 0,loan_id,loan_date,loan_amount,loan_duration,payments,status,account_frequency,account_creation,no. of inhabitants,account_district no. of municipalities with inhabitants < 499,...,client_district total_crime,card_type,card_issued,no. movements,min no. trans,max no. trans,avg no. trans,min balance,max balance,avg balance
0,5895,03-01-1997,93960,60,1566,,monthly issuance,09-10-1995,77917,85,...,4202.0,,0,89.0,14.6,36637.5,8051.737079,800.0,88246.7,54520.202247
1,7122,04-01-1997,260640,36,7240,,monthly issuance,02-09-1995,58400,65,...,2206.0,,0,78.0,4.6,41871.0,6935.733333,-718.6,88731.8,31518.182051
2,6173,08-01-1997,232560,48,4845,,issuance after transaction,19-04-1995,177686,69,...,12899.0,,0,176.0,45.7,43605.0,7217.385795,200.0,79286.6,40175.612500
3,6142,21-01-1997,221880,60,3698,,monthly issuance,16-03-1996,81344,61,...,5474.0,,0,71.0,2.0,33280.0,6430.809859,1000.0,74216.8,44440.912676
4,5358,21-01-1997,38520,12,3210,,monthly issuance,04-06-1995,95907,87,...,3326.0,,0,114.0,14.6,8800.0,2373.266667,900.0,31302.0,20231.313158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,4989,05-12-1998,352704,48,7348,,monthly issuance,10-07-1997,103347,87,...,4653.0,classic,07-06-1998,56.0,14.6,49101.0,8582.460714,800.0,74693.6,30797.487500
350,5221,05-12-1998,52512,12,4376,,monthly issuance,02-07-1997,157042,49,...,7553.0,,0,87.0,10.9,49972.0,12491.751724,1000.0,120419.7,59684.298851
351,6402,06-12-1998,139488,24,5812,,weekly issuance,12-05-1997,75232,55,...,5637.0,,0,104.0,30.0,52305.0,13016.101923,1100.0,95976.0,46755.305769
352,5346,06-12-1998,55632,24,2318,,monthly issuance,02-03-1997,285387,0,...,19986.0,,0,114.0,14.6,23522.0,6040.260526,500.0,52896.1,29645.320175
