In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
# Import the datasets, 
# Specify the delimeter, header and columns

#  Describes static characteristics of an account
df_account = pd.read_csv('account.csv', delimiter=';', header=0, names=['account_id', 'district_id', 'frequency', 'date'])

# Describes a credit card issued to an account
df_card = pd.read_csv('card.csv', delimiter=';', header=0, names=['card_id', 'disp_id', 'type', 'issued'])

# Describes characteristics of a client
df_client = pd.read_csv('client.csv', delimiter=';', header=0, names=['client_id', 'birth_number', 'district_id'])

# Relates together a client with an account
df_disp = pd.read_csv('disp.csv', delimiter=';', header=0, names=['disp_id', 'client_id', 'account_id', 'type'])

# Describes demographic characteristics of a district
df_district = pd.read_csv('district.csv', delimiter=';', header=0, names=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16'])

# Describes a loan granted for a given account
df_loan = pd.read_csv('loan.csv', delimiter=';', header=0, names=['loan_id', 'account_id', 'date', 'amount', 'duration', 'payments', 'status'])

# Describes characteristics of a payment order
df_order = pd.read_csv('order.csv', delimiter=';', header=0, names=['order_id', 'account_id', 'bank_to', 'account_to', 'amount', 'k_symbol'])

# Describes one transaction on an account
df_trans = pd.read_csv('trans.csv', delimiter=';', header=0, names=['trans_id', 'account_id', 'date', 'type', 'operation', 'amount', 'balance', 'k_symbol', 'bank', 'account'])

  df_trans = pd.read_csv('trans.csv', delimiter=';', header=0, names=['trans_id', 'account_id', 'date', 'type', 'operation', 'amount', 'balance', 'k_symbol', 'bank', 'account'])


In [3]:
df_district.rename(columns={'A1': 'district_id'}, inplace=True)

In [4]:
# Merge df_account and df_order by account_id
df_merged = pd.merge(df_account, df_order, on='account_id', how='left')
df_merged.drop(columns=['district_id'], inplace=True)

# Merge df_merged and df_trans by account_id
df_merged = pd.merge(df_merged, df_trans, on='account_id', how='left')

# Merge df_merged and df_disp by account_id
df_merged = pd.merge(df_merged, df_disp, on='account_id', how='left')

# Merge df_merged and df_client by client_id
df_merged = pd.merge(df_merged, df_client, on='client_id', how='left')

# Merge df_merged and df_district by district_id
df_merged = pd.merge(df_merged, df_district, on='district_id', how='left')

In [5]:
df_merged

Unnamed: 0,account_id,frequency,date_x,order_id,bank_to,account_to,amount_x,k_symbol_x,trans_id,date_y,...,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,576,POPLATEK MESICNE,930101,30253.0,OP,71033382.0,3662.0,SIPO,171812,930101,...,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
1,576,POPLATEK MESICNE,930101,30253.0,OP,71033382.0,3662.0,SIPO,171812,930101,...,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
2,576,POPLATEK MESICNE,930101,30253.0,OP,71033382.0,3662.0,SIPO,171813,930111,...,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
3,576,POPLATEK MESICNE,930101,30253.0,OP,71033382.0,3662.0,SIPO,171813,930111,...,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
4,576,POPLATEK MESICNE,930101,30253.0,OP,71033382.0,3662.0,SIPO,3549613,930131,...,0,1,1,100.0,10673,4.75,5.44,100,18782,18347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2243453,3276,POPLATEK MESICNE,971229,34262.0,WX,88365083.0,1017.0,POJISTNE,961640,981208,...,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
2243454,3276,POPLATEK MESICNE,971229,34262.0,WX,88365083.0,1017.0,POJISTNE,961634,981213,...,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
2243455,3276,POPLATEK MESICNE,971229,34262.0,WX,88365083.0,1017.0,POJISTNE,961634,981213,...,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
2243456,3276,POPLATEK MESICNE,971229,34262.0,WX,88365083.0,1017.0,POJISTNE,3639005,981231,...,0,1,1,100.0,12541,0.29,0.43,167,85677,99107


In [7]:
# Drop df_account unnecessary data
df_merged.drop(columns=['frequency', 'date_x'], inplace=True)

# Drop df_order unnecessary data
df_merged.drop(columns=['bank_to', 'k_symbol_x'], inplace=True)

# Drop df_trans unnecessary data
df_merged.drop(columns=['balance', 'k_symbol_y', 'bank'], inplace=True)

# Drop df_disp unnecessary data
df_merged.drop(columns=['disp_id', 'type_y'], inplace=True)

# Drop df_client unnecessary data
df_merged.drop(columns=['birth_number'], inplace=True)

# Drop all demographic data except district name and region
df_merged.drop(columns=['A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16'], inplace=True)


In [8]:
df_merged

Unnamed: 0,account_id,order_id,account_to,amount_x,trans_id,date_y,type_x,operation,amount_y,account,client_id,district_id,A2,A3
0,576,30253.0,71033382.0,3662.0,171812,930101,PRIJEM,VKLAD,900.0,,692,74,Ostrava - mesto,north Moravia
1,576,30253.0,71033382.0,3662.0,171812,930101,PRIJEM,VKLAD,900.0,,693,74,Ostrava - mesto,north Moravia
2,576,30253.0,71033382.0,3662.0,171813,930111,PRIJEM,PREVOD Z UCTU,6207.0,30300313.0,692,74,Ostrava - mesto,north Moravia
3,576,30253.0,71033382.0,3662.0,171813,930111,PRIJEM,PREVOD Z UCTU,6207.0,30300313.0,693,74,Ostrava - mesto,north Moravia
4,576,30253.0,71033382.0,3662.0,3549613,930131,PRIJEM,,20.1,,692,74,Ostrava - mesto,north Moravia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2243453,3276,34262.0,88365083.0,1017.0,961640,981208,VYDAJ,VYBER,1920.0,,3966,1,Hl.m. Praha,Prague
2243454,3276,34262.0,88365083.0,1017.0,961634,981213,VYDAJ,PREVOD NA UCET,1017.0,88365083.0,3965,1,Hl.m. Praha,Prague
2243455,3276,34262.0,88365083.0,1017.0,961634,981213,VYDAJ,PREVOD NA UCET,1017.0,88365083.0,3966,1,Hl.m. Praha,Prague
2243456,3276,34262.0,88365083.0,1017.0,3639005,981231,PRIJEM,,177.1,,3965,1,Hl.m. Praha,Prague


In [9]:
# Rename columns
df_merged.rename(columns={'account_to': 'order_receiver'}, inplace=True)
df_merged.rename(columns={'amount_x': 'order_amount'}, inplace=True)
df_merged.rename(columns={'date_y': 'trans_date'}, inplace=True)
df_merged.rename(columns={'type_x': 'trans_type'}, inplace=True)
df_merged.rename(columns={'operation': 'trans_mode'}, inplace=True)
df_merged.rename(columns={'amount_y': 'trans_amount'}, inplace=True)
df_merged.rename(columns={'account': 'trans_receiver'}, inplace=True)
df_merged.rename(columns={'A2': 'district_name'}, inplace=True)
df_merged.rename(columns={'A3': 'district_region'}, inplace=True)

In [10]:
# Define the order of the columns
df_ordered = ['account_id', 
              'order_id', 'order_amount', 'order_receiver',
              'trans_id', 'trans_amount', 'trans_receiver', 'trans_date', 'trans_type', 'trans_mode', 
              'client_id', 
              'district_id', 'district_region', 'district_name']

# Reorder the columns
df_merged = df_merged[df_ordered]

In [11]:
df_merged

Unnamed: 0,account_id,order_id,order_amount,order_receiver,trans_id,trans_amount,trans_receiver,trans_date,trans_type,trans_mode,client_id,district_id,district_region,district_name
0,576,30253.0,3662.0,71033382.0,171812,900.0,,930101,PRIJEM,VKLAD,692,74,north Moravia,Ostrava - mesto
1,576,30253.0,3662.0,71033382.0,171812,900.0,,930101,PRIJEM,VKLAD,693,74,north Moravia,Ostrava - mesto
2,576,30253.0,3662.0,71033382.0,171813,6207.0,30300313.0,930111,PRIJEM,PREVOD Z UCTU,692,74,north Moravia,Ostrava - mesto
3,576,30253.0,3662.0,71033382.0,171813,6207.0,30300313.0,930111,PRIJEM,PREVOD Z UCTU,693,74,north Moravia,Ostrava - mesto
4,576,30253.0,3662.0,71033382.0,3549613,20.1,,930131,PRIJEM,,692,74,north Moravia,Ostrava - mesto
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2243453,3276,34262.0,1017.0,88365083.0,961640,1920.0,,981208,VYDAJ,VYBER,3966,1,Prague,Hl.m. Praha
2243454,3276,34262.0,1017.0,88365083.0,961634,1017.0,88365083.0,981213,VYDAJ,PREVOD NA UCET,3965,1,Prague,Hl.m. Praha
2243455,3276,34262.0,1017.0,88365083.0,961634,1017.0,88365083.0,981213,VYDAJ,PREVOD NA UCET,3966,1,Prague,Hl.m. Praha
2243456,3276,34262.0,1017.0,88365083.0,3639005,177.1,,981231,PRIJEM,,3965,1,Prague,Hl.m. Praha


###### account_id , order_id, trans_id, client_id, district_id
Record identifiers

###### order_amount
Amount debited from order account

###### order_receiver
Account of the order recipient

###### trans_amount
Amount of transaction

###### trans_receiver
Account of the transaction recipient

###### trans_date
Date of transaction
- In the form: YYMMDD

###### trans_type
Debit/credit transaction
- 'PRIJEM' stands for Credit
- 'VYDAJ' stands for Debit (withdrawal)

###### trans_mode
Mode of transaction
- 'VYBER KARTOU' stands for Credit Card Withdrawal
- 'VKLAD' stands for Credit in Cash
- 'PREVOD Z UCTU' stands for Collection from Another Bank
- 'VYBER' stands for Withdrawal in Cash
- 'PREVOD NA UCET' stands for Remittance to Another Bank

###### district_region
Region name

###### district_name
District name