# 1. Libraries and Datasets

In [757]:
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [758]:
# Import the datasets, 
# Specify the delimeter, header and columns

#  Describes static characteristics of an account
df_account = pd.read_csv('account.csv', delimiter=';', header=0)

# Describes a credit card issued to an account
df_card = pd.read_csv('card.csv', delimiter=';', header=0)

# Describes characteristics of a client
df_client = pd.read_csv('client.csv', delimiter=';', header=0)

# Relates together a client with an account
df_disp = pd.read_csv('disp.csv', delimiter=';', header=0)

# Describes demographic characteristics of a district
df_district = pd.read_csv('district.csv', delimiter=';', header=0)

# Describes a loan granted for a given account
df_loan = pd.read_csv('loan.csv', delimiter=';', header=0)

# Describes characteristics of a payment order
df_order = pd.read_csv('order.csv', delimiter=';', header=0)

# Describes one transaction on an account
df_trans = pd.read_csv('trans.csv', delimiter=';', header=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [759]:
df_account.head()

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,POPLATEK MESICNE,930101
1,3818,74,POPLATEK MESICNE,930101
2,704,55,POPLATEK MESICNE,930101
3,2378,16,POPLATEK MESICNE,930101
4,2632,24,POPLATEK MESICNE,930102


In [760]:
df_card.head()

Unnamed: 0,card_id,disp_id,type,issued
0,1005,9285,classic,931107 00:00:00
1,104,588,classic,940119 00:00:00
2,747,4915,classic,940205 00:00:00
3,70,439,classic,940208 00:00:00
4,577,3687,classic,940215 00:00:00


In [761]:
df_client.head()

Unnamed: 0,client_id,birth_number,district_id
0,1,706213,18
1,2,450204,1
2,3,406009,1
3,4,561201,5
4,5,605703,5


In [762]:
df_disp.head()

Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,OWNER
1,2,2,2,OWNER
2,3,3,2,DISPONENT
3,4,4,3,OWNER
4,5,5,3,DISPONENT


In [763]:
df_district.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040


In [764]:
df_loan.head()

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,930705,96396,12,8033.0,B
1,5316,1801,930711,165960,36,4610.0,A
2,6863,9188,930728,127080,60,2118.0,A
3,5325,1843,930803,105804,36,2939.0,A
4,7240,11013,930906,274740,60,4579.0,A


In [765]:
df_order.head()

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
0,29401,1,YZ,87144583,2452.0,SIPO
1,29402,2,ST,89597016,3372.7,UVER
2,29403,2,QR,13943797,7266.0,SIPO
3,29404,3,WX,83084338,1135.0,SIPO
4,29405,3,CD,24485939,327.0,


In [766]:
df_trans.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,930101,PRIJEM,VKLAD,700.0,700.0,,,
1,171812,576,930101,PRIJEM,VKLAD,900.0,900.0,,,
2,207264,704,930101,PRIJEM,VKLAD,1000.0,1000.0,,,
3,1117247,3818,930101,PRIJEM,VKLAD,600.0,600.0,,,
4,579373,1972,930102,PRIJEM,VKLAD,400.0,400.0,,,


# 2. Data Cleaning

### 2.1. Column Manipulation

In [767]:
# Index the account_id unique identifier
df_account = df_account.set_index('account_id')

In [768]:
# Drop df_account unnecessary data
df_account.drop(columns=['frequency', 'date'], inplace=True)

# Drop df_client unnecessary data
df_client.drop(columns=['district_id'], inplace=True)

# Drop df_disp unnecessary data
df_disp.drop(columns=['type'], inplace=True)

# Drop all demographic data except district name and region
df_district.drop(columns=['A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A12', 'A13', 'A14', 'A15', 'A16'], inplace=True)

# Drop df_order unnecessary data
df_order.drop(columns=['bank_to', 'k_symbol'], inplace=True)

# Drop df_trans unnecessary data
df_trans.drop(columns=['balance', 'k_symbol', 'bank'], inplace=True)

In [769]:
# Rename columns
df_client.rename(columns={'birth_number': 'date_of_birth'}, inplace=True)
df_order.rename(columns={'account_to': 'order_receiver'}, inplace=True)
df_order.rename(columns={'amount': 'order_amount'}, inplace=True)
df_trans.rename(columns={'date': 'trans_date'}, inplace=True)
df_trans.rename(columns={'type': 'trans_type'}, inplace=True)
df_trans.rename(columns={'operation': 'trans_mode'}, inplace=True)
df_trans.rename(columns={'amount': 'trans_amount'}, inplace=True)
df_trans.rename(columns={'account': 'trans_receiver'}, inplace=True)
df_district.rename(columns={'A1': 'district_id'}, inplace=True)
df_district.rename(columns={'A2': 'district_name'}, inplace=True)
df_district.rename(columns={'A3': 'district_region'}, inplace=True)
df_district.rename(columns={'A11': 'average_salary'}, inplace=True)

### 2.2. Row manipulation

In [770]:
# Define a function to convert the date_of_birth and trans_date values to the formatted dates
def convert_to_date(df, col):
    #df[col] = pd.to_datetime(df[col]).dt.date
    df['sex'] = df['birth_number'].apply(lambda x: 'Male' if int(x[-2]) % 2 == 0 else 'Female')
    return df

# Example usage:
df_client = convert_to_date(df_client, 'date_of_birth')
df_trans = convert_to_date(df_trans, 'trans_date')

In [771]:
# Change df_trans > type values to credit or debit
type_mapping = {'PRIJEM': 'credit', 'VYDAJ': 'debit'} # Define the mapping

# Replace values in the 'type' column using the mapping
df_trans['trans_type'] = df_trans['trans_type'].replace(type_mapping) # Replace the values

In [772]:
# Change df_trans > type values to its description
trans_mode_mapping = {'VYBER KARTOU': 'credit card withdrawal', 
                      'VKLAD': 'credit in cash', 
                      'PREVOD Z UCTU': 'collection from another bank',
                      'VYBER': 'withdrawal in cash', 
                      'PREVOD NA UCET': 'remittance to another bank'} # Define the mapping

# Replace values in the 'type' column using the mapping
df_trans['trans_mode'] = df_trans['trans_mode'].replace(trans_mode_mapping) # Replace the values

In [773]:
# Change df_order/df_trans > order_amount/trans_amount values from Czech Koruna to Euro

czk_to_eur = 0.040 # Define the actual exchange rate

# Define a function to convert from Czech Koruna to Euro
def convert_czk_to_eur(amount_czk):
    return amount_czk * czk_to_eur

# Apply the conversion function to the 'trans_amount' column
df_order['order_amount'] = df_order['order_amount'].apply(convert_czk_to_eur)
df_trans['trans_amount'] = df_trans['trans_amount'].apply(convert_czk_to_eur)
df_district['average_salary'] = df_district['average_salary'].apply(convert_czk_to_eur)

In [774]:
# Check the count of null values in the df_trans dataset
df_trans.isnull().sum()

trans_id               0
account_id             0
trans_date             0
trans_type             0
trans_mode        183114
trans_amount           0
trans_receiver    760931
dtype: int64

In [775]:
# Define a function to fill the null values with the median
def fill_with_median(df, col):
    for column in col:
        df[col] = df[col].fillna(df[col].median())
    return df

to_fill = ['trans_mode', 'trans_receiver']
df_trans = fill_with_median(df_trans, to_fill)

  df[col] = df[col].fillna(df[col].median())


In [776]:
# Merge df_account and df_order by account_id
df_merged = pd.merge(df_account, df_order, on='account_id', how='inner')

# Merge df_merged and df_trans by account_id
df_merged = pd.merge(df_merged, df_trans, on='account_id', how='left')

# Merge df_merged and df_disp by account_id
df_merged = pd.merge(df_merged, df_disp, on='account_id', how='left')

# Merge df_merged and df_client by client_id
df_merged = pd.merge(df_merged, df_client, on='client_id', how='left')

# Merge df_merged and df_district by district_id
df_merged = pd.merge(df_merged, df_district, on='district_id', how='left')

df_merged.head()

Unnamed: 0,account_id,district_id,order_id,order_receiver,order_amount,trans_id,trans_date,trans_type,trans_mode,trans_amount,trans_receiver,disp_id,client_id,date_of_birth,district_name,district_region,average_salary
0,576,55,30253,71033382,146.48,171812,1970-01-01,credit,credit in cash,36.0,45750951.0,692,692,1970-01-01,Brno - venkov,south Moravia,349.72
1,576,55,30253,71033382,146.48,171812,1970-01-01,credit,credit in cash,36.0,45750951.0,693,693,1970-01-01,Brno - venkov,south Moravia,349.72
2,576,55,30253,71033382,146.48,171813,1970-01-01,credit,collection from another bank,248.28,30300313.0,692,692,1970-01-01,Brno - venkov,south Moravia,349.72
3,576,55,30253,71033382,146.48,171813,1970-01-01,credit,collection from another bank,248.28,30300313.0,693,693,1970-01-01,Brno - venkov,south Moravia,349.72
4,576,55,30253,71033382,146.48,3549613,1970-01-01,credit,,0.804,45750951.0,692,692,1970-01-01,Brno - venkov,south Moravia,349.72


In [777]:
# Define the order of the columns
df_ordered = ['account_id', 
              'date_of_birth',
              'order_id', 
              'order_amount', 
              'order_receiver',
              'trans_id', 
              'trans_amount', 
              'trans_receiver', 
              'trans_date', 
              'trans_type', 
              'trans_mode', 
              'client_id', 
              'district_id', 
              'district_region', 
              'district_name', 
              'average_salary']

# Reorder the columns
df_merged = df_merged[df_ordered]

df_merged.head()

Unnamed: 0,account_id,date_of_birth,order_id,order_amount,order_receiver,trans_id,trans_amount,trans_receiver,trans_date,trans_type,trans_mode,client_id,district_id,district_region,district_name,average_salary
0,576,1970-01-01,30253,146.48,71033382,171812,36.0,45750951.0,1970-01-01,credit,credit in cash,692,55,south Moravia,Brno - venkov,349.72
1,576,1970-01-01,30253,146.48,71033382,171812,36.0,45750951.0,1970-01-01,credit,credit in cash,693,55,south Moravia,Brno - venkov,349.72
2,576,1970-01-01,30253,146.48,71033382,171813,248.28,30300313.0,1970-01-01,credit,collection from another bank,692,55,south Moravia,Brno - venkov,349.72
3,576,1970-01-01,30253,146.48,71033382,171813,248.28,30300313.0,1970-01-01,credit,collection from another bank,693,55,south Moravia,Brno - venkov,349.72
4,576,1970-01-01,30253,146.48,71033382,3549613,0.804,45750951.0,1970-01-01,credit,,692,55,south Moravia,Brno - venkov,349.72


In [780]:
df_district['district_region'].unique()

array(['Prague', 'central Bohemia', 'south Bohemia', 'west Bohemia',
       'north Bohemia', 'east Bohemia', 'south Moravia', 'north Moravia'],
      dtype=object)

In [781]:
df_district['district_name'].unique()

array(['Hl.m. Praha', 'Benesov', 'Beroun', 'Kladno', 'Kolin',
       'Kutna Hora', 'Melnik', 'Mlada Boleslav', 'Nymburk',
       'Praha - vychod', 'Praha - zapad', 'Pribram', 'Rakovnik',
       'Ceske Budejovice', 'Cesky Krumlov', 'Jindrichuv Hradec',
       'Pelhrimov', 'Pisek', 'Prachatice', 'Strakonice', 'Tabor',
       'Domazlice', 'Cheb', 'Karlovy Vary', 'Klatovy', 'Plzen - mesto',
       'Plzen - jih', 'Plzen - sever', 'Rokycany', 'Sokolov', 'Tachov',
       'Ceska Lipa', 'Decin', 'Chomutov', 'Jablonec n. Nisou', 'Liberec',
       'Litomerice', 'Louny', 'Most', 'Teplice', 'Usti nad Labem',
       'Havlickuv Brod', 'Hradec Kralove', 'Chrudim', 'Jicin', 'Nachod',
       'Pardubice', 'Rychnov nad Kneznou', 'Semily', 'Svitavy', 'Trutnov',
       'Usti nad Orlici', 'Blansko', 'Brno - mesto', 'Brno - venkov',
       'Breclav', 'Hodonin', 'Jihlava', 'Kromeriz', 'Prostejov', 'Trebic',
       'Uherske Hradiste', 'Vyskov', 'Zlin', 'Znojmo', 'Zdar nad Sazavou',
       'Bruntal', 'Frydek - M