# Context

In [1]:
import pandas as pd

# Transactions table: comma-separated; skipinitialspace tells the parser to
# ignore spaces that directly follow a delimiter.
train_data = pd.read_csv('data/train.csv', skipinitialspace=True)

# The two lookup tables are both read with ';' as the field separator.
codes_data, types_data = (
    pd.read_csv(lookup_path, sep=';')
    for lookup_path in ('data/codes.csv', 'data/types.csv')
)

# Data Exploration

## train.csv

The `train.csv` file has 4400 rows and 6 columns. Let's verify that the `train_data` variable has the correct shape.

In [2]:
train_data.shape

(4400, 6)

So `train_data` correctly reflects the provided dataset.

Let's verify columns by printing preview of `train_data`

In [3]:
train_data.head()

Unnamed: 0,client_id,datetime,code,type,amount,target
0,16004843,245 22:10:18,5411,1010,-7928.08,0
1,20873616,433 07:44:21,4814,1030,-56.15,0
2,65231233,164 15:58:23,5411,1010,-4689.47,1
3,29536165,413 11:47:51,5411,1010,-3703.52,1
4,66473821,280 05:17:26,6010,7030,67377.47,1


Let's explore `train_data` columns:

In [4]:
# Summary statistics of the numeric columns, with every value rendered as a
# string with two decimal places (avoids scientific notation in the display).
train_data.describe().apply(lambda column: column.map('{0:.2f}'.format))

Unnamed: 0,client_id,code,type,amount,target
count,4400.0,4400.0,4400.0,4400.0,4400.0
mean,51180948.42,5591.18,2477.99,-25829.42,0.45
std,28459423.46,618.25,2239.28,379993.04,0.5
min,31385.0,742.0,1010.0,-11229578.86,0.0
25%,26409535.0,5211.0,1030.0,-22459.16,0.0
50%,53203093.5,5541.0,1110.0,-5362.35,0.0
75%,75082762.25,6010.0,2370.0,-1122.96,1.0
max,99938948.0,9399.0,7075.0,6737747.32,1.0


Let's explore `train_data` column types:

In [5]:
train_data.dtypes

client_id      int64
datetime      object
code           int64
type           int64
amount       float64
target         int64
dtype: object

Let's check the `client_id` values for uniqueness. Maybe they are all unique?

In [6]:
train_data['client_id'].is_unique

False

So `client_id` has duplicate values and cannot be used as an ID column.

Let's split the `datetime` column into two independent columns.

In [7]:
# Break each `datetime` string at the space: the leading token is stored in
# `unknown_0` and the trailing token in `time`; the combined column is then
# removed from the frame in place.
datetime_parts = train_data['datetime'].str.split(' ', expand=True)
train_data[['unknown_0', 'time']] = datetime_parts
del train_data['datetime']
train_data.head()

Unnamed: 0,client_id,code,type,amount,target,unknown_0,time
0,16004843,5411,1010,-7928.08,0,245,22:10:18
1,20873616,4814,1030,-56.15,0,433,07:44:21
2,65231233,5411,1010,-4689.47,1,164,15:58:23
3,29536165,5411,1010,-3703.52,1,413,11:47:51
4,66473821,6010,7030,67377.47,1,280,05:17:26


## codes.csv

The `codes.csv` file has 184 rows and 2 columns. Let's verify that the `codes_data` variable has the correct shape.

In [8]:
codes_data.shape

(184, 2)

So `codes_data` correctly reflects the provided dataset.

Let's explore `codes_data` columns and their types

In [9]:
codes_data.head()

Unnamed: 0,code,code_description
0,5944,"Магазины по продаже часов, ювелирных изделий и..."
1,5621,Готовые сумочные изделия
2,5697,"Услуги по переделке, починке и пошиву одежды"
3,7995,Транзакции по азартным играм
4,5137,"Мужская, женская и детская спец-одежда"


In [10]:
codes_data.describe()

Unnamed: 0,code
count,184.0
mean,6046.793478
std,1470.328104
min,742.0
25%,5208.25
50%,5813.5
75%,7283.0
max,9402.0


In [11]:
codes_data.dtypes

code                 int64
code_description    object
dtype: object

Let's verify the uniqueness of the `code` column.

In [12]:
codes_data['code'].is_unique

True

So the `code` column can be used as the ID (index).

In [13]:
# Re-index the lookup table by its `code` column; the name is rebound to the
# re-indexed frame, which is then displayed.
codes_data = codes_data.set_index('code')
codes_data

Unnamed: 0_level_0,code_description
code,Unnamed: 1_level_1
5944,"Магазины по продаже часов, ювелирных изделий и..."
5621,Готовые сумочные изделия
5697,"Услуги по переделке, починке и пошиву одежды"
7995,Транзакции по азартным играм
5137,"Мужская, женская и детская спец-одежда"
...,...
5044,"Офисное, фотографическое, фотокопировальное, и..."
5983,"Горючее топливо — уголь, нефть, разжиженный бе..."
5994,Дилеры по продаже печатной продукции
5192,"Книги, периодические издания и газеты"


## types.csv

The `types.csv` file has 155 rows and 2 columns. Let's verify that the `types_data` variable has the correct shape.

In [14]:
types_data.shape

(155, 2)

So `types_data` correctly reflects the provided dataset.

Let's explore the `types_data` columns and their types.

In [15]:
types_data.head()

Unnamed: 0,type,type_description
0,8001,Установление расх. лимита по карте
1,2411,Перевод с карты на счет др.лица в одном тер. б...
2,4035,н/д(нет данных)
3,3001,Комиссия за обслуживание ссудного счета
4,2420,Перевод с карты на счет физ.лица в другом тер....


In [16]:
types_data.describe()

Unnamed: 0,type
count,155.0
mean,10819.045161
std,80000.258847
min,1000.0
25%,2385.5
50%,4040.0
75%,7027.5
max,999999.0


In [17]:
types_data.dtypes

type                 int64
type_description    object
dtype: object

Let's verify the uniqueness of the `type` column.

In [18]:
types_data['type'].is_unique

True

So the `type` column can be used as the ID (index).

In [19]:
# Re-index the lookup table by its `type` column; the name is rebound to the
# re-indexed frame, which is then displayed.
types_data = types_data.set_index('type')
types_data

Unnamed: 0_level_0,type_description
type,Unnamed: 1_level_1
8001,Установление расх. лимита по карте
2411,Перевод с карты на счет др.лица в одном тер. б...
4035,н/д(нет данных)
3001,Комиссия за обслуживание ссудного счета
2420,Перевод с карты на счет физ.лица в другом тер....
...,...
4110,Плата за получение наличных в АТМ.
2320,Безналичный перевод денежных средств через POS
7040,Перевод на карту (с карты) через POS (в предел...
2433,Перевод с карты на счет физ.лица в другом банк...


# Data manipulation

Since the `code` and `type` columns in `train_data` are categorical, let's change their dtype accordingly.

In [20]:
# `code` and `type` hold identifiers, not quantities: store them with the
# pandas `category` dtype, then show the resulting column dtypes.
for categorical_column in ('code', 'type'):
    train_data[categorical_column] = train_data[categorical_column].astype('category')
train_data.dtypes

client_id       int64
code         category
type         category
amount        float64
target          int64
unknown_0      object
time           object
dtype: object

Let's create dummy columns for `code` and `type` variables

In [21]:
# One-hot encode `code` and `type`.
# Both indicator frames are built first (columns prefixed `code_` / `type_`),
# then the two source columns are removed and the indicators are appended,
# `code_*` before `type_*`. The drop returns a new frame rather than mutating
# in place, so `train_data` is rebound exactly once.
code_dummies = pd.get_dummies(train_data['code'], prefix='code')
type_dummies = pd.get_dummies(train_data['type'], prefix='type')
train_data = (
    train_data
    .drop(columns=['code', 'type'])
    .join(code_dummies)
    .join(type_dummies)
)
train_data.head()

Unnamed: 0,client_id,amount,target,unknown_0,time,code_742,code_3000,code_4111,code_4112,code_4121,...,type_7015,type_7020,type_7021,type_7030,type_7031,type_7034,type_7070,type_7071,type_7074,type_7075
0,16004843,-7928.08,0,245,22:10:18,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20873616,-56.15,0,433,07:44:21,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,65231233,-4689.47,1,164,15:58:23,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,29536165,-3703.52,1,413,11:47:51,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,66473821,67377.47,1,280,05:17:26,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
