In [1]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import seaborn as sns

In [2]:
import tensorflow as tf

from tensorflow import keras
from keras import layers, models

# Print the installed TensorFlow version so the run environment is recorded in the output.
print(tf.__version__)

2.8.2


In [3]:
# Load the categories table; the CSV's own header row is replaced by the names below.
column_names_categories = ['code', 'parent-code', 'name']
categories_data = pd.read_csv(
    'categories.csv',
    names=column_names_categories,
    header=0,
    sep=',',
    na_values='?',
    comment='\t',
    skipinitialspace=True,
)
categories_data.head()

Unnamed: 0,code,parent-code,name
0,0,A,Other
1,1,B,Other
2,2,B,Auto Insurance
3,3,B,Auto Leasing
4,4,B,Gas & Fuel


In [4]:
# Load the MCC code table; the CSV's own header row is replaced by the names below.
column_names_mmc = ['code', 'merchant-type']
mcc_data = pd.read_csv(
    'mmc_codes.csv',
    names=column_names_mmc,
    header=0,
    sep=',',
    na_values='?',
    comment='\t',
    skipinitialspace=True,
)
mcc_data.head()

Unnamed: 0,code,merchant-type
0,4814,Telecommunication service including local and ...
1,4815,VisaPhone
2,4821,Telegraph services
3,4829,Money Orders - Wire Transfer
4,4899,Cable and other pay television (previously Cab...


In [5]:
def _append_one_hot(dataset, column, missing_columns):
  """Replace `column` with its one-hot encoding plus all-zero padding columns.

  The dummy columns produced by `pd.get_dummies` are appended first, then one
  zero-filled column per name in `missing_columns` (in the given order), and
  finally the source column is removed.

  Args:
    dataset: DataFrame containing `column`.
    column: name of the categorical column to encode.
    missing_columns: ordered list of column names to add as all-zero columns.

  Returns:
    A new DataFrame; the input frame is not modified.
  """
  frames = [dataset, pd.get_dummies(dataset[column], prefix='', prefix_sep='')]
  if missing_columns:
    # Build the padding on the dataset's own index so the inner join below can
    # never drop rows, whatever index read_csv produced.
    frames.append(pd.DataFrame(0, index=dataset.index, columns=list(missing_columns)))
  encoded = pd.concat(frames, axis=1, join='inner')
  del encoded[column]
  return encoded


def preproccess_data(data_url, labels_url = None):
  """Read a transactions CSV and turn it into a numeric feature frame.

  Steps, in order:
    * `amount` is stripped of thousands separators and converted to float;
    * `date` (split on "/" into month, day, year) becomes one-hot day columns
      ("1d".."31d") and month columns ("1m".."12m"); the year is discarded;
    * `kind` (NaN replaced by 'U') is one-hot encoded and padded with the
      kinds listed below that do not occur in the data;
    * `direction` is one-hot encoded as-is;
    * `mcc` (NaN replaced by 0) is one-hot encoded and padded with every code
      from the notebook-level `mcc_data['code']` that does not occur in the data;
    * `description`, `beneficiary-name`, `currency` and `id` are dropped.

  Args:
    data_url: path/URL of the transactions CSV. When `labels_url` is None the
      file is read with an additional trailing `label` column.
    labels_url: optional path/URL of a CSV with `id` and `label` columns; when
      given, its `label` column is attached to the result.

  Returns:
    DataFrame with `amount`, the one-hot feature columns and `label`.

  NOTE(review): values that are not part of the fixed vocabularies ('U' for
  kind, 0 for mcc, any direction value) only get a column when they occur in
  the data, so the output width can still differ between files - confirm
  against the data the model was trained on.
  """
  column_names_data = ['id', 'beneficiary-name', 'date', 'direction', 'amount', 'description', 'currency', 'mcc', 'kind']
  if labels_url is None:
    column_names_data = column_names_data + ['label']

  dataset = pd.read_csv(data_url, names = column_names_data, na_values = '?', comment='\t', sep=',', skipinitialspace=True, header=0)

  # amount: go through str so the conversion also works when pandas already
  # parsed the column as numeric (no thousands separators present).
  dataset['amount'] = dataset['amount'].astype(str).str.replace(',', '', regex=False).astype(float)

  # date => day, month (year is not used as a feature)
  days = [str(i) + "d" for i in range(1, 32)]
  months = [str(i) + "m" for i in range(1, 13)]

  dataset[["month", "day", "year"]] = dataset["date"].str.split("/", expand=True)
  dataset = dataset.drop(columns=['date', 'year'])

  # Suffixes keep day and month column names distinct from each other and
  # from the numeric mcc column names.
  dataset['day'] = dataset['day'] + "d"
  dataset['month'] = dataset['month'] + "m"

  # Padding columns follow the vocabulary order, so the layout is the same on
  # every run (iterating a set of strings is not reproducible across processes).
  present_days = set(dataset['day'].unique())
  dataset = _append_one_hot(dataset, 'day', [d for d in days if d not in present_days])

  present_months = set(dataset['month'].unique())
  dataset = _append_one_hot(dataset, 'month', [m for m in months if m not in present_months])

  # kind
  kinds = ['dep', 'wdw', 'pmt', 'fee', 'inc', 'rev', 'adj', 'lnd', 'lnr', 'fcx', 'aop', 'acl', 'spl', 'sal']
  dataset['kind'] = dataset['kind'].fillna('U')
  present_kinds = set(dataset['kind'].unique())
  dataset = _append_one_hot(dataset, 'kind', [k for k in kinds if k not in present_kinds])

  # direction: no fixed vocabulary, encode whatever values occur
  dataset = _append_one_hot(dataset, 'direction', [])

  # mcc: fill nan values & convert to int
  dataset['mcc'] = dataset['mcc'].fillna('0')
  dataset = dataset.astype({"mcc" : int})

  # Work on a converted copy instead of writing back into the shared mcc_data.
  known_codes = mcc_data['code'].astype(int)
  # The set-difference iteration order is kept on purpose: it fixes the order
  # of the padded mcc columns, which any model evaluated on this frame sees.
  missing_columns_mcc = [str(code) for code in set(known_codes) - set(dataset['mcc'].unique())]
  dataset = _append_one_hot(dataset, 'mcc', missing_columns_mcc)

  # columns that are not used as features
  dataset = dataset.drop(columns=['description', 'beneficiary-name', 'currency', 'id'])

  if labels_url is not None:
    column_names_labels = ['id', 'label']
    dataset_labels = pd.read_csv(labels_url, names = column_names_labels, na_values = '?', comment='\t', sep = ',' , skipinitialspace=True, header=0)
    # add label column (aligned on the row index)
    dataset['label'] = dataset_labels['label']

  return dataset

In [6]:
# Transactions CSV; since no labels_url is passed below, the 'label' column is read from this same file.
url = "train_with_labels.csv"

# Run the preprocessing pipeline and preview the resulting feature frame.
dataset = preproccess_data(url)
dataset.head()

Unnamed: 0,amount,label,10d,11d,12d,13d,14d,15d,16d,17d,...,9700,9701,5094,9702,5099,5611,5621,5111,8699,5631
0,187.2,17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,44.3,39,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,17.0,39,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,300.1,64,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,35.1,67,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
def test_model_accuracy(model , processed_dataset):
  """Evaluate `model` on an already preprocessed, labelled dataset.

  Args:
    model: Keras model exposing `evaluate`.
      NOTE(review): unpacking the result into two values assumes the model was
      compiled with exactly one metric - confirm.
    processed_dataset: DataFrame with a 'label' column; all other columns are
      fed to the model as features.

  Returns:
    Tuple `(test_loss, test_acc)` as returned by `model.evaluate`.
  """
  # Number of classes is derived from the notebook-level categories table.
  num_classes = categories_data['code'].max() + 1

  # Use the frame that was passed in, not the notebook-level `dataset`.
  X_test = processed_dataset.drop('label', axis=1)
  y_test = processed_dataset[['label']]

  # One-hot encode the integer labels to num_classes columns.
  test_labels_cat = tf.keras.utils.to_categorical(y_test, num_classes)

  test_loss, test_acc = model.evaluate(X_test, test_labels_cat, verbose=2)
  return test_loss, test_acc


In [8]:
def load_model(url):
  """Load a saved Keras model from `url` and return it."""
  return tf.keras.models.load_model(url)

In [9]:
# Location of the saved Keras model file.
# NOTE(review): "/content/..." looks like a Colab-specific absolute path - confirm before running elsewhere.
url = "/content/categories_network.h5"
model = load_model(url)

# Evaluate the loaded model; `dataset` is the preprocessed frame built in an earlier cell.
loss, accuracy = test_model_accuracy(model , dataset)

41/41 - 0s - loss: 0.2394 - accuracy: 0.9256 - 473ms/epoch - 12ms/step
