In [15]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import seaborn as sns

In [16]:
import tensorflow as tf

from tensorflow import keras
from keras import layers, models

# Print the TensorFlow version in use.
print(tf.__version__)

2.8.2


In [17]:
# Load the categories table; the file's own header row is replaced by the
# column names given here.
column_names_categories = ['code', 'parent-code', 'name']
categories_data = pd.read_csv(
    'categories.csv',
    names=column_names_categories,
    header=0,
    sep=',',
    comment='\t',
    na_values='?',
    skipinitialspace=True,
)
categories_data.head()

Unnamed: 0,code,parent-code,name
0,0,A,Other
1,1,B,Other
2,2,B,Auto Insurance
3,3,B,Auto Leasing
4,4,B,Gas & Fuel


In [18]:
# Load the MCC code table; the file's own header row is replaced by the
# column names given here.
column_names_mmc = ['code', 'merchant-type']
mcc_data = pd.read_csv(
    'mmc_codes.csv',
    names=column_names_mmc,
    header=0,
    sep=',',
    comment='\t',
    na_values='?',
    skipinitialspace=True,
)
mcc_data.head()

Unnamed: 0,code,merchant-type
0,4814,Telecommunication service including local and ...
1,4815,VisaPhone
2,4821,Telegraph services
3,4829,Money Orders - Wire Transfer
4,4899,Cable and other pay television (previously Cab...


In [19]:
def preproccess_data(data_url, labels_url = None):
  """Load a transactions CSV and convert it into a numeric feature frame.

  Steps performed, in order:
    * 'amount' is parsed to float after removing "," thousands separators.
    * 'date' ("day/month/year" text) is split into integer 'day', 'month'
      and 'year' columns.
    * 'kind' (missing values become 'U') and 'direction' are one-hot encoded.
    * 'mcc' (missing values become 0) is one-hot encoded, and an all-zero
      column is appended for every code listed in the module-level
      `mcc_data` frame that does not occur in this dataset.
    * 'description', 'beneficiary-name', 'currency' and 'id' are dropped.

  Args:
    data_url: path, URL or file-like object handed to `pd.read_csv`.
    labels_url: optional path/URL/file-like object of an 'id,label' CSV.
      When given, its 'label' column is appended as `dataset['label']`.

  Returns:
    A pandas DataFrame with the feature columns (and 'label' if requested).

  NOTE(review): the order of the MCC columns depends on which codes occur in
  the input (present codes first, then the remaining codes in set-iteration
  order). It is intentionally left unchanged here; a model trained on one
  file presumably expects exactly this layout -- confirm before evaluating
  on a different file.
  """
  column_names_data = ['id', 'beneficiary-name', 'date', 'direction', 'amount', 'description', 'currency', 'mcc', 'kind']
  column_names_labels = ['id', 'label']

  dataset = pd.read_csv(data_url, names = column_names_data, na_values = '?', comment='\t', sep=',', skipinitialspace=True, header=0)

  # amount: strip thousands separators. Going through str first keeps this
  # working when pandas already parsed the column as numbers (a file without
  # any "," in the amounts), where a per-value str.replace would fail.
  amount_text = dataset['amount'].astype(str).str.replace(",", "", regex=False)
  dataset['amount'] = amount_text.astype(float)

  # date => day, month, year
  dataset[["day", "month", "year"]] = dataset["date"].str.split("/", expand=True)
  del dataset['date']
  dataset = dataset.astype({"day" : int, "month" : int, "year" : int})

  # kind: one-hot encode, with 'U' standing in for missing values.
  # dtype is pinned so the indicator columns stay numeric on every pandas
  # version (newer releases default to bool).
  dataset['kind'] = dataset['kind'].fillna('U')
  kind_dummy = pd.get_dummies(dataset['kind'], prefix='', prefix_sep='', dtype='uint8')
  del dataset['kind']
  dataset = pd.concat([dataset, kind_dummy], axis=1, join='inner')

  # direction: one-hot encode
  direction_dummy = pd.get_dummies(dataset['direction'], prefix='', prefix_sep='', dtype='uint8')
  del dataset['direction']
  dataset = pd.concat([dataset, direction_dummy], axis=1, join='inner')

  # mcc: missing codes become 0, then one-hot encode (column names are the
  # codes as strings because of the empty prefix).
  dataset['mcc'] = dataset['mcc'].fillna(0).astype(int)
  present_codes = dataset['mcc'].unique()
  mcc_dummy = pd.get_dummies(dataset['mcc'], prefix='', prefix_sep='', dtype='uint8')
  dataset = pd.concat([dataset, mcc_dummy], axis=1, join='inner')

  # Add an all-zero column for every known MCC code that is absent from this
  # dataset. The codes are cast on a local copy so the shared `mcc_data`
  # frame is not modified by calling this function.
  known_codes = mcc_data['code'].astype(int)
  missing_columns = [str(code) for code in set(known_codes) - set(present_codes)]
  missing_df = pd.DataFrame(0, index=dataset.index, columns=missing_columns)

  dataset = pd.concat([dataset, missing_df], axis = 1, join='inner')
  del dataset['mcc']

  # Columns that are not used as features.
  del dataset['description']
  del dataset['beneficiary-name']
  del dataset['currency']
  del dataset['id']

  if labels_url is not None:
    dataset_labels = pd.read_csv(labels_url, names = column_names_labels, na_values = '?', comment='\t', sep = ',' , skipinitialspace=True, header=0)
    # Labels are attached by row index, not by 'id' -- this assumes both
    # files list the transactions in the same order.
    dataset['label'] = dataset_labels['label']

  return dataset

In [20]:
# Build the labelled feature frame from the transactions and their labels.
dataset = preproccess_data(
    data_url="transactions.csv",
    labels_url="transactions_labels.csv",
)
dataset.head()

Unnamed: 0,amount,day,month,year,dep,fee,pmt,sal,wdw,c,...,9701,5094,9702,5099,5611,5621,5111,8699,5631,label
0,187.2,1,1,2021,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,17
1,44.3,1,1,2021,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,39
2,17.0,1,1,2021,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,39
3,300.1,1,1,2021,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,64
4,35.1,1,1,2021,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,67


In [21]:
def test_model_accuracy(model_url , processed_dataset):
  """Evaluate a saved Keras model on an already pre-processed dataset.

  Args:
    model_url: location of the saved model, passed to
      `tf.keras.models.load_model`.
    processed_dataset: DataFrame holding the feature columns plus a 'label'
      column with integer class ids.

  Returns:
    The `(test_loss, test_acc)` pair unpacked from `model.evaluate`.
    NOTE(review): this assumes the model was compiled with exactly one
    metric -- confirm against the training notebook.
  """
  model = tf.keras.models.load_model(model_url)
  # One class per category code; reads the module-level `categories_data`.
  num_classes = categories_data['code'].max() + 1

  # Use the frame passed in by the caller. Previously the argument was
  # ignored and the notebook-global `dataset` was read instead.
  X_test = processed_dataset.drop('label', axis=1)
  y_test = processed_dataset[['label']]

  # One-hot encode the integer labels.
  test_labels_cat = tf.keras.utils.to_categorical(y_test, num_classes)

  test_loss, test_acc = model.evaluate(X_test, test_labels_cat, verbose=2)
  return test_loss, test_acc


In [22]:
# Location of the trained network to evaluate.
url = "/content/categories_network.h5"

loss, accuracy = test_model_accuracy(model_url=url, processed_dataset=dataset)

41/41 - 0s - loss: 0.3710 - accuracy: 0.8404 - 292ms/epoch - 7ms/step
