In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from operator import itemgetter
import json

Reading Dataset

In [2]:
# Load the Danish ATM transaction dataset; the path is relative to the
# directory the notebook kernel is running in.
df_transaction=pd.read_csv('atm_data.csv')

In [3]:
# Preview the first ten rows to sanity-check how the CSV was parsed.
df_transaction.head(10)

Unnamed: 0,year,month,day,weekday,hour,atm_status,atm_id,atm_manufacturer,atm_location,atm_streetname,...,temp,pressure,humidity,wind_speed,wind_deg,rain_3h,clouds_all,weather_id,weather_main,weather_description
0,2017,January,1,Sunday,0,Active,1,NCR,Næstved,Farimagsvej,...,281.15,1014,87,7,260,0.215,92,500,Rain,light rain
1,2017,January,1,Sunday,0,Inactive,2,NCR,Vejgaard,Hadsundvej,...,280.64,1020,93,9,250,0.59,92,500,Rain,light rain
2,2017,January,1,Sunday,0,Inactive,2,NCR,Vejgaard,Hadsundvej,...,280.64,1020,93,9,250,0.59,92,500,Rain,light rain
3,2017,January,1,Sunday,0,Inactive,3,NCR,Ikast,Rådhusstrædet,...,281.15,1011,100,6,240,,75,300,Drizzle,light intensity drizzle
4,2017,January,1,Sunday,0,Active,4,NCR,Svogerslev,Brønsager,...,280.61,1014,87,7,260,,88,701,Mist,mist
5,2017,January,1,Sunday,0,Active,5,NCR,Nibe,Torvet,...,280.64,1020,93,9,250,0.59,92,500,Rain,light rain
6,2017,January,1,Sunday,0,Active,6,NCR,Fredericia,Sjællandsgade,...,281.15,1014,93,7,230,0.29,92,500,Rain,light rain
7,2017,January,1,Sunday,0,Active,7,Diebold Nixdorf,Hjallerup,Hjallerup Centret,...,280.64,1020,93,9,250,0.59,92,500,Rain,light rain
8,2017,January,1,Sunday,0,Active,8,NCR,Glyngøre,Færgevej,...,281.15,1011,100,6,240,,75,300,Drizzle,light intensity drizzle
9,2017,January,1,Sunday,0,Active,9,Diebold Nixdorf,Hadsund,Storegade,...,280.64,1020,93,9,250,0.59,92,500,Rain,light rain


In [4]:
# Column dtypes and non-null counts of the raw frame.
df_transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250000 entries, 0 to 1249999
Data columns (total 33 columns):
year                   1250000 non-null int64
month                  1250000 non-null object
day                    1250000 non-null int64
weekday                1250000 non-null object
hour                   1250000 non-null int64
atm_status             1250000 non-null object
atm_id                 1250000 non-null int64
atm_manufacturer       1250000 non-null object
atm_location           1250000 non-null object
atm_streetname         1250000 non-null object
atm_street_number      1250000 non-null int64
atm_zipcode            1250000 non-null int64
atm_lat                1250000 non-null float64
atm_lon                1250000 non-null float64
currency               1250000 non-null object
card_type              1250000 non-null object
service                1250000 non-null object
message_code           4830 non-null float64
message_text           4830 non-null object
wea

Data Pre-processing 

In [5]:
# Count missing values per column to see which features need filling.
df_transaction.isnull().sum()

year                         0
month                        0
day                          0
weekday                      0
hour                         0
atm_status                   0
atm_id                       0
atm_manufacturer             0
atm_location                 0
atm_streetname               0
atm_street_number            0
atm_zipcode                  0
atm_lat                      0
atm_lon                      0
currency                     0
card_type                    0
service                      0
message_code           1245170
message_text           1245170
weather_lat                  0
weather_lon                  0
weather_city_id              0
weather_city_name            0
temp                         0
pressure                     0
humidity                     0
wind_speed                   0
wind_deg                     0
rain_3h                1109488
clouds_all                   0
weather_id                   0
weather_main                 0
weather_

In [6]:
# Handle missing values in one pass with a per-column fill value:
# 0 stands in for a missing message code / message text, and missing rain_3h
# entries receive the mean of the observed rain_3h values.
missing_fill = {
    'message_code': 0,
    'message_text': 0,
    'rain_3h': df_transaction['rain_3h'].mean(),
}
df_transaction = df_transaction.fillna(missing_fill)

In [7]:
# Re-count missing values after filling; every column is expected to report 0.
df_transaction.isnull().sum()

year                   0
month                  0
day                    0
weekday                0
hour                   0
atm_status             0
atm_id                 0
atm_manufacturer       0
atm_location           0
atm_streetname         0
atm_street_number      0
atm_zipcode            0
atm_lat                0
atm_lon                0
currency               0
card_type              0
service                0
message_code           0
message_text           0
weather_lat            0
weather_lon            0
weather_city_id        0
weather_city_name      0
temp                   0
pressure               0
humidity               0
wind_speed             0
wind_deg               0
rain_3h                0
clouds_all             0
weather_id             0
weather_main           0
weather_description    0
dtype: int64

Encoding categorical features as numerical quantities

In [8]:
# --- Hand-written encodings for features with a fixed set of labels ---------

#encode 'month' feature to numerical quantities ranging from 1 to 12
month={'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6,
      'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}

#encode 'weekday' feature to numerical quantities ranging from 1 to 7
weekday={'Sunday':1, 'Monday':2, 'Tuesday':3, 'Wednesday':4, 'Thursday':5, 'Friday':6, 'Saturday':7}

#encode 'atm_status' feature to numerical quantities 0 and 1
atm_status={'Active':1, 'Inactive':0}

#encode 'atm_manufacturer' feature to numerical quantities 0 and 1
atm_manufacturer={'NCR':0, 'Diebold Nixdorf':1}

#encode 'service' feature to numerical quantity 1
service={'Withdrawal':1}


# --- Data-driven encodings: one integer per distinct value ------------------

def _ordinal_codes(frame, column):
    """Return ``{column: {value: code}}`` for the distinct values of a column.

    Codes run from 1 to the number of distinct values and follow the order of
    the column's pandas categories (``astype('category').cat.categories``).
    """
    labels = frame[column].astype('category').cat.categories.tolist()
    return {column: dict(zip(labels, range(1, len(labels) + 1)))}


# Unique-value counts quoted below are the ones recorded by the original notes.
atm_location = _ordinal_codes(df_transaction, 'atm_location')            # 1..105
atm_streetname = _ordinal_codes(df_transaction, 'atm_streetname')        # 1..80
currency = _ordinal_codes(df_transaction, 'currency')                    # 1..4 (DKK, EUR, GBP, USD)
card_type = _ordinal_codes(df_transaction, 'card_type')                  # 1..12
# message_text: missing values were replaced with 0 earlier, so 0 is one of the
# encoded labels here (7 distinct values in total).
message_text = _ordinal_codes(df_transaction, 'message_text')            # 1..7
weather_city_name = _ordinal_codes(df_transaction, 'weather_city_name')  # 1..52
weather_main = _ordinal_codes(df_transaction, 'weather_main')            # 1..9
weather_description = _ordinal_codes(df_transaction, 'weather_description')  # 1..39


# --- Apply every encoding ----------------------------------------------------

encodings = {
    'month': month,
    'weekday': weekday,
    'atm_status': atm_status,
    'atm_manufacturer': atm_manufacturer,
    'service': service,
}
for nested in (atm_location, atm_streetname, currency, card_type, message_text,
               weather_city_name, weather_main, weather_description):
    encodings.update(nested)

for column, mapping in encodings.items():
    # Assign the result back to the frame. Calling
    # df_transaction[column].replace(..., inplace=True) acts on a temporary
    # Series and is not guaranteed to change df_transaction (it does nothing
    # when pandas copy-on-write is active).
    # infer_objects() turns an all-integer object column into a real integer
    # dtype on pandas versions where replace() no longer downcasts.
    df_transaction[column] = df_transaction[column].replace(mapping).infer_objects()

df_transaction['month']=df_transaction['month'].astype('int')

In [9]:
# Check the dtypes again after encoding the categorical columns.
df_transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250000 entries, 0 to 1249999
Data columns (total 33 columns):
year                   1250000 non-null int64
month                  1250000 non-null int64
day                    1250000 non-null int64
weekday                1250000 non-null int64
hour                   1250000 non-null int64
atm_status             1250000 non-null int64
atm_id                 1250000 non-null int64
atm_manufacturer       1250000 non-null int64
atm_location           1250000 non-null int64
atm_streetname         1250000 non-null int64
atm_street_number      1250000 non-null int64
atm_zipcode            1250000 non-null int64
atm_lat                1250000 non-null float64
atm_lon                1250000 non-null float64
currency               1250000 non-null int64
card_type              1250000 non-null int64
service                1250000 non-null int64
message_code           1250000 non-null float64
message_text           1250000 non-null int64
weather

In [10]:
# Preview the first ten rows of the encoded frame.
df_transaction.head(10)

Unnamed: 0,year,month,day,weekday,hour,atm_status,atm_id,atm_manufacturer,atm_location,atm_streetname,...,temp,pressure,humidity,wind_speed,wind_deg,rain_3h,clouds_all,weather_id,weather_main,weather_description
0,2017,1,1,1,0,1,1,0,72,16,...,281.15,1014,87,7,260,0.215,92,500,7,15
1,2017,1,1,1,0,0,2,0,99,23,...,280.64,1020,93,9,250,0.59,92,500,7,15
2,2017,1,1,1,0,0,2,0,99,23,...,280.64,1020,93,9,250,0.59,92,500,7,15
3,2017,1,1,1,0,0,3,0,46,50,...,281.15,1011,100,6,240,0.982272,75,300,3,12
4,2017,1,1,1,0,1,4,0,92,9,...,280.61,1014,87,7,260,0.982272,88,701,6,20
5,2017,1,1,1,0,1,5,0,67,69,...,280.64,1020,93,9,250,0.59,92,500,7,15
6,2017,1,1,1,0,1,6,0,23,54,...,281.15,1014,93,7,230,0.29,92,500,7,15
7,2017,1,1,1,0,1,7,1,36,27,...,280.64,1020,93,9,250,0.59,92,500,7,15
8,2017,1,1,1,0,1,8,0,28,19,...,281.15,1011,100,6,240,0.982272,75,300,3,12
9,2017,1,1,1,0,1,9,1,29,58,...,280.64,1020,93,9,250,0.59,92,500,7,15


In [11]:
# Write the encoded frame to disk; index=False leaves the row index out of the file.
df_transaction.to_csv(r'TransactionData.csv', index=False)

In [11]:
# Draw the reference objects: 1000 distinct rows, sampled without replacement.
# The fixed random_state makes the draw repeatable, so a fresh kernel run
# selects the same reference rows instead of a new random set each time.
df_reference=df_transaction.sample(n=1000, replace=False, random_state=42)

In [12]:
# Preview the sampled reference rows.
df_reference.head()

Unnamed: 0,year,month,day,weekday,hour,atm_status,atm_id,atm_manufacturer,atm_location,atm_streetname,...,temp,pressure,humidity,wind_speed,wind_deg,rain_3h,clouds_all,weather_id,weather_main,weather_description
833844,2017,5,5,6,18,1,53,0,94,70,...,282.361,1037,92,5,347,0.982272,8,800,1,33
1100974,2017,6,12,2,10,0,16,0,83,2,...,287.15,1004,87,6,250,0.982272,75,803,2,2
251248,2017,2,10,6,18,1,6,0,23,54,...,272.43,1029,86,9,80,0.982272,88,804,2,22
780780,2017,4,29,7,12,1,13,0,93,73,...,279.469,1032,100,6,45,0.982272,0,800,1,1
1121704,2017,6,15,5,11,0,94,0,58,52,...,290.72,1015,77,3,230,0.982272,0,800,1,1


In [31]:
# Convert both frames to nested Python lists (one inner list per row).
ls_reference=df_reference.values.tolist()
ls_object=df_transaction.values.tolist()
# Persist the reference rows as JSON.
with open('References_list.json', 'w') as output:
    json.dump(ls_reference, output)

In [32]:
# Distance between an object and a reference object (called the
# Spearman Footrule Distance in this notebook)
def sfd(obj, ref):
    """Return the sum of ``abs(obj[k] - ref[k])`` over every position of ``obj``.

    Parameters
    ----------
    obj : sequence of numbers
        Feature values of the object; its length decides how many positions
        are compared.
    ref : sequence of numbers
        Feature values of the reference. It must be at least as long as
        ``obj``; any extra trailing values are ignored.

    Returns
    -------
    number
        Accumulated absolute difference (``0`` for an empty ``obj``).

    Raises
    ------
    IndexError
        If ``ref`` is shorter than ``obj``.
    """
    total = 0
    for position, value in enumerate(obj):
        total += abs(value - ref[position])
    return total

In [33]:
# Creating Permutation based Indexing list
def pbi(ls_reference, index_obj, obj, n_tilda=10):
    """Record the ``n_tilda`` closest references of one object.

    Every reference in ``ls_reference`` is compared with ``obj`` using
    ``sfd``. The references are ranked by ascending distance, and for each of
    the first ``n_tilda`` an entry ``[index_obj, reference_position, rank]``
    (rank starting at 1) is appended to the module-level ``inverted_list``.

    Parameters
    ----------
    ls_reference : list of sequences
        Reference objects; a reference is identified by its position in this list.
    index_obj : int
        Identifier stored with every entry produced for ``obj``.
    obj : sequence of numbers
        The object to index.
    n_tilda : int, optional
        Number of closest references to keep (default 10, the value that was
        previously hard-coded). Expected to be non-negative. If fewer
        references exist, all of them are kept.

    Returns
    -------
    list
        The module-level ``inverted_list`` itself (not a copy). It must be
        defined before this function is called.
    """
    # Use every reference that was passed in rather than assuming exactly 1000.
    distances = [[ref_id, sfd(obj, ref)] for ref_id, ref in enumerate(ls_reference)]

    # sorted() is stable, so references at equal distance keep list order.
    nearest = sorted(distances, key=itemgetter(1))[:n_tilda]

    # Iterate over what was actually kept, so a short reference list cannot
    # cause an IndexError.
    for rank, (ref_id, _distance) in enumerate(nearest, start=1):
        inverted_list.append([index_obj, ref_id, rank])
    return inverted_list

In [34]:
#creates the Metric inverted list
def MI(inverted_list, ls_reference):
    """Group inverted-list entries by reference into posting lists.

    Parameters
    ----------
    inverted_list : iterable of [object_id, reference_id, rank]
        Entries in the form appended by ``pbi``.
    ls_reference : list
        Reference objects; only its length is used, to decide which reference
        ids (``0 .. len(ls_reference) - 1``) receive a posting list.

    Returns
    -------
    list
        The module-level ``mi`` list (not a copy), extended with one
        ``[reference_id, [[object_id, rank], ...]]`` item per reference in
        ascending reference id. Postings keep the order in which their entries
        occur in ``inverted_list``. ``mi`` must be defined before the call.

    Notes
    -----
    Entries whose reference id has no posting list are skipped. The grouping
    is a single pass over ``inverted_list`` instead of one full scan per
    reference.
    """
    n_refs = len(ls_reference)
    postings = {ref_id: [] for ref_id in range(n_refs)}

    for entry in inverted_list:
        bucket = postings.get(entry[1])
        if bucket is not None:
            bucket.append([entry[0], entry[2]])

    for ref_id in range(n_refs):
        mi.append([ref_id, postings[ref_id]])

    return mi

In [35]:
before=datetime.now()

# pbi() appends to the module-level inverted_list, so start from an empty list
# to keep a re-run of this cell from piling up entries of an earlier run.
inverted_list=[]
for index_obj, obj in enumerate(ls_object):
    pbi(ls_reference, index_obj, obj)

# MI() appends to the module-level mi in the same way.
mi=[]
# Pass inverted_list directly: pbi() returns that very list, and relying on
# the loop's last return value would fail if ls_object were empty.
metric_inverted = MI(inverted_list, ls_reference)

#convert metric inverted list to dictionary
dict_mi=dict(metric_inverted)

#write metric inverted (dictionary)  into json file
# NOTE: JSON object keys are always strings, so the integer reference ids
# used as keys here are stored as strings in the file.
with open('MIF_dict.json', 'w') as output:
    json.dump(dict_mi, output)

#write metric inverted (list)  into json file
with open('MIF_list.json', 'w') as output:
    json.dump(metric_inverted, output)

print('MI-File has been created')
after=datetime.now()
execution_time=after-before
print('Execution Time: ', execution_time)

MI-File has been created
Execution Time:  1:56:55.256205
