# Importing data

In [3]:
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency as chi_test
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from keras.models import Sequential
from xgboost import XGBClassifier
from keras import backend as K
from keras.layers import Dense
import pandas as pd
import numpy as np
# import pycm

import warnings
warnings.filterwarnings("ignore")

# Importing data

In [2]:
data = pd.read_excel(r"D:\Academics\CMI\Internship\Fortiate\DST-1_Dimensionality\Data\data_with_target.xlsx")

# Preprocessing

### Header to code mapper

In [4]:
map_head_code = dict(list(zip(list(data.loc[0, :]), list(data.columns))))

In [5]:
data.head(2)

Unnamed: 0,Card-Wallet-Token ID,PROCESSING_CODE,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,Transaction Date and Time,CONVERSION_RATE,Expiry Date,CONVERSION_RATE_DATE,Merchant Category Code,...,Interface ID,Terminal Type,Token Data-C1,Additional Payment Info,Ecommerce Data,ECI Indicator,Remaining Balance,Private Data - Additional Info,Token Data,TARGET
0,DE2,DE3,DE4,DE5,DE6,DE7,DE9,DE14,DE16,DE18,...,DE4801,DE4801,DE4803,DE4804,DE4805,DE4806,DE4807,DE4808,DE4809,DE4810
1,6086941819072010,174000,7000,7000,7000,22-MAR-18 09.10.37,0,01-JUL-23 00.00.00,,6011,...,IN1,ATM,0917,,,,35000,,,0


### Removing 1st row, Code

In [6]:
data = data.drop(0)

### Removing not required colums

In [7]:
not_required = ["CONVERSION_RATE_DATE", "Private Data 48", "Transaction Category Code", "DE48 Data Length", "Additional Payment Info", "Ecommerce Data", "ECI Indicator", "Private Data - Additional Info", "Token Data", "POS entry Mode", "Remaining Balance", "Token Data-C1"]

In [8]:
data = data.drop(not_required, axis=1)
data = data.dropna()

In [9]:
data.head(2)

Unnamed: 0,Card-Wallet-Token ID,PROCESSING_CODE,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,Transaction Date and Time,CONVERSION_RATE,Expiry Date,Merchant Category Code,ACQUIRING_COUNTRY_CODE,...,TID,MID,MERCHANT NAME AND ADDRESS,TRANSACTION_CURRENCY,SETTLEMENT_CURRENCY,BILLING_CURRENCY,POS_DATA,Interface ID,Terminal Type,TARGET
1,6086941819072010,174000,7000,7000,7000,22-MAR-18 09.10.37,0,01-JUL-23 00.00.00,6011,356,...,GYACBB04,1005044921,LXFC BTNK LTD.KHURDT OR IN,356,356,356,210201210141,IN1,ATM,0
2,6086941026163000,174000,2000,2000,2000,22-MAR-18 12.28.15,0,01-DEC-22 00.00.00,6011,356,...,JSEWLK06,1005044921,LXFC BTNK LTD.LUCKNOWUP IN,356,356,356,210201210141,IN1,ATM,0


### Storing target

In [10]:
target = data.TARGET

In [11]:
print(len(data))
print(len(target[target == 1]))

42793
118


In [12]:
data.head(2)

Unnamed: 0,Card-Wallet-Token ID,PROCESSING_CODE,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,Transaction Date and Time,CONVERSION_RATE,Expiry Date,Merchant Category Code,ACQUIRING_COUNTRY_CODE,...,TID,MID,MERCHANT NAME AND ADDRESS,TRANSACTION_CURRENCY,SETTLEMENT_CURRENCY,BILLING_CURRENCY,POS_DATA,Interface ID,Terminal Type,TARGET
1,6086941819072010,174000,7000,7000,7000,22-MAR-18 09.10.37,0,01-JUL-23 00.00.00,6011,356,...,GYACBB04,1005044921,LXFC BTNK LTD.KHURDT OR IN,356,356,356,210201210141,IN1,ATM,0
2,6086941026163000,174000,2000,2000,2000,22-MAR-18 12.28.15,0,01-DEC-22 00.00.00,6011,356,...,JSEWLK06,1005044921,LXFC BTNK LTD.LUCKNOWUP IN,356,356,356,210201210141,IN1,ATM,0


### Seperating numerical and catagorical data

In [13]:
numerical = ["TRANSACTION_AMOUNT", "SETTLEMENT_AMOUNT", "BILLING_AMOUNT", "CONVERSION_RATE"]

In [14]:
data_cat = data.drop(numerical, axis = 1)

In [15]:
data_cat.head(2)

Unnamed: 0,Card-Wallet-Token ID,PROCESSING_CODE,Transaction Date and Time,Expiry Date,Merchant Category Code,ACQUIRING_COUNTRY_CODE,ACQUIRER_INSTITUTION_CODE,Authorization Code,Approval Decision,Card Service Code,TID,MID,MERCHANT NAME AND ADDRESS,TRANSACTION_CURRENCY,SETTLEMENT_CURRENCY,BILLING_CURRENCY,POS_DATA,Interface ID,Terminal Type,TARGET
1,6086941819072010,174000,22-MAR-18 09.10.37,01-JUL-23 00.00.00,6011,356,544921,533038,0,201,GYACBB04,1005044921,LXFC BTNK LTD.KHURDT OR IN,356,356,356,210201210141,IN1,ATM,0
2,6086941026163000,174000,22-MAR-18 12.28.15,01-DEC-22 00.00.00,6011,356,544921,944896,0,201,JSEWLK06,1005044921,LXFC BTNK LTD.LUCKNOWUP IN,356,356,356,210201210141,IN1,ATM,0


In [16]:
data_num = data[numerical]

In [17]:
data_num.head(2)

Unnamed: 0,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,CONVERSION_RATE
1,7000,7000,7000,0
2,2000,2000,2000,0


# Finding dependent catagories

In [18]:
cat_columns = ['PROCESSING_CODE',
     'Merchant Category Code',
     'ACQUIRING_COUNTRY_CODE',
     'ACQUIRER_INSTITUTION_CODE',
     'Card Service Code',
     'Interface ID',
     'Terminal Type']

### Cramer's V test

In [19]:
def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher, 
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = chi_test(confusion_matrix)[0]
    #print(chi2)
    n = (confusion_matrix.sum()).sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

In [20]:
count = 0
dependent_pair_cramer = []
for i in cat_columns:
    count += 1
    for j in cat_columns[count:]:
        data_crosstab =  pd.crosstab(data_cat[i], data_cat[j], margins = False)
        p = cramers_corrected_stat(data_crosstab)
        dependent_pair_cramer.append([i,j,p])

In [21]:
dependent_pair_cramer[:5]

[['PROCESSING_CODE', 'Merchant Category Code', 0.3934099833645813],
 ['PROCESSING_CODE', 'ACQUIRING_COUNTRY_CODE', 0.12731015328177156],
 ['PROCESSING_CODE', 'ACQUIRER_INSTITUTION_CODE', 0.7405815722504815],
 ['PROCESSING_CODE', 'Card Service Code', 0.5385232320928783],
 ['PROCESSING_CODE', 'Interface ID', 0.8226495633029258]]

#### Fixing threshold, Choosing pair having correlation more than threshold

In [22]:
thresold = 0.8

In [23]:
# valid_pair = list(map(lambda x: (x[0], x[1]) if (x[2] > thresold) else None , dependent_pair_cramer))
# valid_pair = [i for i in valid_pair if i] 

In [24]:
valid_pair = {}
for pairs in dependent_pair_cramer:
    if pairs[2] >= thresold:
        try:
            valid_pair[pairs[0]].append(pairs[1])
        except:
            valid_pair[pairs[0]] = [pairs[1]]

In [25]:
valid_pair

{'PROCESSING_CODE': ['Interface ID', 'Terminal Type'],
 'ACQUIRING_COUNTRY_CODE': ['ACQUIRER_INSTITUTION_CODE'],
 'ACQUIRER_INSTITUTION_CODE': ['Interface ID', 'Terminal Type'],
 'Card Service Code': ['Interface ID']}

#### Choosing sufficient (minimum required) predictor

method 1, (valid_pair as list)

In [26]:
# final_cat = cat_columns.copy()
# temp = [item for sublist in valid_pair for item in sublist]

# count = {}
# for i in temp:
#     try:
#         count[i] += 1
#     except:
#         count[i] = 0
        
# # count_list = list(count.items())
# # count_list.sort(key = lambda x: x[1], reverse = True)

# count = {k: v for k, v in sorted(count.items(), key=lambda item: item[1], reverse = True)}

Method 2, (valid pair as dictionary)

In [27]:
final_cat = cat_columns.copy()

for key in valid_pair:
    if key not in final_cat:
        final_cat.append(key)
    for value in valid_pair[key]:
        try:
            final_cat.remove(value)
        except:
            continue

In [28]:
final_cat

['PROCESSING_CODE',
 'Merchant Category Code',
 'ACQUIRING_COUNTRY_CODE',
 'Card Service Code',
 'ACQUIRER_INSTITUTION_CODE']

# Onehot encoding of predictors

In [35]:
# final_cat = ['PROCESSING_CODE',
#      'Merchant Category Code',
#      'ACQUIRING_COUNTRY_CODE',
#      'ACQUIRER_INSTITUTION_CODE',
#      'Card Service Code',
#      'Interface ID',
#      'Terminal Type']

In [55]:
final_cat = ['PROCESSING_CODE',
     'Merchant Category Code',
     'ACQUIRING_COUNTRY_CODE',
     'ACQUIRER_INSTITUTION_CODE',
     'Card Service Code']

In [56]:
new_data_1 = pd.DataFrame()
encoder = OneHotEncoder(sparse=False)
for col in final_cat:
    data_temp = data_cat[[col]]
    encoder.fit(data_temp.astype('str'))
    temp = encoder.transform(data_temp.astype('str'))
    temp = pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col].value_counts().index])
    temp = temp.set_index(data_cat.index.values)
    new_data_1 = pd.concat([new_data_1,temp],axis=1)

In [57]:
new_data_1.head(2)

Unnamed: 0,PROCESSING_CODE_000000,PROCESSING_CODE_003000,PROCESSING_CODE_004000,PROCESSING_CODE_173000,PROCESSING_CODE_171000,PROCESSING_CODE_000098,PROCESSING_CODE_172000,PROCESSING_CODE_174000,PROCESSING_CODE_170000,PROCESSING_CODE_000099,...,ACQUIRER_INSTITUTION_CODE_463858,ACQUIRER_INSTITUTION_CODE_493500,ACQUIRER_INSTITUTION_CODE_465882,Card Service Code_226,Card Service Code_???,Card Service Code_206,Card Service Code_201,Card Service Code_101,Card Service Code_126,Card Service Code_000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [58]:
input_data = pd.concat([data_num, new_data_1], axis=1)

In [59]:
input_data.head(2)

Unnamed: 0,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,CONVERSION_RATE,PROCESSING_CODE_000000,PROCESSING_CODE_003000,PROCESSING_CODE_004000,PROCESSING_CODE_173000,PROCESSING_CODE_171000,PROCESSING_CODE_000098,...,ACQUIRER_INSTITUTION_CODE_463858,ACQUIRER_INSTITUTION_CODE_493500,ACQUIRER_INSTITUTION_CODE_465882,Card Service Code_226,Card Service Code_???,Card Service Code_206,Card Service Code_201,Card Service Code_101,Card Service Code_126,Card Service Code_000
1,7000,7000,7000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2000,2000,2000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [61]:
len(input_data.columns)

720

In [62]:
ratio = float(len(target[target == 1]))/float(len(target[target == 0]))
print(ratio)

0.0027650849443468075


# Train_test split

In [64]:
input_data.TRANSACTION_AMOUNT = (input_data.TRANSACTION_AMOUNT).astype("float32")
input_data.SETTLEMENT_AMOUNT = (input_data.SETTLEMENT_AMOUNT).astype("float32")
input_data.BILLING_AMOUNT = (input_data.BILLING_AMOUNT).astype("float32")
input_data.CONVERSION_RATE = (input_data.CONVERSION_RATE).astype("float32")

In [65]:
X_train, X_test, Y_train, Y_test = train_test_split(input_data, target, test_size = 0.2, random_state = 42)

# Model 1 - Random forest

In [66]:
Y_test = Y_test.astype('float64') 
Y_train = Y_train.astype('float64')
Y_test.dtype

dtype('float64')

In [67]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [68]:
y_pred_rf = model_rf.predict(X_test)

### Scores

##### Accuracy

In [69]:
accuracy_score(Y_test,y_pred_rf)

0.9982474588152822

##### Precision

In [70]:
precision_score(Y_test,y_pred_rf)

0.8571428571428571

##### Recall

In [71]:
recall_score(Y_test,y_pred_rf)

0.6

# Model 2 - Neural Network

### Defining functions

In [73]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

In [74]:
def fit_model(train_X, train_Y):
    length = len(list(X_train.columns))

    model = Sequential()
    model.add(Dense(128, input_dim = length, activation='relu'))
    model.add(Dense(8, activation='relu'))
#     model.add(Dense(1, activation='softmax'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = [recall_m, "acc"])
    model.fit(train_X, train_Y, epochs = 10, batch_size = 512)
    
    return(model)

In [75]:
def predict(model, test_X):
    return(model.predict(test_X))

In [76]:
def score(predicted, test_Y):
    acc = accuracy_score(test_Y, predicted)
    recall = recall_score(test_Y, predicted)
    prec = precision_score(test_Y, predicted)
    
    print("Accuracy:: {} \nRecall :: {} \nPrecision:: {}".format(acc, recall, prec))
    
    return(acc, recall, prec)

### Model and score

In [78]:
model = fit_model(X_train, Y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [79]:
predicted = predict(model, X_test)
acc, recl, prec = score(predicted.flatten().round(), Y_test)

Accuracy:: 0.9967285897885267 
Recall :: 0.06666666666666667 
Precision:: 1.0


# Model 3 - AdaBoostClassifier

In [80]:
model_abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)
model_abc.fit(X_train, Y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=50, random_state=None)

In [107]:
predicted = predict(model_abc, X_test)
acc, recl, prec = score(predicted.flatten().round(), Y_test)

Accuracy:: 0.972193013202477 
Recall :: 0.8666666666666667 
Precision:: 0.1


# Model 4 - XGBoost

In [108]:
model_xgb = XGBClassifier()
model_xgb.fit(X_train, Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [83]:
predicted = predict(model_xgb, X_test)
acc, recl, prec = score(predicted.flatten().round(), Y_test)

Accuracy:: 0.9974296062624138 
Recall :: 0.26666666666666666 
Precision:: 1.0


# Use of over sampling

In [84]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state = 0)
X_resampled, y_resampled = ros.fit_resample(X_train, Y_train)

In [99]:
len(y_resampled[y_resampled == 0])

34146

In [100]:
len(y_resampled[y_resampled == 1])

34146

## Random forest

In [101]:
model_rf = RandomForestClassifier()
model_rf.fit(X_resampled, y_resampled)

y_pred_rf = model_rf.predict(X_test)

acc, recl, prec = score(y_pred_rf, Y_test)

Accuracy:: 0.9976632784203762 
Recall :: 0.6 
Precision:: 0.6923076923076923


## NN

In [104]:
model_nn = fit_model(X_resampled, y_resampled)

predicted = predict(model_nn, X_test)
acc, recl, prec = score(predicted.flatten().round(), Y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy:: 0.865521673092651 
Recall :: 1.0 
Precision:: 0.02540220152413209


## Adaboost

In [105]:
model_abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)
model_abc.fit(X_resampled, y_resampled)

predicted = predict(model_abc, X_test)
acc, recl, prec = score(predicted.flatten().round(), Y_test)

Accuracy:: 0.972193013202477 
Recall :: 0.8666666666666667 
Precision:: 0.1


## XGboost

In [106]:
model_xgb = XGBClassifier()
model_xgb.fit(X_resampled, y_resampled)

predicted = predict(model_xgb, X_test)
acc, recl, prec = score(predicted.flatten().round(), Y_test)

Accuracy:: 0.9726603575184016 
Recall :: 0.8333333333333334 
Precision:: 0.0984251968503937


# Rough