# Importing data

In [35]:
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency as chi_test
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
import pandas as pd
import numpy as np
import pycm

import warnings
warnings.filterwarnings("ignore")

# Importing data

In [2]:
data = pd.read_excel(r"D:\Academics\CMI\Internship\Fortiate\DST-1_Dimensionality\Data\data_with_target.xlsx")

# Preprocessing

### Header to code mapper

In [3]:
map_head_code = dict(list(zip(list(data.loc[0, :]), list(data.columns))))

In [4]:
data.head(2)

Unnamed: 0,Card-Wallet-Token ID,PROCESSING_CODE,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,Transaction Date and Time,CONVERSION_RATE,Expiry Date,CONVERSION_RATE_DATE,Merchant Category Code,...,Interface ID,Terminal Type,Token Data-C1,Additional Payment Info,Ecommerce Data,ECI Indicator,Remaining Balance,Private Data - Additional Info,Token Data,TARGET
0,DE2,DE3,DE4,DE5,DE6,DE7,DE9,DE14,DE16,DE18,...,DE4801,DE4801,DE4803,DE4804,DE4805,DE4806,DE4807,DE4808,DE4809,DE4810
1,6086941819072010,174000,7000,7000,7000,22-MAR-18 09.10.37,0,01-JUL-23 00.00.00,,6011,...,IN1,ATM,0917,,,,35000,,,0


### Removing 1st row, Code

In [5]:
data = data.drop(0)

### Removing not required colums

In [6]:
not_required = ["CONVERSION_RATE_DATE", "Private Data 48", "Transaction Category Code", "DE48 Data Length", "Additional Payment Info", "Ecommerce Data", "ECI Indicator", "Private Data - Additional Info", "Token Data", "POS entry Mode", "Remaining Balance", "Token Data-C1"]

In [7]:
data = data.drop(not_required, axis=1)
data = data.dropna()

In [8]:
data.head(2)

Unnamed: 0,Card-Wallet-Token ID,PROCESSING_CODE,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,Transaction Date and Time,CONVERSION_RATE,Expiry Date,Merchant Category Code,ACQUIRING_COUNTRY_CODE,...,TID,MID,MERCHANT NAME AND ADDRESS,TRANSACTION_CURRENCY,SETTLEMENT_CURRENCY,BILLING_CURRENCY,POS_DATA,Interface ID,Terminal Type,TARGET
1,6086941819072010,174000,7000,7000,7000,22-MAR-18 09.10.37,0,01-JUL-23 00.00.00,6011,356,...,GYACBB04,1005044921,LXFC BTNK LTD.KHURDT OR IN,356,356,356,210201210141,IN1,ATM,0
2,6086941026163000,174000,2000,2000,2000,22-MAR-18 12.28.15,0,01-DEC-22 00.00.00,6011,356,...,JSEWLK06,1005044921,LXFC BTNK LTD.LUCKNOWUP IN,356,356,356,210201210141,IN1,ATM,0


### Storing target

In [9]:
target = data.TARGET

In [10]:
print(len(data))
print(len(target[target == 1]))

42793
118


In [11]:
data.head(2)

Unnamed: 0,Card-Wallet-Token ID,PROCESSING_CODE,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,Transaction Date and Time,CONVERSION_RATE,Expiry Date,Merchant Category Code,ACQUIRING_COUNTRY_CODE,...,TID,MID,MERCHANT NAME AND ADDRESS,TRANSACTION_CURRENCY,SETTLEMENT_CURRENCY,BILLING_CURRENCY,POS_DATA,Interface ID,Terminal Type,TARGET
1,6086941819072010,174000,7000,7000,7000,22-MAR-18 09.10.37,0,01-JUL-23 00.00.00,6011,356,...,GYACBB04,1005044921,LXFC BTNK LTD.KHURDT OR IN,356,356,356,210201210141,IN1,ATM,0
2,6086941026163000,174000,2000,2000,2000,22-MAR-18 12.28.15,0,01-DEC-22 00.00.00,6011,356,...,JSEWLK06,1005044921,LXFC BTNK LTD.LUCKNOWUP IN,356,356,356,210201210141,IN1,ATM,0


### Seperating numerical and catagorical data

In [12]:
numerical = ["TRANSACTION_AMOUNT", "SETTLEMENT_AMOUNT", "BILLING_AMOUNT", "CONVERSION_RATE"]

In [13]:
data_cat = data.drop(numerical, axis = 1)

In [14]:
data_cat.head(2)

Unnamed: 0,Card-Wallet-Token ID,PROCESSING_CODE,Transaction Date and Time,Expiry Date,Merchant Category Code,ACQUIRING_COUNTRY_CODE,ACQUIRER_INSTITUTION_CODE,Authorization Code,Approval Decision,Card Service Code,TID,MID,MERCHANT NAME AND ADDRESS,TRANSACTION_CURRENCY,SETTLEMENT_CURRENCY,BILLING_CURRENCY,POS_DATA,Interface ID,Terminal Type,TARGET
1,6086941819072010,174000,22-MAR-18 09.10.37,01-JUL-23 00.00.00,6011,356,544921,533038,0,201,GYACBB04,1005044921,LXFC BTNK LTD.KHURDT OR IN,356,356,356,210201210141,IN1,ATM,0
2,6086941026163000,174000,22-MAR-18 12.28.15,01-DEC-22 00.00.00,6011,356,544921,944896,0,201,JSEWLK06,1005044921,LXFC BTNK LTD.LUCKNOWUP IN,356,356,356,210201210141,IN1,ATM,0


In [15]:
data_num = data[numerical]

In [16]:
data_num.head(2)

Unnamed: 0,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,CONVERSION_RATE
1,7000,7000,7000,0
2,2000,2000,2000,0


# Finding dependent catagories

In [12]:
cat_columns = ['PROCESSING_CODE',
     'Merchant Category Code',
     'ACQUIRING_COUNTRY_CODE',
     'ACQUIRER_INSTITUTION_CODE',
     'Card Service Code',
     'Interface ID',
     'Terminal Type',
     'Token Data-C1']

### Cramer's V test

In [13]:
def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher, 
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = chi_test(confusion_matrix)[0]
    #print(chi2)
    n = (confusion_matrix.sum()).sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

In [14]:
count = 0
dependent_pair_cramer = []
for i in cat_columns:
    count += 1
    for j in cat_columns[count:]:
        data_crosstab =  pd.crosstab(data_cat[i], data_cat[j], margins = False)
        p = cramers_corrected_stat(data_crosstab)
        #if p < 0.01:
        dependent_pair_cramer.append([i,j,p])

In [22]:
thresold = 0.8

In [65]:
valid_pair = list(map(lambda x: (x[0], x[1]) if (x[2] > thresold) else None , dependent_pair_cramer))
valid_pair = [i for i in valid_pair if i] 

In [76]:
temp = [item for sublist in valid_pair for item in sublist]

In [80]:
valid_pair

[('PROCESSING_CODE', 'Interface ID'),
 ('PROCESSING_CODE', 'Terminal Type'),
 ('ACQUIRING_COUNTRY_CODE', 'ACQUIRER_INSTITUTION_CODE'),
 ('ACQUIRER_INSTITUTION_CODE', 'Interface ID'),
 ('ACQUIRER_INSTITUTION_CODE', 'Terminal Type'),
 ('Card Service Code', 'Interface ID')]

In [87]:
count = {}
for i in temp:
    try:
        count[i] += 1
    except:
        count[i] = 1
        
count_list = list(count.items())
count_list.sort(key = lambda x: x[1], reverse = True)

In [18]:
final_cat = ['PROCESSING_CODE',
     'Merchant Category Code',
     'ACQUIRING_COUNTRY_CODE',
     'ACQUIRER_INSTITUTION_CODE',
     'Card Service Code',
     'Interface ID',
     'Terminal Type']

# Onehot encoding of predictors

In [20]:
new_data_1 = pd.DataFrame()
encoder=OneHotEncoder(sparse=False)
for col in final_cat:
    data_temp = data_cat[[col]]
    encoder.fit(data_temp)
    temp = encoder.transform(data_cat[[col]])
    temp = pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col].value_counts().index])
    temp = temp.set_index(data_cat.index.values)
    new_data_1 = pd.concat([new_data_1,temp],axis=1)

In [21]:
new_data_1.head(2)

Unnamed: 0,PROCESSING_CODE_000000,PROCESSING_CODE_003000,PROCESSING_CODE_004000,PROCESSING_CODE_173000,PROCESSING_CODE_171000,PROCESSING_CODE_000098,PROCESSING_CODE_172000,PROCESSING_CODE_174000,PROCESSING_CODE_170000,PROCESSING_CODE_000099,...,Card Service Code_101,Card Service Code_126,Card Service Code_000,Interface ID_IN2,Interface ID_IN9,Interface ID_IN1,Interface ID_IN4,Terminal Type_POS,Terminal Type_ATM,Terminal Type_APP
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
input_data = pd.concat([data_num, new_data_1], axis=1)

In [23]:
input_data.head(2)

Unnamed: 0,TRANSACTION_AMOUNT,SETTLEMENT_AMOUNT,BILLING_AMOUNT,CONVERSION_RATE,PROCESSING_CODE_000000,PROCESSING_CODE_003000,PROCESSING_CODE_004000,PROCESSING_CODE_173000,PROCESSING_CODE_171000,PROCESSING_CODE_000098,...,Card Service Code_101,Card Service Code_126,Card Service Code_000,Interface ID_IN2,Interface ID_IN9,Interface ID_IN1,Interface ID_IN4,Terminal Type_POS,Terminal Type_ATM,Terminal Type_APP
1,7000,7000,7000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2000,2000,2000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [24]:
ratio = float(len(target[target == 1]))/float(len(target[target == 0]))
print(ratio)

0.0027650849443468075


# Train_test split

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(input_data, target, test_size = 0.2, random_state = 42)

# Model 1 with all predictor - Random forest

In [45]:
Y_test = Y_test.astype('float64') 
Y_train = Y_train.astype('float64')
Y_test.dtype

dtype('float64')

In [61]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [62]:
y_pred_rf = model_rf.predict(X_test)

### Scores

##### Accuracy

In [63]:
accuracy_score(Y_test,y_pred_rf)

0.9980137866573198

##### Precision

In [64]:
precision_score(Y_test,y_pred_rf)

0.8421052631578947

##### Recall

In [65]:
recall_score(Y_test,y_pred_rf)

0.5333333333333333

# Rough