In this BONUS point, we start again from the RAW data, because we will need some features that we discarded when saving the processed data.

### Preprocessing

In [None]:
import sys
!{sys.executable} -m pip install sklearn pandas imblearn

In [2]:
import time
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import StandardScaler
from IPython.display import display, HTML

In [3]:
CSV_PATH = r"..\data\data_for_student_case.csv"
data_df = pd.read_csv(CSV_PATH)
print(data_df.shape)

(290382, 17)


In [4]:
# same preprocessing like before
data_df = data_df[data_df['simple_journal'] != 'Refused']
data_df['cvcresponsecode'] = data_df['cvcresponsecode'].apply(lambda cvc: 3 if cvc >= 3 else cvc)
data_df = data_df.loc[~data_df['cardverificationcodesupplied'].isna()]
data_df['label'] = data_df['simple_journal'].apply(lambda label: 1 if label == 'Chargeback' else 0)
data_df = data_df.drop(columns = ['simple_journal'])

# we keep creation_date this time.

conversion_rate = {"AUD": 0.626093,
                    "NZD": 0.591501,
                    "SEK": 0.0935468,
                    "GBP": 1.16536,
                    "MXN": 0.0467946}

data_df['amount'] = data_df[['amount', 'currencycode']].apply(lambda row: row['amount']*conversion_rate[row['currencycode']], axis=1)

for col_name in ['issuercountrycode', 'txvariantcode', 'currencycode', 'shoppercountrycode', 'shopperinteraction', 'accountcode', 'mail_id', 'ip_id', 'card_id']:
    unique_values = data_df[col_name].unique()
    mapped_values = dict(zip(unique_values, range(len(unique_values))))
    data_df[col_name] = data_df[col_name].apply(lambda old_value: mapped_values[old_value])

data_df = data_df.drop(columns = ['currencycode','txid', 'bin'])

In [5]:
data_df['creationdate'] = pd.to_datetime(data_df['creationdate'],format='%Y-%m-%d %H:%M:%S')
data_df['creationdate_no_hour'] = data_df.creationdate.dt.date
data_df['creationmonth'] = data_df.creationdate.dt.month
data_df['creationweekday'] = data_df.creationdate.dt.weekday
data_df['creationhour'] = data_df.creationdate.dt.hour


data_df['bookingdate'] = pd.to_datetime(data_df['bookingdate'],format='%Y-%m-%d %H:%M:%S')

In [6]:
data_df.head()

Unnamed: 0,bookingdate,issuercountrycode,txvariantcode,amount,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id,label,creationdate_no_hour,creationmonth,creationweekday,creationhour
0,2015-11-09 14:26:51,0,0,3032.29008,0,0,True,0,2015-07-01 23:03:11,0,0,0,0,1,2015-07-01,7,2,23
1,2015-11-09 14:27:38,0,0,2101.07754,0,0,True,0,2015-07-02 04:50:55,0,1,1,1,1,2015-07-02,7,3,4
2,2015-11-23 16:34:16,0,0,7014.51054,0,0,True,0,2015-07-02 14:30:28,0,2,2,2,1,2015-07-02,7,3,14
3,2015-11-23 16:34:51,0,0,5142.72654,0,0,True,0,2015-07-03 07:53:37,0,3,3,3,1,2015-07-03,7,4,7
4,2015-11-09 14:26:08,0,1,4206.83454,0,0,True,0,2015-07-08 18:35:35,0,4,4,4,1,2015-07-08,7,2,18


### Aggregations

In [7]:
final_df = data_df.copy()

First idea: We want to check whether of not using previously detected frauds (grouping by card/ip/mail) can help the detection of fraudolent cases in the future.   
     
     
However, we are only allowed to use this information for new cases whose 'creationdate' happens after the 'bookingdate' of the fraudolent case.      
     
     
In English language this translates into 'We can use our knowledge of a fraudolent case only for detecting cases which happen after we acquire such knowledge'.

In [8]:
fraudolent_df = data_df[data_df['label'] == 1]
fraudolent_df.shape
final_df.head()

Unnamed: 0,bookingdate,issuercountrycode,txvariantcode,amount,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id,label,creationdate_no_hour,creationmonth,creationweekday,creationhour
0,2015-11-09 14:26:51,0,0,3032.29008,0,0,True,0,2015-07-01 23:03:11,0,0,0,0,1,2015-07-01,7,2,23
1,2015-11-09 14:27:38,0,0,2101.07754,0,0,True,0,2015-07-02 04:50:55,0,1,1,1,1,2015-07-02,7,3,4
2,2015-11-23 16:34:16,0,0,7014.51054,0,0,True,0,2015-07-02 14:30:28,0,2,2,2,1,2015-07-02,7,3,14
3,2015-11-23 16:34:51,0,0,5142.72654,0,0,True,0,2015-07-03 07:53:37,0,3,3,3,1,2015-07-03,7,4,7
4,2015-11-09 14:26:08,0,1,4206.83454,0,0,True,0,2015-07-08 18:35:35,0,4,4,4,1,2015-07-08,7,2,18


In [9]:
for _, row in fraudolent_df.iterrows():
    current_booking = row['bookingdate']
    card_id = row['card_id']
    ip_id = row['ip_id']
    mail_id = row['mail_id']
    
    # affected rows for which we can use our knowledge about previous frauds
    affected_rows_by_card = final_df[(final_df['creationdate'] > current_booking) & (final_df['card_id'] == card_id)]
    if affected_rows_by_card.shape[0]:
        display(affected_rows_by_card)
        
    affected_rows_by_ip = final_df[(final_df['creationdate'] > current_booking) & (final_df['ip_id'] == ip_id)]
    if affected_rows_by_ip.shape[0]:
        display(affected_rows_by_ip)
    
    affected_rows_by_mail = final_df[(final_df['creationdate'] > current_booking) & (final_df['mail_id'] == mail_id)]
    if affected_rows_by_mail.shape[0]:
        display(affected_rows_by_mail)

Unnamed: 0,bookingdate,issuercountrycode,txvariantcode,amount,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id,label,creationdate_no_hour,creationmonth,creationweekday,creationhour
153894,2015-09-03 02:15:19,22,5,28551.32,19,0,True,1,2015-09-01 16:20:26,2,32558,41779,114149,0,2015-09-01,9,1,16
194697,2015-10-01 17:53:09,22,5,5244.12,19,1,False,0,2015-09-30 19:06:28,2,32558,41779,114149,0,2015-09-30,9,2,19
235525,2015-10-30 00:02:19,22,5,14567.0,19,1,False,0,2015-10-29 00:24:24,2,32558,41283,114149,0,2015-10-29,10,3,0
238336,2015-11-02 18:58:12,22,5,9672.488,19,1,False,0,2015-10-30 22:33:59,2,32558,178495,114149,0,2015-10-30,10,4,22


Unnamed: 0,bookingdate,issuercountrycode,txvariantcode,amount,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id,label,creationdate_no_hour,creationmonth,creationweekday,creationhour
289549,2015-10-26 18:50:04,21,5,6819.56172,26,0,True,1,2015-10-23 19:53:50,3,197312,179623,199246,0,2015-10-23,10,4,19


Unnamed: 0,bookingdate,issuercountrycode,txvariantcode,amount,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,creationdate,accountcode,mail_id,ip_id,card_id,label,creationdate_no_hour,creationmonth,creationweekday,creationhour
289549,2015-10-26 18:50:04,21,5,6819.56172,26,0,True,1,2015-10-23 19:53:50,3,197312,179623,199246,0,2015-10-23,10,4,19


All these affected rows have label 0 (no-fraud). Therefore, using this type of aggregation does not provide any meaningful insight for the classification (For the dataset in use).

A second idea is to aggregate per card/ip/mail the number of transactions that have been made in the same day before the current transaction as well as the amount of money that results by summing these transactions up. Also, we store the average of these values

In [10]:
print("This preprocessing can take a lot of time... (printing index every 10000 processed transactions)")
data_df = data_df.sort_values('creationdate').reset_index(drop=True) # reset the index is important for the loop 'idx'

n_of_transactions_in_day_card = []
amount_in_day_card = []
average_amount_in_day_card = []

n_of_transactions_in_day_ip = []
amount_in_day_ip = []
average_amount_in_day_ip = []

n_of_transactions_in_day_mail = []
amount_in_day_mail = []
average_amount_in_day_mail = []

for idx, row in data_df.iterrows():
    card_id = row['card_id']
    ip_id = row['ip_id']
    mail_id = row['mail_id']
    day = row['creationdate_no_hour']
    
    available_data = data_df[0:idx] # what we have so far (remember: we have sorted by creation date)
    
    # card_id
    by_card = available_data[available_data['card_id'] == card_id]
    by_card_in_day = by_card[by_card['creationdate_no_hour'] == day]
    
    if by_card_in_day.empty:
        n_of_transactions_in_day_card.append(0)
        amount_in_day_card.append(0)
        average_amount_in_day_card.append(0)
    else:
        n_of_transactions_in_day_card.append(by_card_in_day.shape[0])
        sum_amount = np.sum(by_card_in_day.amount)
        amount_in_day_card.append(sum_amount)
        average_amount_in_day_card.append(sum_amount/by_card_in_day.shape[0])
        
    # ip
    by_ip = available_data[available_data['ip_id'] == ip_id]
    by_ip_in_day = by_ip[by_ip['creationdate_no_hour'] == day]
    
    if by_ip_in_day.empty:
        n_of_transactions_in_day_ip.append(0)
        amount_in_day_ip.append(0)
        average_amount_in_day_ip.append(0)
    else:
        n_of_transactions_in_day_ip.append(by_ip_in_day.shape[0])
        sum_amount = np.sum(by_ip_in_day.amount)
        amount_in_day_ip.append(sum_amount)
        average_amount_in_day_ip.append(sum_amount/by_ip_in_day.shape[0])
        
    # mail
    by_mail = available_data[available_data['mail_id'] == mail_id]
    by_mail_in_day = by_mail[by_mail['creationdate_no_hour'] == day]
    
    if by_mail_in_day.empty:
        n_of_transactions_in_day_mail.append(0)
        amount_in_day_mail.append(0)
        average_amount_in_day_mail.append(0)
    else:
        n_of_transactions_in_day_mail.append(by_mail_in_day.shape[0])
        sum_amount = np.sum(by_mail_in_day.amount)
        amount_in_day_mail.append(sum_amount)
        average_amount_in_day_mail.append(sum_amount/by_mail_in_day.shape[0])
        
    if idx%10000 == 0:
        print(idx)
        
data_df["transactions_in_day_card"] = n_of_transactions_in_day_card
data_df["amount_in_day_card"] = amount_in_day_card
data_df["average_amount_in_day_card"] = average_amount_in_day_card

data_df["transactions_in_day_ip"] = n_of_transactions_in_day_ip
data_df["amount_in_day_ip"] = amount_in_day_ip
data_df["average_amount_in_day_ip"] = average_amount_in_day_ip

data_df["transactions_in_day_mail"] = n_of_transactions_in_day_mail
data_df["amount_in_day_mail"] = amount_in_day_mail
data_df["average_amount_in_day_mail"] = average_amount_in_day_mail

This preprocessing can take a lot of time...
0
20000
40000
60000
80000
100000
120000
140000
160000
180000
200000
220000


It's now time to test one of the classifiers we used in the assignment to check whether or not these features can help solving the task. Since the best single classifier was Knn, we go for it!   
First of all, we need to remove some of the columns that we have, before proceeding with the test.

In [11]:
data_df = data_df.drop(columns=['bookingdate', 'creationdate', 'mail_id', 'ip_id', 'card_id', 'creationdate_no_hour'])
data_df.head()

Unnamed: 0,issuercountrycode,txvariantcode,amount,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,accountcode,label,creationmonth,...,creationhour,transactions_in_day_card,amount_in_day_card,average_amount_in_day_card,transactions_in_day_ip,amount_in_day_ip,average_amount_in_day_ip,transactions_in_day_mail,amount_in_day_mail,average_amount_in_day_mail
0,22,5,5937.5092,19,0,True,1,2,0,7,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
1,22,5,5704.4372,19,0,True,1,2,0,7,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2,22,5,5471.3652,19,0,True,1,2,0,7,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
3,22,5,6520.1892,19,0,True,1,2,0,7,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
4,22,5,8151.6932,19,0,True,1,2,0,7,...,0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0


In [13]:
# let's normalize the amounts
data_df['amount'] = StandardScaler().fit_transform(data_df['amount'].values.reshape(-1, 1))

data_df['amount_in_day_card'] = StandardScaler().fit_transform(data_df['amount_in_day_card'].values.reshape(-1, 1))
data_df['average_amount_in_day_card'] = StandardScaler().fit_transform(data_df['average_amount_in_day_card'].values.reshape(-1, 1))

data_df['amount_in_day_ip'] = StandardScaler().fit_transform(data_df['amount_in_day_ip'].values.reshape(-1, 1))
data_df['average_amount_in_day_ip'] = StandardScaler().fit_transform(data_df['average_amount_in_day_ip'].values.reshape(-1, 1))

data_df['amount_in_day_mail'] = StandardScaler().fit_transform(data_df['amount_in_day_mail'].values.reshape(-1, 1))
data_df['average_amount_in_day_mail'] = StandardScaler().fit_transform(data_df['average_amount_in_day_mail'].values.reshape(-1, 1))

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn import metrics
import math

In [15]:
clsf = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
smote_ratio = 0.015

y = data_df['label']
X = data_df.drop(columns=['label'])

In [16]:
conf_mat = np.array([[0, 0], [0, 0]])

skf = StratifiedKFold(n_splits=10, shuffle=True)
for idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {idx + 1}")

    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    smote = SMOTE(sampling_strategy=smote_ratio)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    trained = clsf.fit(X_train_smote, y_train_smote)

    y_pred = trained.predict(X_test)
    conf_mat += metrics.confusion_matrix(y_test, y_pred)

tn, fp, fn, tp = conf_mat.ravel()
print(conf_mat)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F0.5: {( (1+math.pow(0.5,2))*precision*recall )/( math.pow(0.5,2) * precision + recall )}")

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10
[[222447   1050]
 [   285     60]]
Precision: 0.05405405405405406
Recall: 0.17391304347826086
F0.5: 0.06269592476489029
