In [1]:
#load the raw payments dataset
import pandas as pd

#location of the source csv, kept in a name so the read call stays short
payments_csv = r'C:/Users/BeatrizCarvalho/OneDrive - Closer Consultoria Lda/Documents/Entangled-Spaces/Payments/Datasets/0payments_nd_corr.csv'
predata = pd.read_csv(payments_csv, low_memory = False)

In [2]:
#share of rows that fall in each is_fraud class
predata['is_fraud'].value_counts() / len(predata)

0    0.999963
1    0.000037
Name: is_fraud, dtype: float64

In [3]:
#initial shape of the dataframe as a (rows, columns) tuple
predata.shape

(15388489, 21)

In [4]:
#lift pandas' display limits so neither columns nor rows are truncated in outputs
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [5]:
#create column weekday (Monday = 0 ... Sunday = 6)
#the vectorised .dt accessor replaces one Python-level lambda call per row
predata['weekday'] = pd.to_datetime(predata['timestamp']).dt.weekday

In [6]:
#create column month from the parsed timestamp column
timestamp_index = pd.DatetimeIndex(predata['timestamp'])
predata['month'] = timestamp_index.month

In [7]:
#timestamp has already been used to derive weekday and month, so remove it
predata.drop(columns = ['timestamp'], inplace = True)

In [8]:
#renaming browsers
#raw browser_family value -> consolidated family name
#built once at module level instead of on every call
#the trailing numbers are the original author's annotations (presumably row counts - verify)
_BROWSER_FAMILY_CHANGES = {
    "m bot": "other",  #15
    "android": "other",  #104
    "1password": "other",  #57
    "chrome mobile": "chrome",  #735168
    "chrome mobile webview": "chrome",  #2681
    "mobile safari": "safari",  #338413
    "mobile safari ui/wkwebview": "safari",  #4190964
    "firefox mobile": "firefox",  #5515
    "edge mobile": "edge",
}


def assign_brws_fam(x):
    """Map a raw browser family name to its consolidated family.

    Parameters
    ----------
    x : hashable
        Browser family value; the lookup is an exact, case-sensitive match.

    Returns
    -------
    The consolidated family name when ``x`` is a key of the mapping,
    otherwise ``x`` itself, unchanged.
    """
    return _BROWSER_FAMILY_CHANGES.get(x, x)

#rewrite every browser_family value through the renaming function
predata["browser_family"] = predata["browser_family"].apply(assign_brws_fam)

In [9]:
#renaming os 
def assign_os_fam(x):
    """Return the consolidated OS family for ``x``; values without an alias pass through unchanged."""
    os_aliases = {"windows phone": "windows"}  #863
    return os_aliases.get(x, x)

#rewrite every os_family value through the renaming function
predata["os_family"] = predata["os_family"].apply(assign_os_fam)

In [10]:
#remove spaces on browsers and os
cols = ['os_family', 'browser_family']

for col in cols:
    #literal (non-regex) replacement through the vectorised .str accessor;
    #missing values stay NaN instead of raising AttributeError as the per-row
    #lambda calling .replace on a float NaN would
    predata[col] = predata[col].str.replace(' ', '', regex = False)

In [11]:
#make all lowercase
def convert_column_lower(column, df):
    """Lower-case the string values of ``df[column]``, writing the result back into ``df``.

    The frame is modified in place and nothing is returned.
    """
    lowered = df[column].str.lower()
    df[column] = lowered
    
#text columns whose values are normalised to lower case
columns_to_lower = ['canal', 'operativa', 'browser_family', 'os_family', 'ipaddress', 'trusted_indicator']
for col in columns_to_lower:
    #call the convert_column_lower helper instead of repeating its body inline
    convert_column_lower(col, predata)

In [12]:
#remove dots from ipaddress
#regex = False makes '.' a literal dot rather than the regex wildcard; the
#vectorised call also leaves missing values as NaN instead of raising
#NOTE(review): without separators, different addresses can collapse to the same
#string (e.g. 1.23.4.5 and 12.3.4.5) - confirm this is acceptable downstream
predata['ipaddress'] = predata['ipaddress'].str.replace('.', '', regex = False)

In [13]:
#merge is_tablet, is_pc, is_mobile, is_touch on a new column called device 
def deviceselect(x):
    """Collapse the is_tablet / is_pc / is_mobile / is_touch flags of one row into a device label.

    ``x`` is anything indexable by those four keys (e.g. a dataframe row).
    Touch combinations are checked first, in the order tablet, pc, mobile;
    then the single flags, in the order mobile, pc, tablet. When no flag is
    truthy the label is 'otherd'.
    """
    #device flag combined with touch -> label
    touch_labels = (('is_tablet', 'ttablet'), ('is_pc', 'tpc'), ('is_mobile', 'tmobile'))
    for flag, label in touch_labels:
        if x[flag] and x['is_touch']:
            return label

    #device flag on its own -> label
    plain_labels = (('is_mobile', 'mobile'), ('is_pc', 'pc'), ('is_tablet', 'tablet'))
    for flag, label in plain_labels:
        if x[flag]:
            return label

    #none of the flags matched
    return 'otherd'
    
#deviceselect only reads these four flag columns, so pass it just those;
#a row-wise apply over the full frame builds a full-width row object per record
device_flags = ['is_tablet', 'is_pc', 'is_mobile', 'is_touch']
predata['device'] = predata[device_flags].apply(deviceselect, axis = 1)

In [14]:
#the four flag columns are now summarised by the device column, so remove them
predata.drop(columns = ['is_pc', 'is_tablet', 'is_mobile', 'is_touch'], inplace = True)

In [63]:
#final shape of the dataframe as a (rows, columns) tuple
predata.shape

(15388489, 19)

In [62]:
#column labels of the dataframe after the preprocessing steps
predata.columns

Index(['canal', 'operativa', 'entity', 'reference', 'trusted_indicator',
       'iban_orig', 'iban_dest', 'amount', 'accountbalance', 'ipaddress',
       'is_fraud', 'clientid', 'browser_family', 'os_family', 'hour', 'week',
       'weekday', 'month', 'device'],
      dtype='object')

In [64]:
#summary of the dataframe: number of rows, dtype of each column and memory usage
predata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15388489 entries, 0 to 15388488
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   canal              object 
 1   operativa          object 
 2   entity             int64  
 3   reference          int64  
 4   trusted_indicator  object 
 5   iban_orig          int64  
 6   iban_dest          int64  
 7   amount             float64
 8   accountbalance     float64
 9   ipaddress          object 
 10  is_fraud           int64  
 11  clientid           int64  
 12  browser_family     object 
 13  os_family          object 
 14  hour               int64  
 15  week               int64  
 16  weekday            int64  
 17  month              int64  
 18  device             object 
dtypes: float64(2), int64(10), object(7)
memory usage: 2.2+ GB


In [65]:
#cast the identifier-like and calendar columns to object so that they are
#handled as categorical rather than numeric values
object_columns = ['clientid', 'entity', 'reference', 'iban_orig', 'iban_dest',
                  'hour', 'week', 'weekday', 'month']
convert_dict = dict.fromkeys(object_columns, object)

predata = predata.astype(convert_dict)

In [72]:
#balance the dataset

#undersampling -> RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler

#oversampling -> Smote-NC
from imblearn.over_sampling import SMOTENC

#fixed seed so both resamplers give the same result on every run
RANDOM_STATE = 42

#target size per class after undersampling: 7000000 non-frauds, 569 frauds
under = RandomUnderSampler(sampling_strategy = {0: 7000000 , 1: 569}, random_state = RANDOM_STATE)  #number of initial nonfrauds = 15387920

#target column
y = predata['is_fraud']

#all the other columns
#NOTE(review): the recorded run of this cell ended in a MemoryError while
#allocating an object array; drop() copies the feature columns, so this is the
#likely trigger - consider sampling the non-frauds before this point
X = predata.drop('is_fraud', axis = 1)

#specify the categorical columns for smotenc
#SMOTENC takes a flat boolean mask with one entry per feature; the previous
#[X.dtypes == object] wrapped that mask in a list, nesting it one level too deep
categorical_mask = (X.dtypes == object).to_numpy()
smotenc = SMOTENC(categorical_features = categorical_mask, random_state = RANDOM_STATE)

MemoryError: Unable to allocate 1.83 GiB for an array with shape (16, 15388489) and data type object

In [68]:
#chain the two resamplers: undersample first, then oversample
from imblearn.pipeline import Pipeline

steps = [
    ('u', under),    #undersampling step
    ('o', smotenc),  #oversampling step
]
pipeline = Pipeline(steps)

In [None]:
#run the resampling pipeline; X and y are rebound to the resampled output,
#so the pre-resampling X and y are no longer reachable under these names
X, y = pipeline.fit_resample(X, y)

In [6]:
#rebuild one dataframe: the resampled features plus is_fraud as the last column
predatabalanced = X.assign(is_fraud = y)

#save the balanced dataset
balanced_csv = r'C:/Users/BeatrizCarvalho/OneDrive - Closer Consultoria Lda/Documents/Entangled-Spaces/Payments/Datasets/2payments_balanced_smote+rund.csv'
predatabalanced.to_csv(balanced_csv, index = False)

In [28]:
#profile the balanced data and export the report as html
from pandas_profiling import ProfileReport

profile_html = r'C:/Users/BeatrizCarvalho/OneDrive - Closer Consultoria Lda/Documents/Entangled-Spaces/Payments/Profiles/2payments_balanced_smote+rund.html'

#alternative: ProfileReport(predata.sample(n = 10000)) profiles a random sample of 10000 rows instead
prof = ProfileReport(predatabalanced, minimal = True)
prof.to_file(output_file = profile_html)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
#share of rows in each is_fraud class after balancing
predatabalanced['is_fraud'].value_counts() / len(predatabalanced)

0    0.5
1    0.5
Name: is_fraud, dtype: float64

In [347]:
#shape of the balanced dataframe as a (rows, columns) tuple
predatabalanced.shape

(1200, 19)

In [None]:
#compare the preprocessed and the balanced dataframes graphically
from table_evaluator import TableEvaluator

#first argument: the preprocessed data; second argument: the balanced data
table_evaluator =  TableEvaluator(predata, predatabalanced) 

#draw the comparison plots
table_evaluator.visual_evaluation()

In [29]:
#keep only the rows labelled as fraud in the balanced data and save them to csv
target_col = 'is_fraud'
is_fraud_row = predatabalanced[target_col] == 1
frauds_transfers = predatabalanced.loc[is_fraud_row].copy()

#save csv file
frauds_csv = r'C:/Users/BeatrizCarvalho/OneDrive - Closer Consultoria Lda/Documents/Entangled-Spaces/Payments/Datasets/2frauds_payments_balanced_smote+rund.csv'
frauds_transfers.to_csv(frauds_csv, index = False)