In [6]:
#data loading
import pandas as pd
import zipfile

#predata = pd.read_csv('0transfers_nd_corr.csv', low_memory = False) 

In [23]:
# Temporary loader used to work around a memory error: only the columns listed
# below are read, and the non-fraud file is capped at a fixed number of rows.
# Remove once the complete file can be loaded.
import pandas as pd

# Column selection shared by both transfer files.
TRANSFER_COLUMNS = [
    'canal', 'operativa', 'entity', 'reference', 'trusted_indicator',
    'iban_orig', 'iban_dest', 'amount', 'accountbalance', 'ipaddress',
    'is_fraud', 'is_tablet', 'is_pc', 'is_mobile', 'is_touch',
    'clientid', 'browser_family', 'os_family', 'hour', 'week', 'timestamp',
]

# Number of non-fraud rows read from the (much larger) non-fraud file.
NONFRAUD_SAMPLE_ROWS = 1000

predataf = pd.read_csv(
    'frauds_transfers.csv',
    usecols = TRANSFER_COLUMNS,
    low_memory = False,
)

predatanf = pd.read_csv(
    'nonfrauds_transfers.csv',
    usecols = TRANSFER_COLUMNS,
    low_memory = False,
    nrows = NONFRAUD_SAMPLE_ROWS,
)

# Stack the fraud rows on top of the non-fraud sample and renumber the index.
predata = pd.concat([predataf, predatanf], ignore_index = True)  #nf = 15637832 and f = 569

In [24]:
#sanity check after the concat: number of rows per class in is_fraud
#(the 569 fraud rows and the 1000-row non-fraud sample should both show up)
counts = predata['is_fraud'].value_counts()
counts

0    1000
1     569
Name: is_fraud, dtype: int64

In [25]:
#share of each class: per-class row count divided by the total number of rows
counts / len(predata)

0    0.637349
1    0.362651
Name: is_fraud, dtype: float64

In [26]:
#initial size of the dataframe as (rows, columns)
predata.shape

(1569, 21)

In [5]:
#show every column and every row when a dataframe is displayed (no truncation)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [27]:
#create column weekday (Monday = 0 ... Sunday = 6)
# The vectorised .dt accessor replaces a Python-level lambda call per row.
predata['weekday'] = pd.to_datetime(predata['timestamp']).dt.weekday

In [28]:
#create column month: month number (January = 1) of each parsed timestamp
# NOTE(review): this parses 'timestamp' through the DatetimeIndex constructor while
# the weekday column uses pd.to_datetime - confirm both parse the column identically.
predata['month'] = pd.DatetimeIndex(predata['timestamp']).month

In [29]:
#renaming browsers
# Browser names folded into a parent family. Built once at cell execution instead
# of on every call, since the function is applied to each row of the column.
# Trailing numbers are presumably row counts from the full dataset - TODO confirm.
_BROWSER_FAMILY_ALIASES = {
    "m bot": "other",  #15
    "android": "other",  #104
    "1password": "other",  #57
    "chrome mobile": "chrome",  #735168
    "chrome mobile webview": "chrome",  #2681
    "mobile safari": "safari",  #338413
    "mobile safari ui/wkwebview": "safari",  #4190964
    "firefox mobile": "firefox"  #5515
}


def assign_brws_fam(x):
    """Map a browser-family name onto its parent family.

    Parameters
    ----------
    x : object
        One value of the ``browser_family`` column.

    Returns
    -------
    object
        The parent family when ``x`` is a known alias, otherwise ``x`` itself.
        Non-string values (e.g. ``NaN`` or ``None``) are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    # The column is only lowercased in a later cell, so the match is made
    # case-insensitive here; otherwise "Chrome Mobile" would never be renamed.
    return _BROWSER_FAMILY_ALIASES.get(x.lower(), x)

# Rename every browser_family value through the alias mapping.
predata["browser_family"] = predata["browser_family"].apply(assign_brws_fam)

In [30]:
#renaming os 
# OS names folded into a parent family. Built once at cell execution instead of
# on every call, since the function is applied to each row of the column.
# Trailing number is presumably a row count from the full dataset - TODO confirm.
_OS_FAMILY_ALIASES = {
    "windows phone": "windows",  #863
}


def assign_os_fam(x):
    """Map an OS-family name onto its parent family.

    Parameters
    ----------
    x : object
        One value of the ``os_family`` column.

    Returns
    -------
    object
        The parent family when ``x`` is a known alias, otherwise ``x`` itself.
        Non-string values (e.g. ``NaN`` or ``None``) are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    # The column is only lowercased in a later cell, so the match is made
    # case-insensitive here; otherwise "Windows Phone" would never be renamed.
    return _OS_FAMILY_ALIASES.get(x.lower(), x)

# Rename every os_family value through the alias mapping.
predata["os_family"] = predata["os_family"].apply(assign_os_fam)

In [31]:
#remove spaces on browsers and os
cols = ['os_family', 'browser_family']

for col in cols:
    # .str.replace leaves missing values (NaN) untouched, whereas calling
    # x.replace on each element raises AttributeError on a float NaN.
    # regex=False makes the pattern a literal space.
    predata[col] = predata[col].str.replace(' ', '', regex = False)

In [32]:
#make all lowercase
def convert_column_lower(column, df):
    """Lowercase the strings of one column, writing the result back into ``df``.

    Parameters
    ----------
    column : label of the column to convert.
    df : DataFrame that holds the column; it is modified in place.

    Returns
    -------
    None
    """
    lowered = df[column].str.lower()
    df[column] = lowered
    
# Text columns whose values are normalised to lowercase, one column at a time,
# through the helper defined in this cell.
columns_to_lower = ['canal', 'operativa', 'browser_family', 'os_family', 'ipaddress']
for col in columns_to_lower:
    convert_column_lower(col, predata)

In [33]:
#remove dots from ipaddress
# .str.replace keeps a missing address as NaN instead of raising AttributeError
# (float has no .split); regex=False makes '.' a literal dot, not "any character".
# NOTE(review): without separators different addresses can collapse to the same
# string (e.g. 1.23.4.5 and 12.3.4.5) - confirm this is acceptable for the feature.
predata['ipaddress'] = predata['ipaddress'].str.replace('.', '', regex = False)

In [34]:
#merge is_tablet, is_pc, is_mobile on device column
#takes the 1st true that appears
def deviceselect(x):
    """Collapse the device flags of one row into a single device label.

    ``x`` is a row-like object indexable by 'is_tablet', 'is_pc', 'is_mobile'
    and 'is_touch'. The rules below are tried from top to bottom and the first
    one whose flags are all truthy decides the label, so the touch combinations
    take precedence over the plain device types. When no rule matches the
    label is 'otherd'.
    """
    rules = (
        (('is_tablet', 'is_touch'), 'ttablet'),
        (('is_pc', 'is_touch'), 'tpc'),
        (('is_mobile', 'is_touch'), 'tmobile'),
        (('is_mobile',), 'mobile'),
        (('is_pc',), 'pc'),
        (('is_tablet',), 'tablet'),
    )

    for flags, label in rules:
        # all() stops at the first falsy flag, exactly like a chained `and`
        if all(x[flag] for flag in flags):
            return label

    return 'otherd'
    
# Label each row: axis='columns' hands deviceselect one row at a time.
predata['device'] = predata.apply(deviceselect, axis = 'columns')

In [35]:
#the four flag columns are now summarised by 'device', so remove them in place
predata.drop(columns = ['is_pc', 'is_tablet', 'is_mobile', 'is_touch'], inplace = True)

In [36]:
#final size of the dataframe as (rows, columns):
#weekday, month and device were added and the four device flag columns dropped
predata.shape

(1569, 20)

In [37]:
#column labels of the dataframe after preprocessing
predata.columns

Index(['timestamp', 'canal', 'operativa', 'clientid', 'entity', 'reference',
       'trusted_indicator', 'iban_orig', 'iban_dest', 'amount',
       'accountbalance', 'ipaddress', 'is_fraud', 'browser_family',
       'os_family', 'hour', 'week', 'weekday', 'month', 'device'],
      dtype='object')

In [42]:
#undersampling -> RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler
#sampling_strategy = minority_class_data/majority_class_data 
#ex: majority = 1000, minority = 100, sampling_strategy = 0.5 -> 100/200 = 0.5 -> 200 in the majority class
# random_state fixes which majority rows are kept, so re-running the notebook
# reproduces the same balanced sample.
under = RandomUnderSampler(sampling_strategy = 0.9, random_state = 42)  #change for sampling_strategy = 0.00569 // total = 15638401

#target column
y = predata['is_fraud']  #series

#all the other columns
X = predata.drop('is_fraud', axis = 1)  #dataframe

In [43]:
#resample X and y 
# NOTE: X and y are overwritten with the resampled data, so running this cell a
# second time resamples the already-resampled rows; re-run the previous cell first.
X, y = under.fit_resample(X, y)

In [48]:
#balanced dataframe = resampled features with the target re-attached as the last column
predatabalanced = X.assign(is_fraud = y)

#write the balanced dataset to disk, leaving the row index out of the file
predatabalanced.to_csv("5transfers_rund.csv", index = False)

In [45]:
#check the new data distribution: rows per class after undersampling
counts_n = predatabalanced['is_fraud'].value_counts()
counts_n

0    632
1    569
Name: is_fraud, dtype: int64

In [46]:
#share of each class in the balanced dataframe (fractions of the row count)
counts_n / len(predatabalanced)

0    0.526228
1    0.473772
Name: is_fraud, dtype: float64

In [47]:
#size of the balanced dataframe as (rows, columns)
predatabalanced.shape

(1201, 20)