In [3]:
#data loading
import pandas as pd

# Location of the raw (imbalanced) transfers extract.
transfers_csv = r'C:/Users/BeatrizCarvalho/OneDrive - Closer Consultoria Lda/Documents/Entangled-Spaces/Datasets/0transfers_nd_corr.csv'
# low_memory=False reads the file in one pass so column dtypes are inferred from all rows.
predata = pd.read_csv(transfers_csv, low_memory=False)

In [2]:
# Sanity check on the load: number of rows per value of the target column.
counts = predata.loc[:, 'is_fraud'].value_counts()
counts

0    8000000
1        569
Name: is_fraud, dtype: int64

In [3]:
# Share of each class: per-class row count divided by the total number of rows.
counts / len(predata)

0    0.999929
1    0.000071
Name: is_fraud, dtype: float64

In [4]:
#initial size of the dataframe, shown as a (rows, columns) tuple
predata.shape

(8000569, 21)

In [5]:
# Lift pandas' display truncation for both columns and rows (None = no limit).
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

In [6]:
#create column weekday (Monday=0 ... Sunday=6)
# The vectorized .dt accessor avoids calling a Python lambda once per row.
predata['weekday'] = pd.to_datetime(predata['timestamp']).dt.weekday

In [7]:
#create column month (1-12)
# Same parsing path as the weekday column; the result is a Series aligned on
# predata's index rather than a bare Index assigned by position.
predata['month'] = pd.to_datetime(predata['timestamp']).dt.month

In [8]:
# The raw timestamp is no longer needed once weekday and month have been derived from it.
predata = predata.drop(columns='timestamp')

In [9]:
#renaming browsers
# Browser variants and the family each one is folded into. Built once here
# instead of on every call, since the function is applied to every row.
# The trailing numbers come from the original notes (presumably row counts
# per value -- TODO confirm).
_BROWSER_FAMILY_CHANGES = {
    "m bot": "other",  #15
    "android": "other",  #104
    "1password": "other",  #57
    "chrome mobile": "chrome",  #735168
    "chrome mobile webview": "chrome",  #2681
    "mobile safari": "safari",  #338413
    "mobile safari ui/wkwebview": "safari",  #4190964
    "firefox mobile": "firefox",  #5515
    "edge mobile": "edge",
}


def assign_brws_fam(x):
    """Map a browser name to its consolidated family.

    Parameters
    ----------
    x : value of the browser_family column. Matching is exact (case- and
        space-sensitive) against the keys of the mapping above.

    Returns
    -------
    The mapped family name when ``x`` is a known variant, otherwise ``x``
    unchanged.
    """
    return _BROWSER_FAMILY_CHANGES.get(x, x)

# Collapse browser variants into families, element by element.
# NOTE(review): this runs before the lower-casing cell and the mapping keys are
# lower-case -- confirm the raw values are already lower-case.
predata["browser_family"] = predata["browser_family"].map(assign_brws_fam)

In [10]:
#renaming os
def assign_os_fam(x):
    """Fold an OS variant into its parent family; any other value passes through unchanged."""
    os_aliases = {
        "windows phone": "windows",  #863
    }
    return os_aliases.get(x, x)

# Collapse OS variants into families, element by element.
predata["os_family"] = predata["os_family"].map(assign_os_fam)

In [11]:
#remove spaces on browsers and os
cols = ['os_family', 'browser_family']

for col in cols:
    # Vectorized literal replacement (regex=False). Missing values stay NaN
    # instead of raising AttributeError as a per-row x.replace() would.
    predata[col] = predata[col].str.replace(' ', '', regex=False)

In [12]:
#make all lowercase
def convert_column_lower(column, df):
    """Lower-case the string column `column` of `df`; modifies `df` and returns None."""
    lowered = df[column].str.lower()
    df[column] = lowered
    
# Text columns normalized to lower case.
columns_to_lower = ['canal', 'operativa', 'browser_family', 'os_family', 'ipaddress', 'trusted_indicator']
for col in columns_to_lower:
    # Use the helper defined in this cell instead of repeating its body inline.
    convert_column_lower(col, predata)

In [13]:
# Strip the dots from each address so it becomes one undelimited string.
# NOTE(review): different addresses can collapse to the same string
# (1.23.4.5 and 12.3.4.5 both give "12345") -- confirm this is acceptable.
predata['ipaddress'] = predata['ipaddress'].apply(lambda addr: addr.replace(".", ""))

In [14]:
#merge is_tablet, is_pc, is_mobile on device column
#takes the 1st true that appears
def deviceselect(x):
    """Collapse the device flags of one row into a single label.

    `x` is indexed by 'is_tablet', 'is_pc', 'is_mobile' and 'is_touch'.
    Touch combinations are checked first (tablet, pc, mobile), then the
    plain flags (mobile, pc, tablet). The first match wins; 'otherd' is
    returned when nothing matches.
    """
    touch_labels = (
        ('is_tablet', 'ttablet'),
        ('is_pc', 'tpc'),
        ('is_mobile', 'tmobile'),
    )
    plain_labels = (
        ('is_mobile', 'mobile'),
        ('is_pc', 'pc'),
        ('is_tablet', 'tablet'),
    )

    # device flag together with touch
    for flag, label in touch_labels:
        if x[flag] and x['is_touch']:
            return label

    # device flag on its own
    for flag, label in plain_labels:
        if x[flag]:
            return label

    # no device flag set
    return 'otherd'
    
# Row-wise apply is slow on a frame this size, so only the four flag columns
# that deviceselect reads are passed in; this avoids building a full
# mixed-dtype row for every record. The labels produced are unchanged.
predata['device'] = predata[['is_tablet', 'is_pc', 'is_mobile', 'is_touch']].apply(deviceselect, axis = 1)

In [15]:
# The individual device flags are redundant now that they are merged into 'device'.
device_flag_columns = ['is_pc', 'is_tablet', 'is_mobile', 'is_touch']
predata = predata.drop(columns=device_flag_columns)

In [17]:
#final columns of the dataframe, after the derived columns were added and the source columns dropped
predata.columns

Index(['canal', 'operativa', 'clientid', 'entity', 'reference',
       'trusted_indicator', 'iban_orig', 'iban_dest', 'amount',
       'accountbalance', 'ipaddress', 'is_fraud', 'browser_family',
       'os_family', 'hour', 'week', 'weekday', 'month', 'device'],
      dtype='object')

In [18]:
#undersampling -> RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler

#target column
y = predata['is_fraud']  #series

#all the other columns
X = predata.drop('is_fraud', axis = 1)  #dataframe

#keep every fraud row (count taken from the data instead of hard-coded) and
#draw 600 non-fraud rows; the fixed seed makes the draw, and therefore the
#saved dataset, reproducible
# NOTE(review): the class counts recorded further down in this notebook
# (7112 / 569) do not match a 600-row majority sample -- confirm the intended size.
n_fraud = int((y == 1).sum())
under = RandomUnderSampler(sampling_strategy = {0: 600, 1: n_fraud}, random_state = 42)

In [19]:
# Apply the under-sampler. X and y are rebound to the resampled data, so
# running this cell a second time resamples the already-reduced set.
X, y = under.fit_resample(X = X, y = y)

In [20]:
# Reassemble features and target into one frame (assign returns a new frame, X is untouched).
predatabalanced = X.assign(is_fraud = y)

# Persist the balanced dataset without the index column.
balanced_csv = r'C:/Users/BeatrizCarvalho/OneDrive - Closer Consultoria Lda/Documents/Entangled-Spaces/Datasets/2transfers_rund.csv'
predatabalanced.to_csv(balanced_csv, index = False)

In [None]:
#profiling of the balanced (under-sampled) data
from pandas_profiling import ProfileReport

#for large frames, profile a random sample instead, e.g. ProfileReport(predata.sample(n = 10000))
prof = ProfileReport(predatabalanced)             
#write the report as an HTML file
prof.to_file(output_file = r'C:/Users/BeatrizCarvalho/OneDrive - Closer Consultoria Lda/Documents/Entangled-Spaces/Profiles/2transfers_rund.html')

In [21]:
# Rows per class in the resampled frame.
counts_n = predatabalanced.loc[:, 'is_fraud'].value_counts()
counts_n

0    7112
1     569
Name: is_fraud, dtype: int64

In [23]:
# Share of each class after resampling (a fraction of rows, not a percentage).
counts_n / len(predatabalanced)

0    0.925921
1    0.074079
Name: is_fraud, dtype: float64

In [24]:
#size of the balanced dataframe, shown as a (rows, columns) tuple
predatabalanced.shape

(7681, 19)

In [None]:
#graphs