In [58]:
# data loading
import pandas as pd

predata = pd.read_csv('correct_data.csv', low_memory=False)

# Parse the timestamp column once and derive both calendar features from the
# parsed values through the vectorised `.dt` accessor, instead of parsing the
# column twice and computing the weekday with a per-row Python lambda.
parsed_timestamp = pd.to_datetime(predata['timestamp'])

# create column weekday (Monday=0 ... Sunday=6)
predata['weekday'] = parsed_timestamp.dt.weekday

# create column month (1-12)
predata['month'] = parsed_timestamp.dt.month

# drop column timestamp that was used for creating weekday and month columns
predata = predata.drop(columns=['timestamp'])
del parsed_timestamp


# renaming browsers
def assign_brws_fam(x):
    """Map a browser-family label onto its canonical family.

    Labels listed in the lookup table are replaced by the table value; any
    other value is returned unchanged.

    The numeric trailing comments are kept from the original table
    (presumably row counts per label -- TODO confirm).
    """
    canonical_family = {
        "m bot": "other",  # 15
        "android": "other",  # 104
        "1password": "other",  # 57
        "chrome mobile": "chrome",  # 735168
        "chrome mobile webview": "chrome",  # 2681
        "mobile safari": "safari",  # 338413
        "mobile safari ui/wkwebview": "safari",  # 4190964
        "firefox mobile": "firefox",  # 5515
        "edge mobile": "edge"
    }

    # Fall back to the input itself when there is no table entry.
    return canonical_family.get(x, x)


# Fold browser variants into their canonical family.
# NOTE(review): the lookup keys are lowercase while this column is only
# lowercased further down -- confirm the CSV values are already lowercase.
predata["browser_family"] = predata["browser_family"].apply(assign_brws_fam)


# renaming os
def assign_os_fam(x):
    """Map an OS-family label onto its canonical family.

    Labels listed in the lookup table are replaced by the table value; any
    other value is returned unchanged.
    """
    canonical_os = {
        "windows phone": "windows"  # 863
    }

    # Fall back to the input itself when there is no table entry.
    return canonical_os.get(x, x)


# Fold OS variants into their canonical family.
predata["os_family"] = predata["os_family"].apply(assign_os_fam)

# remove spaces on browsers and os
cols = ['os_family', 'browser_family']

for col in cols:
    # `.str.replace` is vectorised and leaves missing values (NaN) in place,
    # whereas calling `x.replace` on each element raises AttributeError as
    # soon as one value is not a string. `regex=False` keeps it a literal match.
    predata[col] = predata[col].str.replace(' ', '', regex=False)


# make all lowercase
def convert_column_lower(column, df):
    """Lowercase the string values of ``df[column]``, modifying ``df`` directly.

    Args:
        column: Label of the column to convert.
        df: DataFrame containing that column.

    Returns:
        None. The converted column is written back into ``df``.
    """
    lowered = df[column].str.lower()
    df[column] = lowered


columns_to_lower = ['canal', 'operativa', 'browser_family', 'os_family', 'ipaddress', 'trusted_indicator']
for col in columns_to_lower:
    # Use the helper defined above for exactly this purpose instead of
    # repeating its body inline.
    convert_column_lower(col, predata)

# remove dots from ipaddress
# `.str.replace` passes missing values through as NaN; the per-element
# `x.split(".")` it replaces raises AttributeError on any non-string value.
predata['ipaddress'] = predata['ipaddress'].str.replace('.', '', regex=False)


# merge is_tablet, is_pc, is_mobile, is_touch on a new column called device
def deviceselect(x):
    """Derive a single device label from the four boolean device flags of a row.

    ``x`` is indexed with the keys 'is_tablet', 'is_pc', 'is_mobile' and
    'is_touch'; each value is evaluated for truthiness.

    Precedence (first match wins):
        touch devices:      tablet -> 'ttablet', pc -> 'tpc', mobile -> 'tmobile'
        otherwise:          mobile -> 'mobile',  pc -> 'pc',  tablet -> 'tablet'
        nothing matched:    'otherd'
    """
    touch_labels = (('is_tablet', 'ttablet'), ('is_pc', 'tpc'), ('is_mobile', 'tmobile'))
    plain_labels = (('is_mobile', 'mobile'), ('is_pc', 'pc'), ('is_tablet', 'tablet'))

    # Touch-capable combinations are checked first.
    if x['is_touch']:
        for flag, label in touch_labels:
            if x[flag]:
                return label

    # Reached for non-touch rows, and for touch rows with no device flag set
    # (in which case none of the checks below can match either).
    for flag, label in plain_labels:
        if x[flag]:
            return label

    # if none of the above return otherd
    return 'otherd'


# Collapse the four boolean device flags into one categorical column.
predata['device'] = predata.apply(deviceselect, axis=1)

# drop columns that were merged on the new device column
predata = predata.drop(columns=['is_pc', 'is_tablet', 'is_mobile', 'is_touch'])

# change data types for columns that need to be categorical
convert_dict = {'clientid': object, 'entity': object, 'reference': object, 'iban_orig': object, 'iban_dest': object,
                'hour': object, 'week': object, 'weekday': object, 'month': object}

predata = predata.astype(convert_dict)

# Size of the non-fraud subsample and the seed that makes it reproducible:
# without `random_state` every run draws a different subsample, so the
# downstream results cannot be regenerated.
NON_FRAUD_SAMPLE_SIZE = 500_000
SAMPLE_SEED = 42

# Keep every fraud row and a fixed-size random subsample of non-fraud rows.
frauds = predata[predata['is_fraud'] == 1].copy(deep=True)
# `sample` already returns a new frame, so no extra deep copy is needed.
non_frauds = predata[predata['is_fraud'] == 0].sample(NON_FRAUD_SAMPLE_SIZE, random_state=SAMPLE_SEED)
del predata
# `concat` likewise builds a new frame; copying its result again only costs memory.
result = pd.concat([non_frauds, frauds], ignore_index=True)
del frauds, non_frauds

In [61]:
# One-hot encode the listed categorical columns of `result`.
dums = pd.get_dummies(
    data=result,
    columns=[
        'canal',
        'operativa',
        'trusted_indicator',
        'browser_family',
        'os_family',
        'hour',
        'week',
        'weekday',
        'month',
        'device',
    ],
)

  dums = pd.get_dummies(result,


In [62]:
# Separate the feature matrix from the target label.
X = dums.drop(columns=['is_fraud'])

y = dums['is_fraud']

In [64]:
def _to_numeric_or_keep(column):
    """Return ``column`` converted with ``pd.to_numeric``, or unchanged if conversion fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# `errors='ignore'` is deprecated in recent pandas releases; the explicit
# fallback keeps the same "convert when possible, otherwise leave the column
# as it is" behaviour without relying on the deprecated option.
X = X.apply(_to_numeric_or_keep)

In [65]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500569 entries, 0 to 500568
Columns: 134 entries, entity to device_ttablet
dtypes: float64(2), int64(5), object(1), uint8(126)
memory usage: 90.7+ MB


In [66]:
X.dtypes

entity              int64
reference           int64
iban_orig           int64
iban_dest           int64
amount            float64
                   ...   
device_otherd       uint8
device_pc           uint8
device_tmobile      uint8
device_tpc          uint8
device_ttablet      uint8
Length: 134, dtype: object

In [67]:
# TODO: Needs to be back to none when reverting
def adjust_ip(ip):
    """Convert an IP value to ``int``, using ``0`` as the placeholder for anything unconvertible.

    Args:
        ip: Value to convert (e.g. a string of digits).

    Returns:
        ``int(ip)`` when the conversion succeeds, otherwise ``0``.
    """
    try:
        return int(ip)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or unsupported types; ValueError: non-numeric text
        # or NaN; OverflowError: infinite floats. Catching only these (rather
        # than a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
        return 0


# Turn every IP entry into an integer; entries that cannot be converted become 0.
X['ipaddress'] = X['ipaddress'].apply(adjust_ip)

In [68]:
X.dtypes

entity              int64
reference           int64
iban_orig           int64
iban_dest           int64
amount            float64
                   ...   
device_otherd       uint8
device_pc           uint8
device_tmobile      uint8
device_tpc          uint8
device_ttablet      uint8
Length: 134, dtype: object

In [69]:
# Convert the whole target Series in one vectorised call instead of invoking
# `pd.to_numeric` separately on every element.
y = pd.to_numeric(y)

In [70]:
from imblearn.over_sampling import SMOTE

# Fix the seed so the synthetic minority samples are identical on every run.
# NOTE(review): plain SMOTE interpolates all columns, including the one-hot
# dummies and the identifier-like integer columns -- confirm whether a
# categorical-aware variant (e.g. SMOTENC) is the better fit here.
over = SMOTE(random_state=42)
X_res, y_res = over.fit_resample(X, y)

In [71]:
X_res.shape

(1000000, 134)

In [72]:
# Re-attach the resampled target as a column so features and label live in one frame.
X_res['is_fraud'] = y_res.copy()

  X_res['is_fraud'] = y_res.copy()


In [73]:
X_res.shape

(1000000, 135)

In [74]:
len(X_res[X_res['is_fraud'] == 0])

500000

In [75]:
safe_X = X_res.copy(deep=True)

In [76]:
set(result['device'])

{'mobile', 'otherd', 'pc', 'tmobile', 'tpc', 'ttablet'}

In [77]:
X_res.shape

(1000000, 135)

In [78]:
# (original column name, fallback value) pairs. For each one-hot encoded
# column, the fallback is the value restored for rows in which none of that
# column's dummy columns equals 1.
dummies_cols = [('canal', 'mbe'), ('operativa', 'trfint'), ('trusted_indicator', 'unknown'),
                ('browser_family', 'chrome'), ('os_family', 'android'), ('hour', '17'), ('week', '46'), ('weekday', '3'),
                ('month', '9'), ('device', 'mobile')]

In [79]:
def get_proper_value(df, created_cols, origin_col, default_val):
    """Collapse a group of one-hot columns back into one label per row.

    For every row, the first column of ``created_cols`` (in the given order)
    whose value equals 1 is selected; its name, with every occurrence of
    ``origin_col`` removed, becomes the row's label. Rows in which no column
    equals 1 receive ``default_val``.

    Args:
        df: DataFrame containing the one-hot columns.
        created_cols: Names of the one-hot columns belonging to one feature.
        origin_col: Text stripped from a column name to obtain the label
            (e.g. ``'device_'``).
        default_val: Label used when no column of the group equals 1.

    Returns:
        A list with one label per row of ``df``, in row order.
    """
    created_cols = list(created_cols)

    # Without any dummy column every row falls back to the default.
    if not created_cols:
        return [default_val] * len(df)

    labels = [str(col).replace(origin_col, '') for col in created_cols]

    # Boolean matrix (rows x created_cols), computed once for the whole frame
    # instead of walking the frame row by row with `iterrows`.
    hits = df[created_cols].eq(1).to_numpy(dtype=bool)
    has_hit = hits.any(axis=1)
    # Position of the first True per row; it is 0 for rows without any hit,
    # which is why `has_hit` is consulted before using it.
    first_hit = hits.argmax(axis=1)

    return [labels[pos] if found else default_val
            for pos, found in zip(first_hit, has_hit)]

In [80]:
# Rebuild each original categorical column from its group of one-hot columns.
for (dummie, default) in dummies_cols:
    curr_dummie = dummie + '_'
    # Match on the prefix rather than on any substring: `in` would also pick
    # up an unrelated column that merely contains e.g. "hour_" in its name.
    used_cols = [curr for curr in X_res.columns if curr.startswith(curr_dummie)]
    X_res[dummie] = get_proper_value(X_res, used_cols, curr_dummie, default)

In [81]:
# Name fragments (original column name + '_') identifying the one-hot columns
# that are removed from X_res in the next cell.
remove_cols = ['canal_', 'operativa_', 'trusted_indicator_', 'browser_family_', 'os_family_', 'hour_', 'week_', 'weekday_',
               'month_', 'device_']

In [82]:
# Collect every one-hot column first and remove them with a single `drop`
# call instead of issuing one in-place drop per column. Columns are matched
# by prefix (not substring) so that a column merely containing one of the
# fragments somewhere in its name is not removed by accident.
dummy_prefixes = tuple(remove_cols)
used_cols = [curr for curr in X_res.columns if curr.startswith(dummy_prefixes)]
X_res.drop(columns=used_cols, inplace=True)

In [83]:
def adjust_ip_reverse(ip):
    """Return ``'unknown'`` for a value equal to 0; return any other value unchanged."""
    return 'unknown' if ip == 0 else ip

# Replace the 0 placeholder in the IP column with the label 'unknown'.
X_res['ipaddress'] = X_res['ipaddress'].apply(adjust_ip_reverse)

In [84]:
# Write the resampled dataset to disk; the DataFrame index is not exported.
X_res.to_csv('res.csv', index=False)