In [30]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import os
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
# Load the raw lookup tables: the two parquet sources first, then the CSV
# sources (the consumer table is pipe-delimited rather than comma-delimited).
tbl_merchants = pd.read_parquet("../data/tables/tbl_merchants.parquet")
consumer_user_details = pd.read_parquet("../data/tables/consumer_user_details.parquet")

tbl_consumer = pd.read_csv("../data/tables/tbl_consumer.csv", sep="|")
merchant_fraud_probability = pd.read_csv("../data/tables/merchant_fraud_probability.csv")
consumer_fraud_probability = pd.read_csv("../data/tables/consumer_fraud_probability.csv")

In [3]:
def tags_process(s):
    """Split a raw merchant tag string into its innermost bracketed groups.

    The tag string wraps each component in round or square brackets (the two
    styles may be mixed) and wraps the whole thing in one more pair, e.g.
    ``"((furniture), (e), (take rate: 0.18))"``.

    Parameters
    ----------
    s : str
        Raw tag string.

    Returns
    -------
    list of str
        Text of every innermost bracketed group, in order of appearance and
        without the surrounding brackets. Empty list when nothing matches.
    """
    # Match the groups directly instead of rewriting brackets to a "###"
    # sentinel first: the sentinel approach mis-parses any tag that itself
    # contains '#'. The raw string also avoids invalid-escape warnings.
    return re.findall(r"[(\[]([^()\[\]]+)[)\]]", s)

In [4]:
# Parse the raw tag string, then spread its components into their own columns.
# Component order is fixed by _cols: business area, revenue level, take rate.
tbl_merchants['tags_proed'] = tbl_merchants['tags'].apply(tags_process)
_cols = ['business_area','revenue_level','take_rate']
for _pos, _col in enumerate(_cols):
    # .str[...] indexes into each parsed list; a short list yields NaN
    # instead of raising.
    tbl_merchants[_col] = tbl_merchants['tags_proed'].str[_pos]

# Take the trailing number of the take-rate component. Slicing the last four
# characters would silently truncate values that are not exactly 4 wide
# (e.g. "10.25" -> "0.25").
tbl_merchants['take_rate'] = (
    tbl_merchants['take_rate']
    .str.extract(r"(\d+(?:\.\d+)?)\s*$", expand=False)
    .astype(float)
)

# Later cells select 'merchant_abn' as a column. Only reset the index when it
# is not already one, so re-running this cell does not add stray index columns.
# NOTE(review): presumably merchant_abn is the parquet index - confirm.
if 'merchant_abn' not in tbl_merchants.columns:
    tbl_merchants = tbl_merchants.reset_index()

In [5]:
# Attach the parsed merchant attributes to every merchant fraud record.
# The join key is spelled out instead of relying on "all shared columns".
merchant_df = pd.merge(
    merchant_fraud_probability,
    tbl_merchants[['merchant_abn','business_area','revenue_level','take_rate']],
    on="merchant_abn",
    how="inner",
)
# Normalise business_area so spelling variants collapse into one category:
# lower-case it and remove all whitespace. The raw-string pattern avoids the
# invalid "\s" escape of a plain string literal.
merchant_df['business_area'] = (
    merchant_df['business_area']
    .str.lower()
    .str.replace(r"\s", "", regex=True)
)
merchant_df.to_parquet("../data/tables/merchant_df.parquet")

In [6]:
# Collapse the per-day partition folders of this snapshot into one parquet
# file. The last 10 characters of each folder name are stored as `date`.
root_dir = "../data/tables/transactions_20210228_20210827_snapshot"
_wanted_cols = ['user_id','merchant_abn','dollar_value']
_daily_frames = []
for fs in sorted(os.listdir(root_dir)):
    folder_dir = root_dir + "/" + fs
    # Skip plain files in the snapshot root, including the combined parquet
    # written at the end of this cell on an earlier run.
    if not os.path.isdir(folder_dir):
        continue
    # Read every *.parquet part file of the day. Picking "entry [1]" of an
    # unsorted listing depends on filesystem order and can select a
    # non-parquet sidecar file or miss additional part files.
    for _part in sorted(f for f in os.listdir(folder_dir) if f.endswith(".parquet")):
        temp_df = pd.read_parquet(folder_dir + "/" + _part, columns=_wanted_cols)
        temp_df['date'] = fs[-10:]
        _daily_frames.append(temp_df)
# Concatenate once; growing a frame inside the loop is quadratic.
if _daily_frames:
    trans_df = pd.concat(_daily_frames)
else:
    trans_df = pd.DataFrame(columns=_wanted_cols + ['date'])
trans_df.to_parquet(root_dir + "/" + "20210228_20210827.parquet")

In [7]:
# Collapse the per-day partition folders of this snapshot into one parquet
# file. The last 10 characters of each folder name are stored as `date`.
root_dir = "../data/tables/transactions_20210828_20220227_snapshot"
_wanted_cols = ['user_id','merchant_abn','dollar_value']
_daily_frames = []
for fs in sorted(os.listdir(root_dir)):
    folder_dir = root_dir + "/" + fs
    # Skip plain files in the snapshot root, including the combined parquet
    # written at the end of this cell on an earlier run.
    if not os.path.isdir(folder_dir):
        continue
    # Read every *.parquet part file of the day. Picking "entry [1]" of an
    # unsorted listing depends on filesystem order and can select a
    # non-parquet sidecar file or miss additional part files.
    for _part in sorted(f for f in os.listdir(folder_dir) if f.endswith(".parquet")):
        temp_df = pd.read_parquet(folder_dir + "/" + _part, columns=_wanted_cols)
        temp_df['date'] = fs[-10:]
        _daily_frames.append(temp_df)
# Concatenate once; growing a frame inside the loop is quadratic.
if _daily_frames:
    trans_df = pd.concat(_daily_frames)
else:
    trans_df = pd.DataFrame(columns=_wanted_cols + ['date'])
trans_df.to_parquet(root_dir + "/" + "20210828_20220227.parquet")

In [8]:
# Stitch the two cached half-year snapshots back into one transaction table.
_snapshot_files = [
    "../data/tables/transactions_20210228_20210827_snapshot/20210228_20210827.parquet",
    "../data/tables/transactions_20210828_20220227_snapshot/20210828_20220227.parquet",
]
trans_df = pd.concat([pd.read_parquet(_path) for _path in _snapshot_files])

In [9]:
# Keep only transactions that have a merchant fraud record for the same
# merchant on the same day (transaction `date` == fraud `order_datetime`).
merchant_merged_df = trans_df.merge(
    merchant_df,
    how="inner",
    left_on=["merchant_abn", "date"],
    right_on=["merchant_abn", "order_datetime"],
)

In [10]:
# Merchant training design matrix: the target first, then the numeric
# features, then one-hot blocks for business area, merchant id and revenue
# level (in that order).
# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional
# argument. dtype=float keeps the dummy columns numeric, so `.values` yields
# a float array regardless of the pandas default dummy dtype.
merchant_train_df = pd.concat(
    [
        merchant_merged_df[['fraud_probability','dollar_value','take_rate']],
        pd.get_dummies(merchant_merged_df['business_area'], dtype=float),
        pd.get_dummies(merchant_merged_df['merchant_abn'], dtype=float),
        pd.get_dummies(merchant_merged_df['revenue_level'], dtype=float),
    ],
    axis=1,
)

# Column 0 is the target (fraud_probability); everything after it is a feature.
merchant_X = merchant_train_df.iloc[:,1:].values
merchant_y = merchant_train_df.iloc[:,0].values

In [12]:
# NOTE(review): this SVR grid search over C and epsilon (5-fold CV) is
# commented out. The cells that follow read merchant_cv_score_lst and
# merchant_params_lst, which are only defined here, so on a fresh kernel they
# raise NameError unless this block is re-enabled.
# merchant_cv_score_lst = []
# merchant_params_lst = []

# for C in [0.1,1.0,5.0]:
#     for epsilon in [0.0,0.1,0.2]:
#         regr = make_pipeline(StandardScaler(), SVR(C=C, epsilon=epsilon))
#         merchant_cv_score_lst.append(cross_val_score(regr, merchant_X, merchant_y, cv=5))
#         merchant_params_lst.append((C,epsilon))

In [None]:
# Mean CV score per parameter combination (rows = combinations, cols = folds).
# Needs merchant_cv_score_lst from the SVR grid-search cell.
np.asarray(merchant_cv_score_lst).mean(axis=1)

In [None]:
# (C, epsilon) pair with the highest mean CV score.
# Needs both result lists from the SVR grid-search cell.
merchant_params_lst[np.asarray(merchant_cv_score_lst).mean(axis=1).argmax()]

In [None]:
# That is, C = 0.1 and epsilon = 0.2 give the best result.

In [13]:
# Consumer table -> user-id mapping -> consumer fraud records, chained as two
# inner joins (first on consumer_id, then on user_id).
consumer_df = (
    tbl_consumer
    .merge(consumer_user_details, on="consumer_id", how="inner")
    .merge(consumer_fraud_probability, on="user_id", how="inner")
)

In [14]:
# Keep only transactions that have a consumer fraud record for the same user
# on the same day (transaction `date` == fraud `order_datetime`).
consumer_merged_df = trans_df.merge(
    consumer_df,
    how="inner",
    left_on=["user_id", "date"],
    right_on=["user_id", "order_datetime"],
)

In [15]:
# Consumer training frame: the target first, then dollar_value, then one-hot
# blocks for state and gender (in that order).
# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional
# argument. dtype=float keeps the dummy columns numeric.
consumer_train_df = pd.concat(
    [
        consumer_merged_df[['fraud_probability','dollar_value']],
        pd.get_dummies(consumer_merged_df['state'], dtype=float),
        pd.get_dummies(consumer_merged_df['gender'], dtype=float),
    ],
    axis=1,
)

In [16]:
# Column 0 holds the target; every remaining column is a feature.
_consumer_target = consumer_train_df.iloc[:, 0]
_consumer_features = consumer_train_df.iloc[:, 1:]
consumer_y = _consumer_target.values
consumer_X = _consumer_features.values

In [None]:
# 5-fold CV of a standardised Ridge regression for each candidate alpha.
# consumer_cv_score_lst[k] holds the fold scores for consumer_params_lst[k].
consumer_cv_score_lst, consumer_params_lst = [], []
for alpha in (0.1, 1, 10):
    regr = make_pipeline(StandardScaler(), Ridge(alpha=alpha))
    _fold_scores = cross_val_score(regr, consumer_X, consumer_y, cv=5, n_jobs=-1)
    consumer_cv_score_lst.append(_fold_scores)
    consumer_params_lst.append(alpha)

In [None]:
# Mean CV score per alpha (rows = alphas, cols = folds).
np.asarray(consumer_cv_score_lst).mean(axis=1)

In [None]:
# Alpha with the highest mean CV score.
consumer_params_lst[np.asarray(consumer_cv_score_lst).mean(axis=1).argmax()]

In [None]:
# That is, alpha = 10 gives the best result

In [None]:
# Collapse the per-day partition folders of the test-period snapshot into one
# parquet file. The last 10 characters of each folder name are stored as `date`.
root_dir = "../data/tables/transactions_20220228_20220828_snapshot"
_wanted_cols = ['user_id','merchant_abn','dollar_value']
_daily_frames = []
for fs in sorted(os.listdir(root_dir)):
    folder_dir = root_dir + "/" + fs
    # Skip plain files in the snapshot root, including the combined parquet
    # written at the end of this cell on an earlier run.
    if not os.path.isdir(folder_dir):
        continue
    # Read every *.parquet part file of the day. Taking only the last sorted
    # entry would drop rows whenever a day is split over several part files.
    for _part in sorted(f for f in os.listdir(folder_dir) if f.endswith(".parquet")):
        temp_df = pd.read_parquet(folder_dir + "/" + _part, columns=_wanted_cols)
        temp_df['date'] = fs[-10:]
        _daily_frames.append(temp_df)
# Concatenate once; growing a frame inside the loop is quadratic.
if _daily_frames:
    trans_df_test = pd.concat(_daily_frames)
else:
    trans_df_test = pd.DataFrame(columns=_wanted_cols + ['date'])
trans_df_test.to_parquet(root_dir + "/" + "20220228_20220828.parquet")

In [21]:
# Reload the cached, combined test-period transactions.
root_dir = "../data/tables/transactions_20220228_20220828_snapshot"
_test_cache_path = "/".join([root_dir, "20220228_20220828.parquet"])
trans_df_test = pd.read_parquet(_test_cache_path)

In [22]:
# Attach merchant attributes to each test-period transaction.
# merchant_df has one row per merchant fraud record (it carries an
# order_datetime column), so a merchant can appear on several rows. Joining on
# merchant_abn alone would repeat every transaction once per record. Reduce
# the lookup to one row per merchant first, so each transaction appears at
# most once.
# NOTE(review): the order_datetime / fraud_probability columns that survive
# the de-duplication come from an arbitrary record and are not meaningful
# here. Merchants without any fraud record are still dropped by the inner
# join - confirm that is intended.
_merchant_lookup = merchant_df.drop_duplicates(subset="merchant_abn")
merchant_merged_df_test = pd.merge(
    trans_df_test,
    _merchant_lookup,
    on="merchant_abn",
    how="inner",
)

In [23]:
# Rebuild the merchant training matrix without the merchant-id dummies: the
# target first, then the numeric features, then one-hot blocks for business
# area and revenue level (in that order).
# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional
# argument. dtype=float keeps the dummy columns numeric.
merchant_train_df = pd.concat(
    [
        merchant_merged_df[['fraud_probability','dollar_value','take_rate']],
        pd.get_dummies(merchant_merged_df['business_area'], dtype=float),
        pd.get_dummies(merchant_merged_df['revenue_level'], dtype=float),
    ],
    axis=1,
)

# Column 0 is the target (fraud_probability); everything after it is a feature.
merchant_X = merchant_train_df.iloc[:,1:].values
merchant_y = merchant_train_df.iloc[:,0].values

In [24]:
# Merchant test design matrix: numeric features, then one-hot blocks for
# business area and revenue level.
# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional
# argument.
merchant_test_df = pd.concat(
    [
        merchant_merged_df_test[['dollar_value','take_rate']],
        pd.get_dummies(merchant_merged_df_test['business_area'], dtype=float),
        pd.get_dummies(merchant_merged_df_test['revenue_level'], dtype=float),
    ],
    axis=1,
)
# One-hot columns are derived from whatever categories occur in each frame, so
# train and test can disagree in number or order of columns. Align the test
# frame to the training feature columns (all but the leading target column):
# categories absent from the test data become all-zero columns, and categories
# never seen in training are dropped.
merchant_test_df = merchant_test_df.reindex(
    columns=merchant_train_df.columns[1:], fill_value=0.0
)
merchant_X_test = merchant_test_df.values

In [42]:
# Fit the standardised SVR (C=0.1, epsilon=0.2) on the merchant training data,
# then score the test-period transactions.
_merchant_svr = SVR(C=0.1, epsilon=0.2)
regr = make_pipeline(StandardScaler(), _merchant_svr).fit(merchant_X, merchant_y)
print("training finished")

merchant_y_pred = regr.predict(merchant_X_test)
print("prediction finished")

training finished
prediction finished


In [43]:
# Turn predicted and true fraud probabilities into classes by thresholding at
# 50 (via the sign of the offset) and report the share of agreeing rows.
_merchant_fitted = regr.predict(merchant_X)
y_pred = np.sign(_merchant_fitted - 50)
y_real = np.sign(merchant_y - 50)
_n_agree = (y_pred == y_real).sum()
print("merchant")
print("Accuracy on Training DataSet: %.2f%%" % (_n_agree / y_pred.shape[0] * 100))

merchant
Accuracy on Training DataSet: 98.08%


In [26]:
# Attach consumer attributes to each test-period transaction.
# consumer_df has one row per consumer fraud record (it carries an
# order_datetime column), so a user can appear on several rows. Joining on
# user_id alone would repeat every transaction once per record. Reduce the
# lookup to one row per user first, so each transaction appears at most once.
# NOTE(review): the order_datetime / fraud_probability columns that survive
# the de-duplication come from an arbitrary record and are not meaningful
# here. Users without any fraud record are still dropped by the inner join -
# confirm that is intended.
_consumer_lookup = consumer_df.drop_duplicates(subset="user_id")
consumer_merged_df_test = pd.merge(
    trans_df_test,
    _consumer_lookup,
    on="user_id",
    how="inner",
)

In [27]:
# Consumer test frame: dollar_value, then one-hot blocks for state and gender.
# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional
# argument.
consumer_test_df = pd.concat(
    [
        consumer_merged_df_test[['dollar_value']],
        pd.get_dummies(consumer_merged_df_test['state'], dtype=float),
        pd.get_dummies(consumer_merged_df_test['gender'], dtype=float),
    ],
    axis=1,
)
# One-hot columns are derived from whatever categories occur in each frame, so
# train and test can disagree in number or order of columns. Align the test
# frame to the training feature columns (all but the leading target column):
# categories absent from the test data become all-zero columns, and categories
# never seen in training are dropped.
consumer_test_df = consumer_test_df.reindex(
    columns=consumer_train_df.columns[1:], fill_value=0.0
)

In [28]:
# Plain NumPy feature matrix for the consumer test transactions.
consumer_X_test = consumer_test_df.to_numpy()

In [31]:
# Fit the standardised Ridge regression (alpha=10) on the consumer training
# data, then score the test-period transactions.
_consumer_ridge = Ridge(alpha=10)
regr = make_pipeline(StandardScaler(), _consumer_ridge).fit(consumer_X, consumer_y)
print("training finished")

consumer_y_pred = regr.predict(consumer_X_test)
print("prediction finished")

training finished
prediction finished


In [41]:
# Turn predicted and true fraud probabilities into classes by thresholding at
# 50 (via the sign of the offset) and report the share of agreeing rows.
_consumer_fitted = regr.predict(consumer_X)
y_pred = np.sign(_consumer_fitted - 50)
y_real = np.sign(consumer_y - 50)
_n_agree = (y_pred == y_real).sum()
print("consumer")
print("Accuracy on Training DataSet: %.2f%%" % (_n_agree / y_pred.shape[0] * 100))

consumer
Accuracy on Training DataSet: 98.41%
