# 1. data preparation

## 1.1 read train and testb data

In [1]:
import pandas as pd
import numpy as np

def count_one_ratio(arr):
    counts = arr.value_counts()
    try:
        if counts.sum()<100:
            return 0
        return counts[1]/counts.sum()
    except:
        return 0

train_data = pd.read_csv('../data/full_size/atec_anti_fraud_train.csv',index_col = 0)
testb_data = pd.read_csv('../data/full_size/atec_anti_fraud_test_b.csv',index_col = 0)

## 1.2 split data into labeled and unlabeled

In [2]:
unlabeled_data=(train_data[train_data['label']==-1])
normal_data = train_data[train_data['label']==0]
anormal_data=train_data[train_data['label']==1]
labeled_data = train_data[(train_data['label']==0)|(train_data['label']==1)]

## 1.3 train test split

In [3]:
train_num = int(labeled_data.shape[0]*0.8)
train_x = labeled_data.iloc[:train_num,:].drop(columns=['label'])
train_y = labeled_data.iloc[:train_num,:]['label']
test_x = labeled_data.iloc[train_num:,:].drop(columns=['label'])
test_y = labeled_data.iloc[train_num:,:]['label']

# 2. generate risk ratio of every feature value

In [98]:
risk_ratios = {} # key:feature_name,value: list of anormal probabilities for every possible value
for feature in labeled_data.columns[2:]:
    max_value = labeled_data[feature].max()
    min_value = labeled_data[feature].min()
    value_num = labeled_data[feature].value_counts().shape[0]
    if value_num>10000:
        # guarantee the number in every bin won't be too small
        bins = 1000
        ratio = labeled_data.groupby(pd.cut(labeled_data[feature],np.linspace(min_value-0.1,max_value,bins))).label.mean()
        risk_ratios[feature]=ratio
    else:
        risk_ratios[feature]=labeled_data.groupby(feature).label.agg(count_one_ratio)


## 2.1 get high risk feature value

In [99]:
high_risks = {}
for feature,ratios in risk_ratios.items():
    high_risk = ratios[ratios>0.35]
    if not high_risk.empty:
        high_risks[feature]=high_risk

In [100]:
len(high_risks.items())

74

In [88]:
isinstance(high_risks['f82'].index,pd.core.indexes.category.CategoricalIndex)

True

In [291]:
high_risks['f82'].index.get_loc(76000)

0

# 3. prediction and evaluation on test dataset

## 3.1 define predict function

In [101]:
def mannual_predict(X):
    '''
    X should be in shape of (n_samples,n_columns)
    '''
    risk_thr = 0.5
    Y = np.zeros(X.shape[0])
    Y_proba = np.zeros(X.shape[0])
    i = 0
    all_risk = []
    for x in X.iterrows():
        x=x[1]
        risks = []
        risk = 0
        for feature_name,feature_ratio in high_risks.items():
            value = x[feature_name]
            if value in feature_ratio:
                risks.append(feature_ratio[value])
            else:
                if isinstance(feature_ratio.index,pd.core.indexes.category.CategoricalIndex):
                    try:
                        loc = feature_ratio.index.get_loc(value)
                        risks.append(feature_ratio[loc])
                    except:
                        pass
        if risks:
            risk_score = np.array(risks).mean()
            Y_proba[i] = risk_score
            if risk_score>risk_thr:
                Y[i] = 1
        i+=1
    return Y,Y_proba

# def mannual_predict_proba(X):
#     '''
#     X should be in shape of (n_samples,n_columns)
#     '''
#     Y = np.zeros(X.shape[0])
#     i = 0
#     all_risk = []
#     for x in X.iterrows():
#         print(i)
#         x=x[1]
#         risks = []
#         risk = 0
#         for feature_name,feature_ratio in high_risks.items():
#             value = x[feature_name]
#             if value in feature_ratio:
#                 risks.append(feature_ratio[value])
#         if risks:
#             risk_score = np.array(risks).mean()
#             Y[i]=risk_score
#         i+=1
#     return Y

In [102]:
predict_y,predict_y_proba = mannual_predict(test_x)

In [63]:
from sklearn import metrics
import bisect
def get_tpr_from_fpr(fpr_array, tpr_array, target):
    fpr_index = np.where(fpr_array == target)
    assert target <= 0.01, 'the value of fpr in the custom metric function need lt 0.01'
    if len(fpr_index[0]) > 0:
        return np.mean(tpr_array[fpr_index])
    else:
        tmp_index = bisect.bisect(fpr_array, target)
        fpr_tmp_1 = fpr_array[tmp_index-1]
        fpr_tmp_2 = fpr_array[tmp_index]
        if (target - fpr_tmp_1) > (fpr_tmp_2 - target):
            tpr_index = tmp_index
        else:
            tpr_index = tmp_index - 1
        return tpr_array[tpr_index]


def eval_metric(labels,pred):
    fpr, tpr, _ = metrics.roc_curve(labels, pred, pos_label=1)
    tpr1 = get_tpr_from_fpr(fpr, tpr, 0.001)
    tpr2 = get_tpr_from_fpr(fpr, tpr, 0.005)
    tpr3 = get_tpr_from_fpr(fpr, tpr, 0.01)
    return 0.4*tpr1 + 0.3*tpr2 + 0.3*tpr3

In [78]:
from sklearn.metrics import precision_score, recall_score
print(precision_score(test_y,predict_y),recall_score(test_y,predict_y),eval_metric(test_y,predict_y))

0.6020408163265306 0.02425986842105263 0.02425986842105263


In [67]:
predict_y = (predict_y_proba>0.5).astype(int)

In [79]:
pd.Series(predict_y_proba).value_counts()

0.000000    197904
0.619792        31
0.765766        15
0.624334         7
0.673267         7
0.641557         5
0.640645         4
0.611570         4
0.646018         4
0.692779         4
0.643618         3
0.631562         2
0.642419         2
0.637097         2
0.641988         2
0.628794         2
0.659643         1
0.634876         1
0.625793         1
0.652127         1
dtype: int64

In [72]:
predict_y_proba.value_counts()

AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'