# To insert into section 1.4 of the final submission
We recalculate some of the previously derived measures, this time using only the authorized transactions and the raw values of the purchase amounts.

In [34]:
# Stack the historical and the new transactions into a single frame
# (rows of df_hist_trans first). ignore_index is not passed, so the row
# labels of both inputs are kept as they are.
df_all_trans = pd.concat([df_hist_trans, df_new_trans])

In [89]:
# The historical rows are already part of df_all_trans; drop the separate
# reference and ask the garbage collector to reclaim memory.
del df_hist_trans
gc.collect()

198

In [39]:
df_all_trans = df_all_trans[df_all_trans['authorized_flag']==1] # Only rows with authorized_flag == 1 are used from here on
# Linear rescaling of purchase_amount into purchase_amount_raw.
# NOTE(review): 0.00150265118 and 497.06 presumably invert the normalisation
# applied to purchase_amount in the source data - confirm their origin.
df_all_trans['purchase_amount_raw'] = df_all_trans['purchase_amount']/0.00150265118 + 497.06

In [66]:
# Per-card aggregates over the authorized transactions:
#   card_id 'size'             -> number of transactions
#   merchant_id 'nunique'      -> number of distinct merchants
#   purchase_amount_raw 'sum'  -> total raw amount
#   purchase_date 'max'        -> date of the most recent purchase
agg_funcs_auth = {
    'card_id': ['size'],
    'merchant_id': ['nunique'],
    'purchase_amount_raw': ['sum'],
    'purchase_date': ['max']
}
# Because each entry is a list of functions, the result has two-level
# (column, function) column labels, indexed by card_id.
df_agg_auth = df_all_trans.groupby('card_id').agg(agg_funcs_auth)

In [67]:
# Collapse the (column, function) label pairs into 'column_function' names
df_agg_auth.columns = df_agg_auth.columns.map(lambda col: '_'.join(col).strip())
# Turn the card_id index back into a regular column
df_agg_auth = df_agg_auth.reset_index(drop=False)

In [72]:
# Days elapsed between each card's most recent purchase and May 1st, 2018
reference_date = datetime.datetime.strptime('May 1 2018', '%b %d %Y')
days_since_last = (reference_date - df_agg_auth['purchase_date_max']).dt.days
df_agg_auth['last_date_upto_may'] = days_since_last
# The raw date is no longer needed once the day count is stored
df_agg_auth.drop(columns='purchase_date_max', inplace=True)

In [74]:
# Attach the per-card aggregates; the left join keeps every train/test card
df_train_po = pd.merge(df_train, df_agg_auth, how='left', on='card_id')
df_test_po = pd.merge(df_test, df_agg_auth, how='left', on='card_id')

In [75]:
# Give the aggregated columns their final feature names in both frames
agg_feature_names = {
    'card_id_size': 'n_purchase_amount',
    'merchant_id_nunique': 'n_merchant',
    'purchase_amount_raw_sum': 'sum_purchase_amount',
}
for frame in (df_train_po, df_test_po):
    frame.rename(columns=agg_feature_names, inplace=True)

In [90]:
# df_agg_auth has been merged into df_train_po / df_test_po; release it.
del df_agg_auth
gc.collect()

0

In [80]:
#Create year-month variable
def create_date_year_month(df, source_column, preposition):
    """Add a 'YYYY-MM' string column derived from a datetime column.

    Args:
        df: DataFrame to modify in place.
        source_column: Name of the datetime column to read.
        preposition: Prefix of the new column, which is named
            ``<preposition>_ym``.

    Returns:
        The same ``df`` object, with the new column added.
    """
    year_month = df[source_column].dt.strftime('%Y-%m')
    df[preposition + '_ym'] = year_month
    return df
# Adds the 'purchase_ym' (year-month) column that the monthly aggregations
# further down group on.
df_all_trans = create_date_year_month(df_all_trans, 'purchase_date', 'purchase')

In [83]:
#Shape of the transactions over time
def compute_shape(user):
    """Return the sign of the curvature of one card's monthly spending.

    The raw purchase amounts are summed per ``purchase_ym``, the monthly
    totals are placed on an evenly spaced grid over [-1, 1] (one point per
    distinct month, in sorted month order) and a degree-2 polynomial is
    fitted. The sign of the quadratic coefficient is the feature: 1 for a
    convex profile, -1 for a concave one, 0 for no curvature.

    Args:
        user: DataFrame with the columns ``purchase_ym`` and
            ``purchase_amount_raw`` holding the transactions of one card.

    Returns:
        A one-row DataFrame with the single column ``purchase_shape``.
    """
    monthly_totals = user.groupby('purchase_ym')['purchase_amount_raw'].sum().values
    n_months = len(monthly_totals)

    # A parabola needs at least three points. With fewer, np.polyfit is
    # underdetermined and the sign of its quadratic coefficient is an
    # artefact of the least-squares solver, so report "no curvature".
    if n_months < 3:
        return pd.DataFrame({'purchase_shape': [0.0]})

    grid = np.linspace(-1, 1, n_months)
    coeffs = np.polyfit(grid, monthly_totals, 2)
    return pd.DataFrame({'purchase_shape': [np.sign(coeffs[0])]})

# One row per card with the sign of the curvature of its monthly spend
df_coeffs = (
    df_all_trans
    .groupby('card_id')[['purchase_ym', 'purchase_amount_raw']]
    .apply(compute_shape)
    .reset_index(level=0)  # move card_id from the index into a column
)

In [91]:
# Attach the purchase_shape feature to the train and test cards
df_train_po = pd.merge(df_train_po, df_coeffs, how='left', on='card_id')
df_test_po = pd.merge(df_test_po, df_coeffs, how='left', on='card_id')

In [99]:
# df_coeffs has been merged into df_train_po / df_test_po; release it.
del df_coeffs
gc.collect()

14

In [105]:
# Ratio between each card's largest and median monthly spend

# Step 1: total raw amount per card and month
df_amounts_over_time = (
    df_all_trans
    .groupby(['card_id', 'purchase_ym'])['purchase_amount_raw']
    .sum()
    .reset_index()
)
# Step 2: max / median of those monthly totals, one row per card
df_ratios = (
    df_amounts_over_time
    .groupby('card_id')['purchase_amount_raw']
    .apply(lambda monthly: monthly.max() / monthly.median())
    .reset_index(level=0)
    .rename(columns={'purchase_amount_raw': 'ratio_amount'})
)

In [108]:
# Attach the ratio_amount feature to the train and test cards
df_train_po = pd.merge(df_train_po, df_ratios, how='left', on='card_id')
df_test_po = pd.merge(df_test_po, df_ratios, how='left', on='card_id')

In [114]:
# df_ratios has been merged into the train/test frames and
# df_amounts_over_time was only needed to build it; release both.
del df_amounts_over_time, df_ratios
gc.collect()

85

In [115]:
#Time differences for favourite merchant
def diff_times_fav(user):
    """Summarise the gaps between purchases at a card's favourite merchant.

    The favourite merchant is the ``merchant_id`` with the most rows in
    ``user``. The purchase dates at that merchant are sorted and the
    consecutive gaps are expressed in hours.

    Args:
        user: DataFrame with the columns ``merchant_id`` and
            ``purchase_date`` (datetime) holding the transactions of one card.

    Returns:
        A one-row DataFrame with the columns ``diff_time_fav_mean`` and
        ``diff_time_fav_std`` (population standard deviation). Both are 0.0
        when no gap can be measured, i.e. the card has no non-null merchant
        or bought only once at its favourite merchant.
    """
    visits_per_merchant = user.groupby('merchant_id').size()
    # groupby drops null keys, so a card whose merchant ids are all missing
    # has no groups at all and idxmax() would raise.
    if visits_per_merchant.empty:
        return pd.DataFrame({'diff_time_fav_mean': [0.0], 'diff_time_fav_std': [0.0]})

    id_merch_temp = visits_per_merchant.idxmax()
    at_favourite = user['merchant_id'] == id_merch_temp
    dates_temp = user.loc[at_favourite, 'purchase_date'].sort_values().values

    # A single purchase has no gap; mean/std of an empty array would be NaN
    # (with a RuntimeWarning), so return the neutral value directly.
    if len(dates_temp) < 2:
        return pd.DataFrame({'diff_time_fav_mean': [0.0], 'diff_time_fav_std': [0.0]})

    # Normalise to nanoseconds so the 3.6e12 ns-per-hour divisor is valid
    # whatever resolution the dates are stored in.
    gaps_ns = np.diff(dates_temp).astype('timedelta64[ns]')
    # NOTE(review): numpy keeps the timedelta64 dtype when dividing by a
    # float, so the quotient appears to be truncated to whole hours before
    # the float cast - confirm whether fractional hours are wanted.
    diff_temp = (gaps_ns / 3.6e+12).astype(float)
    return pd.DataFrame({'diff_time_fav_mean': [diff_temp.mean()], 'diff_time_fav_std': [diff_temp.std()]})

# One row per card with the mean/std gap (in hours) between purchases at
# the card's favourite merchant
df_diff_times = (
    df_all_trans
    .groupby('card_id')[['merchant_id', 'purchase_date']]
    .apply(diff_times_fav)
    .reset_index(level=0)  # move card_id from the index into a column
)
# Cards without any measurable gap get 0. The result is assigned back
# instead of calling fillna(inplace=True) on a selected column: that form is
# chained assignment and leaves the frame unchanged under pandas
# Copy-on-Write.
df_diff_times = df_diff_times.fillna({'diff_time_fav_mean': 0, 'diff_time_fav_std': 0})

In [118]:
# Attach the favourite-merchant gap features to the train and test cards
df_train_po = pd.merge(df_train_po, df_diff_times, how='left', on='card_id')
df_test_po = pd.merge(df_test_po, df_diff_times, how='left', on='card_id')

In [124]:
# df_diff_times has been merged into df_train_po / df_test_po; release it.
del df_diff_times
gc.collect()

70

In [125]:
#Time differences for all transactions
def diff_times(user):
    """Summarise the gaps between consecutive purchases of one card.

    The purchase dates are sorted and the consecutive gaps are expressed in
    hours.

    Args:
        user: Series of purchase dates (datetime) of one card.

    Returns:
        A one-row DataFrame with the columns ``diff_time_mean`` and
        ``diff_time_std`` (population standard deviation). Both are 0.0 when
        the card has fewer than two purchases, matching the zero fill used
        for the favourite-merchant gap features in this file.
    """
    ordered_dates = np.sort(user.values)

    # With fewer than two dates there is no gap; mean/std of an empty array
    # would be NaN (with a RuntimeWarning) and nothing downstream fills it.
    if len(ordered_dates) < 2:
        return pd.DataFrame({'diff_time_mean': [0.0], 'diff_time_std': [0.0]})

    # Normalise to nanoseconds so the 3.6e12 ns-per-hour divisor is valid
    # whatever resolution the dates are stored in.
    gaps_ns = np.diff(ordered_dates).astype('timedelta64[ns]')
    # NOTE(review): numpy keeps the timedelta64 dtype when dividing by a
    # float, so the quotient appears to be truncated to whole hours before
    # the float cast - confirm whether fractional hours are wanted.
    diff_temp = (gaps_ns / 3.6e+12).astype(float)
    return pd.DataFrame({'diff_time_mean': [diff_temp.mean()], 'diff_time_std': [diff_temp.std()]})

# One row per card with the mean/std gap (in hours) between all its purchases
df_diff_times2 = (
    df_all_trans
    .groupby('card_id')['purchase_date']
    .apply(diff_times)
    .reset_index(level=0)  # move card_id from the index into a column
)

In [128]:
# Attach the all-transactions gap features to the train and test cards
df_train_po = pd.merge(df_train_po, df_diff_times2, how='left', on='card_id')
df_test_po = pd.merge(df_test_po, df_diff_times2, how='left', on='card_id')

In [134]:
# df_diff_times2 has been merged into df_train_po / df_test_po; release it.
del df_diff_times2
gc.collect()

63

In [136]:
# Flag (1/0) whether the card appears in the new-transactions set
new_card_ids = df_new_trans['card_id'].values
for frame in (df_train_po, df_test_po):
    frame['in_new'] = frame['card_id'].isin(new_card_ids) * 1

In [142]:
# The in_new flag was the last use of df_new_trans in this section; its rows
# are also part of df_all_trans. Release the separate frame.
del df_new_trans
gc.collect()

14

We use a multilayer perceptron (MLP) classifier with a single hidden layer of 50 neurons, trained on SMOTE-oversampled data.

In [180]:
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier

# 5-fold splitter used to produce out-of-fold predictions on the train set
kf = KFold(n_splits=5)

# Model inputs: the three card features plus the engineered per-card features
feature_cols = [
    'feature_1', 'feature_2', 'feature_3',
    'n_purchase_amount', 'n_merchant', 'sum_purchase_amount',
    'last_date_upto_may', 'purchase_shape', 'ratio_amount',
    'diff_time_fav_mean', 'diff_time_fav_std',
    'diff_time_mean', 'diff_time_std',
    'in_new',
]
X = df_train_po[feature_cols]
# Target column
y = df_train_po['outliers']

In [173]:
# Out-of-fold predictions for the train set with 5 folds
os_smote = SMOTE(random_state=1)
y_train_prob = np.zeros(y.shape[0])

for train_index, test_index in kf.split(X):
    # kf.split yields positional indices, so X and y are both sliced with
    # .iloc (plain y[...] would be a label lookup).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train = y.iloc[train_index]

    # Over-sample the training fold only, so the held-out fold stays
    # untouched. fit_resample is the current imbalanced-learn name for the
    # removed fit_sample.
    os_data_X, os_data_y = os_smote.fit_resample(X_train, y_train)
    # Keep the feature names whether the sampler returns an array or a frame
    os_data_X = pd.DataFrame(data=os_data_X, columns=X_train.columns)

    # Fit the MLPC: one hidden layer of 50 units; (50,) is a real tuple,
    # whereas (50) is just the integer 50.
    clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=100, early_stopping=True, verbose=False, random_state=1, tol=0.001)
    # scikit-learn expects a 1-D target, not a single-column frame
    clf.fit(os_data_X, np.ravel(os_data_y))
    # Second probability column (index 1 of clf.classes_) for the held-out rows
    y_train_prob[test_index] = clf.predict_proba(X_test)[:, 1]

In [183]:
# Flag a train card as a possible outlier when its out-of-fold probability is at least 0.9
y_train_pred = (y_train_prob >= 0.9).astype(int)

In [None]:
# Predictions for the test set: refit on the whole (over-sampled) train set

# Over-sample the data. fit_resample is the current imbalanced-learn name
# for the removed fit_sample.
os_data_X, os_data_y = os_smote.fit_resample(X, y)
# Keep the feature names whether the sampler returns an array or a frame
os_data_X = pd.DataFrame(data=os_data_X, columns=X.columns)

# Fit the MLPC: one hidden layer of 50 units; (50,) is a real tuple, whereas
# (50) is just the integer 50.
clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=100, early_stopping=True, verbose=False, random_state=1, tol=0.001)
# scikit-learn expects a 1-D target, not a single-column frame
clf.fit(os_data_X, np.ravel(os_data_y))

# Score the test cards on exactly the columns (and order) used for training
X_test = df_test_po[list(X.columns)]
y_test_prob = clf.predict_proba(X_test)[:, 1]

In [184]:
# Flag a test card as a possible outlier when its predicted probability is at least 0.9
y_test_pred = (y_test_prob >= 0.9).astype(int)

Store only the final predictions (the binary `possible_out` flag).

In [189]:
# Record the binary flags next to the features they were predicted from
df_train_po['possible_out'] = y_train_pred
df_test_po['possible_out'] = y_test_pred

# Carry only card_id and the flag over to the main train/test frames
flag_cols = ['card_id', 'possible_out']
df_train = pd.merge(df_train, df_train_po[flag_cols], how='left', on='card_id')
df_test = pd.merge(df_test, df_test_po[flag_cols], how='left', on='card_id')