In [1]:
from flask import Flask, jsonify, request
from flask_cors import CORS, cross_origin

import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp

In [2]:
imported_model = tf.keras.models.load_model("./fraud_model/")

df_test = pd.read_csv('medical_insurance_fraud_test.csv', index_col=0)

def bucketize(val, size, count):
    i=0
    for i in range(count):
        if val <= (i+1)*size:
            return i
    return i

def bucketize_df(df):
    df['Age_group'] = [bucketize(x, 10, 5) for x in df['Age']]
    df['BMI_group'] = [bucketize(x, 10, 5) for x in df['BMI']]
    df.drop(['Age'], axis=1, inplace=True)
    df.drop(['BMI'], axis=1, inplace=True)
    return df

df_test = bucketize_df(df_test)

def one_hot_encode(df):
    df = pd.concat([df,pd.get_dummies(df['Age_group'], prefix='Age')],axis=1)
    df = pd.concat([df,pd.get_dummies(df['BMI_group'], prefix='BMI')],axis=1)
    df.drop(['Age_group'], axis=1, inplace=True)
    df.drop(['BMI_group'], axis=1, inplace=True)
    return df

df_test = one_hot_encode(df_test)

In [3]:
pd.value_counts(df_test['Fraud'])

0.0    30000
1.0      300
Name: Fraud, dtype: int64

In [4]:
df_test.Cost.describe()

count    30300.000000
mean     16526.867915
std      10337.683697
min       2253.404003
25%       7626.555685
50%      12943.563178
75%      25986.360584
max      80337.557453
Name: Cost, dtype: float64

In [5]:
testDataX = df_test.copy().drop(['Fraud'],axis=1)
testDataY = df_test['Fraud'].copy()

sX = pp.StandardScaler(copy=True, with_mean=True, with_std=True)
featuresToScale = testDataX.columns
testDataX.loc[:,featuresToScale] = sX.fit_transform(testDataX[featuresToScale])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [6]:
def anomalyScores(originalDF, reducedDF):
    loss = np.sum((np.array(originalDF) - \
                   np.array(reducedDF))**2, axis=1)
    loss = pd.Series(data=loss,index=originalDF.index)
    loss = (loss-np.min(loss))/(np.max(loss)-np.min(loss))
    
    print('Mean for anomaly scores: ', np.mean(loss))
    
    return loss

In [7]:
predictions = imported_model.predict(testDataX, verbose=1)
anomalyScoresAE = anomalyScores(testDataX, predictions)

W0523 14:57:05.064521 140157617170240 training.py:504] Falling back from v2 loop because of error: Failed to find data adapter that can handle input: <class 'pandas.core.frame.DataFrame'>, <class 'NoneType'>


Mean for anomaly scores:  0.0033387805516674014


In [None]:
app = Flask(__name__)
CORS(app)

@app.route("/katana-ml/api/v1.0/fraud/process", methods=['POST'])
def process():
    fraud_threshold = request.json['fraud_threshold']
    
    testDataCost = df_test['Cost'].copy()
    df_preds = pd.concat([testDataCost, testDataY, anomalyScoresAE], axis=1)
    df_preds.columns = ['Cost', 'Fraud', 'AnomalyScore']

    conditions = [
        (df_preds['Fraud'] == 1) & (df_preds['AnomalyScore'] >= fraud_threshold),
        (df_preds['Fraud'] == 0) & (df_preds['AnomalyScore'] >= fraud_threshold),
        (df_preds['Fraud'] == 1) & (df_preds['AnomalyScore'] < fraud_threshold)]
    choices = [1, 2, 3]

    df_preds['FraudPredict'] = np.select(conditions, choices, default=0)
    
    outliers = pd.DataFrame()
    outliers['transaction_id'] = df_preds.index.values
    outliers['fraud_predict'] = df_preds['FraudPredict']
    outliers['claim_cost'] = df_preds['Cost']
    
    outliers_non_fraud = outliers.loc[outliers['fraud_predict'] == 0]
    claim_cost_max = outliers_non_fraud['claim_cost'].max()
    
    outliers_fraud = outliers.loc[outliers['fraud_predict'] != 0]
    outliers_fraud['non_fraud_cost_max'] = claim_cost_max
    
    result = outliers_fraud.to_json(orient='records', date_format='iso')
    return result

@app.route("/katana-ml/api/v1.0/fraud/stats", methods=['POST'])
def fraud_stats():
    fraud_threshold = request.json['fraud_threshold']
    
    testDataCost = df_test['Cost'].copy()
    df_preds = pd.concat([testDataCost, testDataY, anomalyScoresAE], axis=1)
    df_preds.columns = ['Cost', 'Fraud', 'AnomalyScore']

    conditions = [
        (df_preds['Fraud'] == 1) & (df_preds['AnomalyScore'] >= fraud_threshold),
        (df_preds['Fraud'] == 0) & (df_preds['AnomalyScore'] >= fraud_threshold),
        (df_preds['Fraud'] == 1) & (df_preds['AnomalyScore'] < fraud_threshold)]
    choices = [1, 2, 3]

    df_preds['FraudPredict'] = np.select(conditions, choices, default=0)
    df_value_counts = pd.value_counts(df_preds['FraudPredict'])
    
    df_value_counts = df_value_counts.reset_index()
    df_value_counts.columns = ['unique_values', 'counts']
    
    return df_value_counts.to_json(orient='records', date_format='iso')

# running REST interface
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=5000)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
