In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col

In [2]:
#sc = SparkContext(appName="FraudDetection").getOrCreate()
sess = SparkSession(sc)

## Data Preprocessing
The csv file is not uploaded because it's 481 MB.

In [3]:
df = sess.read.csv("../pyspark-machine-learning-and-streaming/data/PS_20174392719_1491204439457_log.csv", 
                   inferSchema=True, 
                   header=True)

In [4]:
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [5]:
df.show(5)

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

In [6]:
df.count(), len(df.columns)

(6362620, 11)

In [7]:
df.createTempView("df_fraud")

In [8]:
df_model = df.select(['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 
                      'oldbalancedest', 'newbalanceDest', 'isFraud']).withColumn("isFraud", col("isFraud").cast("double"))
df_model.show(5)

+--------+--------+-------------+--------------+--------------+--------------+-------+
|    type|  amount|oldbalanceOrg|newbalanceOrig|oldbalancedest|newbalanceDest|isFraud|
+--------+--------+-------------+--------------+--------------+--------------+-------+
| PAYMENT| 9839.64|     170136.0|     160296.36|           0.0|           0.0|    0.0|
| PAYMENT| 1864.28|      21249.0|      19384.72|           0.0|           0.0|    0.0|
|TRANSFER|   181.0|        181.0|           0.0|           0.0|           0.0|    1.0|
|CASH_OUT|   181.0|        181.0|           0.0|       21182.0|           0.0|    1.0|
| PAYMENT|11668.14|      41554.0|      29885.86|           0.0|           0.0|    0.0|
+--------+--------+-------------+--------------+--------------+--------------+-------+
only showing top 5 rows



In [9]:
train_data, test_data = df_model.randomSplit([0.7,0.3])

In [10]:
train_data.show(5)

+-------+------+-------------+--------------+--------------+--------------+-------+
|   type|amount|oldbalanceOrg|newbalanceOrig|oldbalancedest|newbalanceDest|isFraud|
+-------+------+-------------+--------------+--------------+--------------+-------+
|CASH_IN|  5.44|          0.0|          5.44|     594031.45|    1014777.06|    0.0|
|CASH_IN|  8.44|      39384.0|      39392.44|    3314615.62|    3314607.18|    0.0|
|CASH_IN|  9.02|   2416260.59|    2416269.61|     342107.85|     215696.34|    0.0|
|CASH_IN| 13.86|   6868100.18|    6868114.04|    1085768.44|    1085754.58|    0.0|
|CASH_IN|  14.4|1.143460813E7| 1.143462253E7|       46093.0|     124958.22|    0.0|
+-------+------+-------------+--------------+--------------+--------------+-------+
only showing top 5 rows



In [11]:
train_data.schema

StructType(List(StructField(type,StringType,true),StructField(amount,DoubleType,true),StructField(oldbalanceOrg,DoubleType,true),StructField(newbalanceOrig,DoubleType,true),StructField(oldbalancedest,DoubleType,true),StructField(newbalanceDest,DoubleType,true),StructField(isFraud,DoubleType,true)))

## Models

In [12]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline

### Transformers

In [13]:
type_indexer = StringIndexer(inputCol='type',outputCol='typeIndex')
type_encoder = OneHotEncoder(inputCol='typeIndex',outputCol='typeVec')

In [14]:
assembler = VectorAssembler(inputCols=['typeVec', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalancedest', 'newbalanceDest'],
                            outputCol='features')

### Logistic Regression

In [19]:
%%time
log_reg = LogisticRegression(featuresCol='features', labelCol='isFraud').setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
pipeline = Pipeline(stages=[type_indexer, type_encoder, assembler, log_reg])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)
results.select('isFraud', 'prediction').show()

+-------+----------+
|isFraud|prediction|
+-------+----------+
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
|    0.0|       0.0|
+-------+----------+
only showing top 20 rows

Wall time: 1min 47s


In [20]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='isFraud')
AUC = my_eval.evaluate(results)
AUC

0.5

### Random Forest

In [16]:
%%time
rf = RandomForestClassifier().setLabelCol('isFraud').setFeaturesCol('features').setNumTrees(10)
pipeline = Pipeline(stages=[type_indexer, type_encoder, assembler, rf])
model = pipeline.fit(train_data)
results = model.transform(test_data)
results.show(5)

+-------+------+-------------+--------------+--------------+--------------+-------+---------+-------------+--------------------+--------------------+--------------------+----------+
|   type|amount|oldbalanceOrg|newbalanceOrig|oldbalancedest|newbalanceDest|isFraud|typeIndex|      typeVec|            features|       rawPrediction|         probability|prediction|
+-------+------+-------------+--------------+--------------+--------------+-------+---------+-------------+--------------------+--------------------+--------------------+----------+
|CASH_IN|  5.66|   5061561.06|    5061566.72|      60569.78|      60564.12|    0.0|      2.0|(4,[2],[1.0])|[0.0,0.0,1.0,0.0,...|[9.99874058036982...|[0.99987405803698...|       0.0|
|CASH_IN| 11.13|   4116439.74|    4116450.87|     347523.14|     347512.01|    0.0|      2.0|(4,[2],[1.0])|[0.0,0.0,1.0,0.0,...|[9.99874058036982...|[0.99987405803698...|       0.0|
|CASH_IN| 12.79|     601743.0|     601755.79|    2819794.75|    2819781.96|    0.0|      2

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [18]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='isFraud')
AUC = my_eval.evaluate(results)
AUC

0.8184765065543761