In [1]:
import findspark
findspark.init()

In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
import numpy as np
from pyspark.ml.feature import StandardScaler
import pyspark.sql.functions as f
import pyspark.sql.types
import pandas as pd
from pyspark.sql import Row

In [4]:
spark = SparkSession.builder.getOrCreate()
sc = SparkContext.getOrCreate()
from pyspark.ml.feature import VectorAssembler

In [5]:
trans_data = spark.read.csv("New_Aggregated_data_final.csv", inferSchema=True, header=True)

In [6]:
customer_data = spark.read.csv("Customer_data1.csv", inferSchema=True, header=True)

In [7]:
trans_data = trans_data.withColumn("sum_prev_day_onl", trans_data["sum_prev_day_onl"].cast("integer"))
trans_data = trans_data.withColumn("sum_prev_day_mon_onl", trans_data["sum_prev_day_mon_onl"].cast("integer"))

In [44]:
trans_data1 = trans_data.select('_c0','amt', 'Balance',
      'sum_prev_day', 'cnt_prev_day_onl', 'sum_prev_day_onl',
       '24hrsAvg','qtrAvg','wkAvg', 'monAvg','yrAvg').fillna(0)

In [39]:
# from pyspark.sql.functions import *
# train.where(col('cc_num').isNull()).count()
# df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()
# customer_data.select([count(when(col(c).isNull(),c)).alias(c) for c in customer_data.columns]).show()

In [40]:
# # merging the data together by their unique "id"
# train = trans_data.join(customer_data,how='left',on='cc_num')
# # all_data.show()

### PCA

In [61]:
#Create a single vector column
cols = trans_data1.drop('_c0').columns
cols

['amt',
 'Balance',
 'sum_prev_day',
 'cnt_prev_day_onl',
 'sum_prev_day_onl',
 '24hrsAvg',
 'qtrAvg',
 'wkAvg',
 'monAvg',
 'yrAvg']

In [62]:
assembler = VectorAssembler(inputCols=cols, outputCol='features')
output_dat = assembler.transform(trans_data1).select('_c0','features')
output_dat.show(5, truncate = False)

+---+-------------------------------------------------------------+
|_c0|features                                                     |
+---+-------------------------------------------------------------+
|0  |(10,[0,1],[95.0,942.0])                                      |
|1  |(10,[0],[90.0])                                              |
|2  |(10,[0,1],[188.0,6746.0])                                    |
|3  |[100.0,133.0,373.0,0.0,0.0,124.33,124.33,124.33,124.33,124.0]|
|4  |[79.0,3115.0,100.0,0.0,0.0,100.0,118.25,118.25,118.25,118.0] |
+---+-------------------------------------------------------------+
only showing top 5 rows



In [63]:
#Center and scale data
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(output_dat)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(output_dat)
scaledData.select(['_c0','scaledFeatures']).show(5, truncate=False) #sample centered data

+---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0|scaledFeatures                                                                                                                                                                                                |
+---+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |[-0.15324509141918674,-0.2082417185038452,-0.7332778530194066,-0.3436387790372,-0.21640155213233406,-0.6804918641670585,-1.2617558227807781,-1.150575009187998,-1.2597350985596223,-1.2583192791022992]       |
|1  |[-0.17600379267423114,-0.49825834867770463,-0.7332778530194066,-0.3436387790372,-0.21640155213233406,-0.6804918641670585,-1.2617558227807781,-1

In [64]:
#apply PCA
pca = PCA(k=8, inputCol=scaler.getOutputCol(), outputCol='pcaFeatures')

model = pca.fit(scaledData)
transformed_feature = model.transform(scaledData)

In [88]:
transformed_feature.select('pcaFeatures').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|pcaFeatures                                                                                                                                                          |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[2.5256773608503233,0.8295170804016654,-0.1787406852652773,-0.29977534876299716,-0.044399620663500394,0.034012382923377465,-0.056329338674227125,0.35128814478512715]|
|[2.530499552908705,0.833973624625901,-0.46824336706349384,-0.27786286627386836,-0.06037674515106872,0.03324446594219013,-0.05765940257350882,0.3571751044550575]     |
|[2.440385761937181,0.8613738724395384,1.6014204629343514,-0.6470649527907247,0.21990249601630182,0.05257956359881902,-0.049043093109365,0.2875723163327102]    

In [83]:
#Correlation of PCs
np.round(np.corrcoef(np.round(model.pc.toArray(),1), rowvar=False), 4)

array([[ 1.    ,  0.378 ,  0.8452, -0.2243,  0.2349, -0.0256,  0.0256,
         0.    ],
       [ 0.378 ,  1.    , -0.0573, -0.0399,  0.0266, -0.0052, -0.0261,
        -0.0327],
       [ 0.8452, -0.0573,  1.    ,  0.0111, -0.0103,  0.0101, -0.0101,
         0.0211],
       [-0.2243, -0.0399,  0.0111,  1.    ,  0.001 ,  0.0895, -0.0392,
        -0.0126],
       [ 0.2349,  0.0266, -0.0103,  0.001 ,  1.    , -0.0552, -0.0196,
         0.0215],
       [-0.0256, -0.0052,  0.0101,  0.0895, -0.0552,  1.    ,  0.0468,
         0.0269],
       [ 0.0256, -0.0261, -0.0101, -0.0392, -0.0196,  0.0468,  1.    ,
         0.0307],
       [ 0.    , -0.0327,  0.0211, -0.0126,  0.0215,  0.0269,  0.0307,
         1.    ]])

In [65]:
# percentage of variance explained by each PC
np.round(100.00*model.explainedVariance.toArray(),4)

array([53.7645, 15.0258,  9.997 ,  9.0107,  6.3449,  2.2406,  1.8031,
        1.7031])

In [66]:
# compute loadings of each feature
pcs = np.round(model.pc.toArray(),4)
pcs

array([[-0.1963,  0.2095, -0.0126, -0.7499,  0.5859,  0.0489, -0.0032,
        -0.0969],
       [-0.0012, -0.0318,  0.9992, -0.0167,  0.0091, -0.0012,  0.0048,
        -0.0127],
       [-0.3   ,  0.4045,  0.0115,  0.4062,  0.1406,  0.4948,  0.0026,
        -0.5627],
       [-0.2639,  0.4167,  0.0089, -0.2755, -0.6029, -0.4606,  0.0824,
        -0.3136],
       [-0.3331,  0.4029,  0.0238, -0.0495, -0.2335,  0.3855, -0.1155,
         0.7121],
       [-0.3105,  0.241 ,  0.0118,  0.4313,  0.4535, -0.5859,  0.2194,
         0.2495],
       [-0.3878, -0.3387, -0.0131, -0.0391, -0.065 ,  0.0794,  0.2516,
        -0.0115],
       [-0.3784, -0.2318, -0.0041,  0.0623,  0.034 , -0.1801, -0.8713,
        -0.0737],
       [-0.3879, -0.3381, -0.0109, -0.0319, -0.061 ,  0.0607,  0.212 ,
        -0.0211],
       [-0.3878, -0.3386, -0.0136, -0.0398, -0.0648,  0.0796,  0.2537,
        -0.009 ]])

In [72]:
pcs = np.round(model.pc.toArray(),4)
df_pc = pd.DataFrame(pcs, columns = ['PC'+str(i) for i in range(1, 9)], index = cols)
df_pc

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8
amt,-0.1963,0.2095,-0.0126,-0.7499,0.5859,0.0489,-0.0032,-0.0969
Balance,-0.0012,-0.0318,0.9992,-0.0167,0.0091,-0.0012,0.0048,-0.0127
sum_prev_day,-0.3,0.4045,0.0115,0.4062,0.1406,0.4948,0.0026,-0.5627
cnt_prev_day_onl,-0.2639,0.4167,0.0089,-0.2755,-0.6029,-0.4606,0.0824,-0.3136
sum_prev_day_onl,-0.3331,0.4029,0.0238,-0.0495,-0.2335,0.3855,-0.1155,0.7121
24hrsAvg,-0.3105,0.241,0.0118,0.4313,0.4535,-0.5859,0.2194,0.2495
qtrAvg,-0.3878,-0.3387,-0.0131,-0.0391,-0.065,0.0794,0.2516,-0.0115
wkAvg,-0.3784,-0.2318,-0.0041,0.0623,0.034,-0.1801,-0.8713,-0.0737
monAvg,-0.3879,-0.3381,-0.0109,-0.0319,-0.061,0.0607,0.212,-0.0211
yrAvg,-0.3878,-0.3386,-0.0136,-0.0398,-0.0648,0.0796,0.2537,-0.009


In [70]:
df_pc['PC1']

amt                -0.1963
Balance            -0.0012
sum_prev_day       -0.3000
cnt_prev_day_onl   -0.2639
sum_prev_day_onl   -0.3331
24hrsAvg           -0.3105
qtrAvg             -0.3878
wkAvg              -0.3784
monAvg             -0.3879
yrAvg              -0.3878
Name: PC1, dtype: float64

### Feature Engineering

In [59]:
from pyspark.sql.functions import *
from pyspark.sql.functions import unix_timestamp

In [60]:
train = train.withColumn("Birthdate",from_unixtime(unix_timestamp(train['dob'], 'MM/dd/yyyy')))

In [61]:
train=(train.withColumn('Yearofbirth',year(train['Birthdate'])))

In [62]:
train = train.withColumn("trans_date",from_unixtime(unix_timestamp(train['trans_date'], 'MM/dd/yyyy')))

In [64]:
train=(train.withColumn('Month',month(train['fulltime'])))
train = train.withColumn("Time",hour(train["fulltime"]))

In [65]:
train = train.withColumn('today_date',lit(2019))
train = train.withColumn('Age',train['today_date']-train['Yearofbirth'])
train.select('today_date','Age','dob').show(5)

+----------+---+---------+
|today_date|Age|      dob|
+----------+---+---------+
|      2019| 45|9/23/1974|
|      2019| 45|9/23/1974|
|      2019| 45|9/23/1974|
|      2019| 45|9/23/1974|
|      2019| 45|9/23/1974|
+----------+---+---------+
only showing top 5 rows



In [68]:
#spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
a=train.drop('first','last')
train2 = a.toPandas()

In [69]:
train2["fulltimepd"] =  pd.to_datetime(train2['unix_time'],unit='s')
train2["Weekday"] = train2["fulltimepd"].dt.strftime("%A")

In [70]:
#train["Time_short_for_grouping"] = train["Time_short_for_grouping"].astype(int)
bins = [2,6,11,18,22]
labels = ["Early Morning","Morning","Afternoon","Evening"]
train2["Time of day"] = pd.cut(train2.Time,bins=bins,labels=labels)
train2["Time of day"]=train2["Time of day"].cat.add_categories('Midnight') 
train2["Time of day"] = train2["Time of day"].fillna('Midnight')

In [71]:
train2["Month"] = train2["fulltimepd"].dt.strftime("%B")

In [72]:
def haversine_(lat1, lng1, lat2, lng2):
    """function to calculate haversine distance between two co-ordinates"""
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return(h)

In [73]:
def manhattan_distance_pd(lat1, lng1, lat2, lng2):
    """function to calculate manhatten distance between pick_drop"""
    a = haversine_(lat1, lng1, lat1, lng2)
    b = haversine_(lat1, lng1, lat2, lng1)
    return a + b

In [74]:
train2['hvsine']= haversine_(train2['lat'].values,
                                 train2['long'].values, train2['merch_lat'].values,
                                             train2['merch_long'].values)

In [75]:
train2['mnhtn']= manhattan_distance_pd(train2['lat'].values,
                                 train2['long'].values, train2['merch_lat'].values,
                                             train2['merch_long'].values)

In [76]:
train2['hvsine2']= haversine_(train2['prev_lat'].values,
                                 train2['prev_long'].values, train2['merch_lat'].values,
                                             train2['merch_long'].values)



In [77]:
train2['mnhtn2']= manhattan_distance_pd(train2['prev_lat'].values,
                                 train2['prev_long'].values, train2['merch_lat'].values,
                                             train2['merch_long'].values)



In [78]:
#Distance and time dfference
train2['distandtime1'] = train2['mnhtn2']/train2['time_diff_min']
# train2['distandtime2'] = train2['hvsine2']/train2['time_diff_min']
# train['distandtime3'] = train['bearing2']/train['time_diff_min']

#train2['distandtime'] =train2['manhtn']/train2['time_diff_min']

In [79]:
# Amount versus year average
train2['amt_yrAvg'] = train2['amt']/train2['yrAvg']

In [80]:
#
train3=train2.copy(deep=True)

# Final

In [81]:
trainimp_f = train3[['Channel', 'Transaction Type', 'gender', 'amt', 'Balance',
       'Month', 'Weekday','Time of day','Age','mnhtn','sum_prev_day', 'cnt_prev_day_onl', 'sum_prev_day_onl',
       '24hrsAvg','qtrAvg','wkAvg', 'monAvg','yrAvg','mnhtn2','distandtime1','amt_yrAvg','is_fraud']]

In [82]:
trainimp_f.isnull().sum()

Channel                 0
Transaction Type        0
gender                  0
amt                     0
Balance                 0
Month                   0
Weekday                 0
Time of day             0
Age                     0
mnhtn                   0
sum_prev_day         1496
cnt_prev_day_onl    15761
sum_prev_day_onl    17759
24hrsAvg             1496
qtrAvg                332
wkAvg                 332
monAvg                332
yrAvg                 332
mnhtn2                 98
distandtime1           98
amt_yrAvg             332
is_fraud                0
dtype: int64

In [83]:
colsna= ['sum_prev_day','cnt_prev_day_onl','sum_prev_day_onl',
         '24hrsAvg','qtrAvg','wkAvg','monAvg','yrAvg','mnhtn2','distandtime1','amt_yrAvg']

In [84]:
for col in colsna:
    trainimp_f[col] = trainimp_f[col].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [85]:
trainimp_f.head()

Unnamed: 0,Channel,Transaction Type,gender,amt,Balance,Month,Weekday,Time of day,Age,mnhtn,...,sum_prev_day_onl,24hrsAvg,qtrAvg,wkAvg,monAvg,yrAvg,mnhtn2,distandtime1,amt_yrAvg,is_fraud
0,POS,POS,F,95,942,January,Friday,Midnight,45,1.688628,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,POS,POS,F,90,0,January,Friday,Evening,45,2.709298,...,0.0,0.0,0.0,0.0,0.0,0.0,1.020535,0.001001,0.0,0
2,Web,Online,F,188,6746,January,Friday,Evening,45,1.880668,...,0.0,0.0,0.0,0.0,0.0,0.0,2.175526,0.022514,0.0,0
3,Mobile,USSD,F,100,133,January,Saturday,Afternoon,45,1.911466,...,0.0,124.33,124.33,124.33,124.33,124.0,3.791882,0.003087,0.806452,0
4,ATM,ATM,F,79,3115,January,Sunday,Morning,45,1.804176,...,0.0,100.0,118.25,118.25,118.25,118.0,1.52947,0.001447,0.669492,0


In [86]:
from pyspark.sql.types import (StructField, StringType,
                              IntegerType, StructType, DoubleType)

In [87]:
data_schema = StructType([StructField("Channel", StringType(), True),
               StructField("Transaction Type", StringType(), True),
               StructField("gender", StringType(), True),
               StructField("amt", IntegerType(), True),
               StructField("Balance", IntegerType(), True),
               StructField("Month", StringType(), True),
               StructField("Weekday", StringType(), True),
               StructField("Time of day", StringType(), True),
               StructField("Age", IntegerType(), True),
               StructField("mnhtn",  DoubleType(), True),
               StructField("sum_prev_day", DoubleType(), True),
               StructField("sum_prev_day_onl", DoubleType(), True),
               StructField("cnt_prev_day_onl", DoubleType(), True),
               StructField("24hrsAvg", DoubleType(), True),
               StructField("wkAvg", DoubleType(), True),
               StructField("monAvg", DoubleType(), True),
               StructField("qtrAvg", DoubleType(), True),
               StructField("yrAvg", DoubleType(), True),
               StructField("mnhtn2", DoubleType(), True),
               StructField("distandtime1", DoubleType(), True),
               StructField("amt_yrAvg", DoubleType(), True),
               StructField("is_fraud", IntegerType(), True)])

In [88]:
#Convert
#spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
#from pyspark.sql.types import *
#from pyspark.sql import SqlContext
trainimp_f = spark.createDataFrame(trainimp_f,schema=data_schema)

In [89]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

### MLlib Transformer Pipeline

In [90]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import (RandomForestClassifier,
                                       GBTClassifier)

indexers = [StringIndexer(inputCol=column, 
                          outputCol=column+"_index",
                          handleInvalid="keep").fit(trainimp_f) 
            for column in list(['Transaction Type',
                         'gender',
                        "Time of day"]) ]



assembler = VectorAssembler(inputCols= ['amt',
                                        "Age",
                                        "Time of day_index",
                                        'amt_yrAvg','mnhtn2','distandtime1'], 
                            outputCol='features')


rfc = RandomForestClassifier(labelCol='is_fraud',
                             featuresCol= 'features',
                             maxDepth = 11,numTrees=40,seed=1)    

pipeline = Pipeline(stages=indexers+[assembler,rfc])

In [91]:
train_data,test_data = trainimp_f.randomSplit([0.7,0.3])

In [92]:
rfc_model2 = pipeline.fit(train_data)

In [94]:
rfc_preds3 = rfc_model2.transform(test_data)

In [96]:
rfc_preds3.select('amt','mnhtn2','distandtime1','amt_yrAvg','is_fraud','prediction').show()

+---+-------------------+--------------------+--------------------+--------+----------+
|amt|             mnhtn2|        distandtime1|           amt_yrAvg|is_fraud|prediction|
+---+-------------------+--------------------+--------------------+--------+----------+
|  6| 0.6004767007199595|7.269931350297886E-5| 0.06976744186046512|       0|       0.0|
|  6|  5.278624833901226| 0.04529064636551888|0.047619047619047616|       0|       0.0|
|  6|0.21698341972920043|0.007215943456242116| 0.07407407407407407|       0|       0.0|
|  6|  2.817629414586735|0.020340957367793348| 0.06451612903225806|       0|       0.0|
|  7| 0.6871846682760476|0.002072017694183771| 0.07865168539325842|       0|       0.0|
|  7| 1.3819106657909628|0.002747992892521...| 0.07777777777777778|       0|       0.0|
|  7|   3.03338696845326|0.003177818834480394| 0.12727272727272726|       0|       0.0|
|  8|   2.28200209680964|0.018108253426516747| 0.07920792079207921|       0|       0.0|
|  8|  1.996620501516868|0.02414

In [72]:
my_binary_eval = BinaryClassificationEvaluator(labelCol='is_fraud')
print(my_binary_eval.evaluate(rfc_preds))

0.9995282126010989


## Model Evaluation

In [73]:
tp = rfc_preds[(rfc_preds.is_fraud == 1) & (rfc_preds.prediction == 1)].count()
tn = rfc_preds[(rfc_preds.is_fraud == 0) & (rfc_preds.prediction == 0)].count()
fp = rfc_preds[(rfc_preds.is_fraud == 0) & (rfc_preds.prediction == 1)].count()
fn = rfc_preds[(rfc_preds.is_fraud == 1) & (rfc_preds.prediction == 0)].count()
print ("True Positives:", tp)
print ("True Negatives:", tn)
print ("False Positives:", fp)
print ("False Negatives:", fn)
print ("Total", rfc_preds.count())

True Positives: 170
True Negatives: 6058
False Positives: 19
False Negatives: 3
Total 6250


In [54]:
tp = gbt_preds[(gbt_preds.is_fraud == 1) & (gbt_preds.prediction == 1)].count()
tn = gbt_preds[(gbt_preds.is_fraud == 0) & (gbt_preds.prediction == 0)].count()
fp = gbt_preds[(gbt_preds.is_fraud == 0) & (gbt_preds.prediction == 1)].count()
fn = gbt_preds[(gbt_preds.is_fraud == 1) & (gbt_preds.prediction == 0)].count()
print ("True Positives:", tp)
print ("True Negatives:", tn)
print ("False Positives:", fp)
print ("False Negatives:", fn)
print ("Total", gbt_preds.count())

True Positives: 156
True Negatives: 6065
False Positives: 5
False Negatives: 19
Total 6245


## Saving & Loading Models

In [55]:
rfc_model.write().overwrite().save("models/rfc_model")

In [56]:
type(rfc_model)

pyspark.ml.classification.RandomForestClassificationModel

In [70]:
from pyspark.ml.feature import IndexToString, StringIndexer

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)

print("Transformed string column '%s' to indexed column '%s'"
      % (indexer.getInputCol(), indexer.getOutputCol()))
indexed.show()

print("StringIndexer will store labels in output column metadata\n")

converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)

print("Transformed indexed column '%s' back to original string column '%s' using "
      "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("id", "categoryIndex", "originalCategory").show()

Transformed string column 'category' to indexed column 'categoryIndex'
+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+

StringIndexer will store labels in output column metadata

Transformed indexed column 'categoryIndex' back to original string column 'originalCategory' using labels in metadata
+---+-------------+----------------+
| id|categoryIndex|originalCategory|
+---+-------------+----------------+
|  0|          0.0|               a|
|  1|          2.0|               b|
|  2|          1.0|               c|
|  3|          0.0|               a|
|  4|          0.0|               a|
|  5|          1.0|               c|
+---+-------------+----------------+

