In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# iplot won't work because I've not installed the extension
import chart_studio.plotly as ply
import plotly.offline as plyoff
import plotly.graph_objects as go
import plotly.subplots as plysub

plyoff.init_notebook_mode(connected=True)
init = go.Figure(data=[go.Scatter(x=[1,2], y=[42,42])], layout=go.Layout(title='Init'))
plyoff.iplot(init)

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'cluster'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
''' get the data '''
# load the data
fil = '../data/credit_card_data.csv'
schem = StructType([StructField('CUST_ID', StringType()), StructField('BALANCE', FloatType()),
                    StructField('BALANCE_FREQUENCY', FloatType()), StructField('PURCHASES', FloatType()),
                    StructField('ONEOFF_PURCHASES', FloatType()), StructField('INSTALLMENTS_PURCHASES', FloatType()),
                    StructField('CASH_ADVANCE', FloatType()), StructField('PURCHASES_FREQUENCY', FloatType()),
                    StructField('ONEOFF_PURCHASES_FREQUENCY', FloatType()),
                    StructField('PURCHASES_INSTALLMENTS_FREQUENCY', FloatType()),
                    StructField('CASH_ADVANCE_FREQUENCY', FloatType()), StructField('CASH_ADVANCE_TRX', FloatType()),
                    StructField('PURCHASES_TRX', FloatType()), StructField('CREDIT_LIMIT', FloatType()),
                    StructField('PAYMENTS', FloatType()), StructField('MINIMUM_PAYMENTS', FloatType()),
                    StructField('PRC_FULL_PAYMENT', FloatType()), StructField('TENURE', IntegerType())])
cc = spark.read.format('csv').options(header=True).schema(schem).load(fil)

# talk
cnt = cc.count()
print('%d records'%cnt)
cc.show(truncate=False)

8950 records
+-------+----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+---------+----------------+----------------+------+
|CUST_ID|BALANCE   |BALANCE_FREQUENCY|PURCHASES|ONEOFF_PURCHASES|INSTALLMENTS_PURCHASES|CASH_ADVANCE|PURCHASES_FREQUENCY|ONEOFF_PURCHASES_FREQUENCY|PURCHASES_INSTALLMENTS_FREQUENCY|CASH_ADVANCE_FREQUENCY|CASH_ADVANCE_TRX|PURCHASES_TRX|CREDIT_LIMIT|PAYMENTS |MINIMUM_PAYMENTS|PRC_FULL_PAYMENT|TENURE|
+-------+----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+---------+----------------+----------------+------+
|C10001 |40.90075  |0.818182         |95.4     |0.0             |95.4                  

### Data Prep

In [4]:
''' handle missing values '''
# check for missing values
nullCounts = {colm:cc.select(colm).where(col(colm).isNull()).count() for colm in cc.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in cc.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

# remove
cc = cc.dropna(how='any')

# talk some more
print('%d records'%cc.count())

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,MINIMUM_PAYMENTS,313.0,0.034972,FloatType
1,CREDIT_LIMIT,1.0,0.000112,FloatType
2,CUST_ID,0.0,0.0,StringType
3,BALANCE,0.0,0.0,FloatType
4,PRC_FULL_PAYMENT,0.0,0.0,FloatType
5,PAYMENTS,0.0,0.0,FloatType
6,PURCHASES_TRX,0.0,0.0,FloatType
7,CASH_ADVANCE_TRX,0.0,0.0,FloatType
8,CASH_ADVANCE_FREQUENCY,0.0,0.0,FloatType
9,PURCHASES_INSTALLMENTS_FREQUENCY,0.0,0.0,FloatType


8636 records


In [6]:
''' prepare the features '''
# get the features
features = [c for c in cc.columns if c not in ['CUST_ID']]

# create & scale the features vector
assr = VectorAssembler(inputCols=features, outputCol='features')
#scalr = MinMaxScaler(inputCol='features_raw', outputCol='features')
#pipe = Pipeline(stages=[assr, scalr]).fit(cc)
#cc = pipe.transform(cc).drop('features_raw')
cc = assr.transform(cc)

# talk
display(cc.limit(10).toPandas())
cc.select('CUST_ID', 'features').show(truncate=False)
cc.select('features').take(1)
print('First row features = %s'%cc.select('features').take(1)[0])

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,features
0,C10001,40.900749,0.818182,95.400002,0.0,95.400002,0.0,0.166667,0.0,0.083333,0.0,0.0,2.0,1000.0,201.802078,139.509781,0.0,12,"[40.90074920654297, 0.8181819915771484, 95.400..."
1,C10002,3202.467529,0.909091,0.0,0.0,0.0,6442.945312,0.0,0.0,0.0,0.25,4.0,0.0,7000.0,4103.032715,1072.34021,0.222222,12,"(3202.467529296875, 0.9090909957885742, 0.0, 0..."
2,C10003,2495.148926,1.0,773.169983,773.169983,0.0,0.0,1.0,1.0,0.0,0.0,0.0,12.0,7500.0,622.066772,627.28479,0.0,12,"[2495.14892578125, 1.0, 773.1699829101562, 773..."
3,C10005,817.714355,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0.0,1.0,1200.0,678.334778,244.791245,0.0,12,"[817.71435546875, 1.0, 16.0, 16.0, 0.0, 0.0, 0..."
4,C10006,1809.828735,1.0,1333.280029,0.0,1333.280029,0.0,0.666667,0.0,0.583333,0.0,0.0,8.0,1800.0,1400.057739,2407.246094,0.0,12,"[1809.8287353515625, 1.0, 1333.280029296875, 0..."
5,C10007,627.260803,1.0,7091.009766,6402.629883,688.380005,0.0,1.0,1.0,1.0,0.0,0.0,64.0,13500.0,6354.314453,198.065887,1.0,12,"[627.2608032226562, 1.0, 7091.009765625, 6402...."
6,C10008,1823.65271,1.0,436.200012,0.0,436.200012,0.0,1.0,0.0,1.0,0.0,0.0,12.0,2300.0,679.065063,532.033997,0.0,12,"[1823.6527099609375, 1.0, 436.20001220703125, ..."
7,C10009,1014.926453,1.0,861.48999,661.48999,200.0,0.0,0.333333,0.083333,0.25,0.0,0.0,5.0,7000.0,688.278564,311.963409,0.0,12,"[1014.9264526367188, 1.0, 861.489990234375, 66..."
8,C10010,152.225967,0.545455,1281.599976,1281.599976,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,3.0,11000.0,1164.77063,100.302261,0.0,12,"[152.22596740722656, 0.5454549789428711, 1281...."
9,C10011,1293.125,1.0,920.119995,0.0,920.119995,0.0,1.0,0.0,1.0,0.0,0.0,12.0,1200.0,1083.301025,2172.697754,0.0,12,"[1293.125, 1.0, 920.1199951171875, 0.0, 920.11..."


+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CUST_ID|features                                                                                                                                                                                            |
+-------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|C10001 |[40.90074920654297,0.8181819915771484,95.4000015258789,0.0,95.4000015258789,0.0,0.16666699945926666,0.0,0.08333300054073334,0.0,0.0,2.0,1000.0,201.8020782470703,139.50978088378906,0.0,12.0]       |
|C10002 |(17,[0,1,5,9,10,12,13,14,15,16],[3202.467529296875,0.9090909957885742,6442.9453125,0.25,4.0,7000.0,4103.03271484375,1072.3402099609375,0.2222220003604889,12.0])   

## Modeling

In [7]:
# split for cross-val
trainPerc = 0.7
randSeed = 42
tran, test = cc.select('CUST_ID', 'features').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# talk
print('Training Cases')
tran.select('CUST_ID').show()
print('Testing Cases')
test.select('CUST_ID').show()

Training Cases
+-------+
|CUST_ID|
+-------+
| C10001|
| C10002|
| C10005|
| C10006|
| C10007|
| C10009|
| C10012|
| C10013|
| C10014|
| C10018|
| C10019|
| C10020|
| C10022|
| C10024|
| C10027|
| C10028|
| C10029|
| C10033|
| C10035|
| C10038|
+-------+
only showing top 20 rows

Testing Cases
+-------+
|CUST_ID|
+-------+
| C10003|
| C10008|
| C10010|
| C10011|
| C10015|
| C10016|
| C10017|
| C10021|
| C10023|
| C10025|
| C10026|
| C10030|
| C10031|
| C10032|
| C10034|
| C10036|
| C10037|
| C10041|
| C10045|
| C10046|
+-------+
only showing top 20 rows



In [9]:
''' evaluate different clustering cardinalities - kmeans '''
# setup range that will be tried
kMax = 41
xs = list(range(2, kMax))
kCost = np.ones(kMax)*np.inf
kSil = np.ones(kMax)*-1

# iterate over k
models = [None]*kMax
for k in range(2, kMax):
    print('Trying k = %d'%k)
    # fit the kmeans model on the training set
    kmeans = KMeans(k=k, seed=randSeed, featuresCol='features')
    models[k] = kmeans.fit(tran)
    # eval the model on the training set
    kCost[k] = models[k].summary.trainingCost
    print('\tTraining Cost = %0.3f'%kCost[k])
    # eval the model on the testing set 
    testPred = models[k].transform(test)
    kSil[k] = ClusteringEvaluator().evaluate(testPred)
    print('\tSilhouette score = %0.3f'%kSil[k])
    
# show the scree plot
fig = plysub.make_subplots(rows=2, cols=1, print_grid=False, subplot_titles=('Train SSE vs. k', 'Test Silhouette vs. k'))
fig.add_trace(go.Scatter(x=xs, y=kCost[2:], mode='markers+lines'), 1, 1)
fig.add_trace(go.Scatter(x=xs, y=kSil[2:], mode='markers+lines'), 2, 1)
fig['layout']['title'] = 'Kmeans Results'
plyoff.plot(fig)

# find the min
bestK = np.argmin(kCost)
print('Best model has %d clusters, with a cost of %0.3f'%(bestK, kCost[bestK]))

Trying k = 2
	Training Cost = 198883103490.410
	Silhouette score = 0.658
Trying k = 3
	Training Cost = 164165627669.426
	Silhouette score = 0.598
Trying k = 4
	Training Cost = 147002156377.305
	Silhouette score = 0.568
Trying k = 5
	Training Cost = 131300805025.705
	Silhouette score = 0.526
Trying k = 6
	Training Cost = 113478760787.669
	Silhouette score = 0.546
Trying k = 7
	Training Cost = 101128582903.553
	Silhouette score = 0.506
Trying k = 8
	Training Cost = 95925798274.583
	Silhouette score = 0.507
Trying k = 9
	Training Cost = 88496292478.927
	Silhouette score = 0.545
Trying k = 10
	Training Cost = 82305242416.357
	Silhouette score = 0.506
Trying k = 11
	Training Cost = 78405156111.143
	Silhouette score = 0.496
Trying k = 12
	Training Cost = 72216479764.587
	Silhouette score = 0.492
Trying k = 13
	Training Cost = 71241346013.984
	Silhouette score = 0.374
Trying k = 14
	Training Cost = 68601131368.074
	Silhouette score = 0.353
Trying k = 15
	Training Cost = 64945867797.233
	Silho

In [10]:
''' evaluate different clustering cardinalities  - bisecting kmeans '''
# setup range that will be tried
bkCost = np.ones(kMax)*np.inf
bkSil = np.ones(kMax)*-1

# iterate over k
bmodels = [None]*kMax
for k in range(2, kMax):
    print('Trying k = %d'%k)
    # fit the bisecting kmeans model on the training set
    kmeans = BisectingKMeans(k=k, seed=randSeed, featuresCol='features')
    bmodels[k] = kmeans.fit(tran)
    # eval the model on the training set
    bkCost[k] = bmodels[k].summary.trainingCost
    print('\tTraining Cost = %0.3f'%bkCost[k])
    # eval the model on the testing set 
    testPred = bmodels[k].transform(test)
    bkSil[k] = ClusteringEvaluator().evaluate(testPred)
    print('\tSilhouette score = %0.3f'%bkSil[k])
    
# show the scree plot
fig = plysub.make_subplots(rows=2, cols=1, print_grid=False, subplot_titles=('Train SSE vs. k', 'Test Silhouette vs. k'))
fig.add_trace(go.Scatter(x=xs, y=bkCost[2:], mode='markers+lines'), 1, 1)
fig.add_trace(go.Scatter(x=xs, y=bkSil[2:], mode='markers+lines'), 2, 1)
fig['layout']['title'] = 'Bisecting Kmeans Results'
plyoff.plot(fig)

# find the min
bestBK = np.argmin(bkCost)
print('Best model has %d clusters, with a cost of %0.3f'%(bestBK, bkCost[bestBK]))

Trying k = 2
	Training Cost = 198883103490.411
	Silhouette score = 0.658
Trying k = 3
	Training Cost = 183049788754.766
	Silhouette score = 0.327
Trying k = 4
	Training Cost = 150403430502.307
	Silhouette score = 0.357
Trying k = 5
	Training Cost = 142900976401.824
	Silhouette score = 0.377
Trying k = 6
	Training Cost = 138401316594.269
	Silhouette score = 0.374
Trying k = 7
	Training Cost = 124901568437.786
	Silhouette score = 0.404
Trying k = 8
	Training Cost = 111891064559.740
	Silhouette score = 0.406
Trying k = 9
	Training Cost = 109398716443.580
	Silhouette score = 0.268
Trying k = 10
	Training Cost = 106148210034.392
	Silhouette score = 0.262
Trying k = 11
	Training Cost = 100575058904.100
	Silhouette score = 0.279
Trying k = 12
	Training Cost = 86893344984.014
	Silhouette score = 0.302
Trying k = 13
	Training Cost = 85521048393.448
	Silhouette score = 0.299
Trying k = 14
	Training Cost = 85961183679.397
	Silhouette score = 0.272
Trying k = 15
	Training Cost = 77057331427.004
	S

In [12]:
''' Evaluate best model on test set '''
# get the best
bestK = int(input('Enter the "best" k'))
bestMod = input('Enter the best model ("k" or bk")')
if bestMod == 'k':
    # kmeans
    bestModel = models[bestK]
    cst = kCost[bestK]
else:
    # bisecting kmeans
    bestModel = bmodels[bestK]
    cst = bkCost[bestK]
print('Best %s model has %d clusters, with a cost of %0.3f'%(bestMod, bestK, cst))

# predict
testPred = bestModel.transform(test)
# eval
evalSil = ClusteringEvaluator()
silhouette = evalSil.evaluate(testPred)
print('Silhouette score for %s model with %d cluster = [-1, %0.3f, 1]'%(bestMod, bestK, silhouette))
# get the centers
cents = pd.DataFrame(index=list(range(bestK)), data=bestModel.clusterCenters(), columns=features)
display(cents)

Enter the "best" k 6
Enter the best model ("k" or bk") k


Best k model has 6 clusters, with a cost of 113478760787.669
Silhouette score for k model with 6 cluster = [-1, 0.546, 1]


Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,885.125388,0.877749,520.789541,254.259873,266.908278,522.593912,0.453244,0.13404,0.34799,0.115121,2.391327,9.756427,2306.067832,961.693053,508.73834,0.152709,11.411841
1,5544.311042,0.990431,26880.324681,21526.438425,5353.886316,1928.995464,0.93421,0.855263,0.741228,0.070175,3.105263,145.0,15473.684211,26231.987202,3746.668123,0.533892,12.0
2,5995.346297,0.978321,965.68101,613.125827,352.555183,3906.114271,0.386316,0.206737,0.269639,0.366853,10.28,14.911304,9729.304348,2769.08588,2032.775805,0.010739,11.605217
3,4162.668319,0.911015,4658.854711,2860.404946,1798.587246,6333.460749,0.590469,0.418067,0.473431,0.314137,11.7875,52.64375,10499.375,13198.580695,1826.988108,0.259879,11.79375
4,3777.277374,0.988215,957.584638,119.282223,838.302406,983.472515,0.479517,0.032969,0.459035,0.105359,3.037037,17.333333,4007.407407,1613.468025,20010.987865,0.003086,11.888889
5,1379.81297,0.906336,1714.87817,1075.327396,640.182679,527.534935,0.637986,0.371339,0.446599,0.082384,1.779432,23.966847,7618.172592,2073.782462,533.072375,0.229995,11.805142


In [13]:
# add predictions to entire dataset
ccpred = bestModel.transform(cc)
for feat in features:
    ccpred.groupBy('prediction').agg(min(col(feat)), mean(col(feat)), max(col(feat))).show()

+----------+------------+------------------+------------+
|prediction|min(BALANCE)|      avg(BALANCE)|max(BALANCE)|
+----------+------------+------------------+------------+
|         1|   1268.8091| 5405.330897013347|   19043.139|
|         3|    4.382924| 4189.620371494737|   15244.749|
|         5|         0.0|1370.0603074958822|   6257.3906|
|         4|    915.7177| 3731.417682423311|   11209.371|
|         2|   43.103897|  6036.56365059162|   18495.559|
|         0|         0.0| 883.5241210564602|     4931.59|
+----------+------------+------------------+------------+

+----------+----------------------+----------------------+----------------------+
|prediction|min(BALANCE_FREQUENCY)|avg(BALANCE_FREQUENCY)|max(BALANCE_FREQUENCY)|
+----------+----------------------+----------------------+----------------------+
|         1|              0.272727|     0.957954540848732|                   1.0|
|         3|              0.090909|    0.9121116585163183|                   1.0|
|        

In [None]:
sc.stop()