In [23]:
import pandas as pd
import numpy as np
import datetime as dt

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.clustering import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline

# iplot won't work because I've not installed the extension
import chart_studio.plotly as ply
import plotly.offline as plyoff
import plotly.graph_objects as go
import plotly.subplots as plysub

plyoff.init_notebook_mode(connected=True)
init = go.Figure(data=[go.Scatter(x=[1,2], y=[42,42])], layout=go.Layout(title='Init'))
plyoff.iplot(init)

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'gmm'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [20]:
''' get the data '''
# load the data - can't parse the timestamp for some reason, so just let it be string
fil = '../data/sales_data_sample.csv'
schem = StructType([StructField('ORDERNUMBER', IntegerType()), StructField('QUANTITYORDERED', IntegerType()),
                    StructField('PRICEEACH', FloatType()), StructField('ORDERLINENUMBER', IntegerType()),
                    StructField('SALES', FloatType()), StructField('ORDERDATE', StringType()),# TimestampType()),
                    StructField('STATUS', StringType()), StructField('QTR_ID', IntegerType()),
                    StructField('MONTH_ID', IntegerType()), StructField('YEAR_ID', IntegerType()),
                    StructField('PRODUCTLINE', StringType()), StructField('MSRP', FloatType()),
                    StructField('PRODUCTCODE', StringType()), StructField('CUSTOMERNAME', StringType()),
                    StructField('PHONE', IntegerType()), StructField('ADDRESSLINE1', StringType()),
                    StructField('ADDRESSLINE2', StringType()), StructField('CITY', StringType()),
                    StructField('STATE', StringType()), StructField('POSTALCODE', StringType()),
                    StructField('COUNTRY', StringType()), StructField('TERRITORY', StringType()),
                    StructField('CONTACTLASTNAME', FloatType()), StructField('CONTACTFIRSTNAME', StringType()),
                    StructField('DEALSIZE', StringType())])
sales = spark.read.format('csv').options(header=True, timestampFormat='M/d/yyyy HH:MM').schema(schem).load(fil)

# talk
cnt = sales.count()
print('%d records'%cnt)
sales.show(truncate=False)

2823 records
+-----------+---------------+---------+---------------+-------+---------------+-------+------+--------+-------+-----------+----+-----------+--------------------------+----------+-----------------------------+------------+-------------+--------+----------+---------+---------+---------------+----------------+--------+
|ORDERNUMBER|QUANTITYORDERED|PRICEEACH|ORDERLINENUMBER|SALES  |ORDERDATE      |STATUS |QTR_ID|MONTH_ID|YEAR_ID|PRODUCTLINE|MSRP|PRODUCTCODE|CUSTOMERNAME              |PHONE     |ADDRESSLINE1                 |ADDRESSLINE2|CITY         |STATE   |POSTALCODE|COUNTRY  |TERRITORY|CONTACTLASTNAME|CONTACTFIRSTNAME|DEALSIZE|
+-----------+---------------+---------+---------------+-------+---------------+-------+------+--------+-------+-----------+----+-----------+--------------------------+----------+-----------------------------+------------+-------------+--------+----------+---------+---------+---------------+----------------+--------+
|10107      |30             |95.7

In [36]:
# check out an order - will need to aggregate the data by orderid
display(sales.where(col('ORDERNUMBER')==10159).orderBy(col('ORDERLINENUMBER')).toPandas())

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,...,TERRITORY,DEALSIZE,STATUS_str,PRODUCTCODE_str,STATE_str,COUNTRY_str,POSTALCODE_str,DEALSIZE_str,TERRITORY_str,PRODUCTLINE_str
0,10159,50,69.800003,1,3490.0,Shipped,4,10,2003,Classic Cars,...,,Medium,0.0,56.0,1.0,0.0,4.0,0.0,1.0,0.0
1,10159,41,100.0,2,8296.349609,Shipped,4,10,2003,Classic Cars,...,,Large,0.0,69.0,1.0,0.0,4.0,2.0,1.0,0.0
2,10159,24,73.419998,3,1762.079956,Shipped,4,10,2003,Classic Cars,...,,Small,0.0,72.0,1.0,0.0,4.0,1.0,1.0,0.0
3,10159,25,100.0,4,3638.0,Shipped,4,10,2003,Classic Cars,...,,Medium,0.0,78.0,1.0,0.0,4.0,0.0,1.0,0.0
4,10159,21,81.209999,5,1705.410034,Shipped,4,10,2003,Classic Cars,...,,Small,0.0,90.0,1.0,0.0,4.0,1.0,1.0,0.0
5,10159,23,67.099998,6,1543.300049,Shipped,4,10,2003,Classic Cars,...,,Small,0.0,94.0,1.0,0.0,4.0,1.0,1.0,0.0
6,10159,32,100.0,7,4618.879883,Shipped,4,10,2003,Classic Cars,...,,Medium,0.0,91.0,1.0,0.0,4.0,0.0,1.0,0.0
7,10159,21,64.660004,8,1357.859985,Shipped,4,10,2003,Motorcycles,...,,Small,0.0,80.0,1.0,0.0,4.0,1.0,1.0,2.0
8,10159,35,35.400002,9,1239.0,Shipped,4,10,2003,Motorcycles,...,,Small,0.0,84.0,1.0,0.0,4.0,1.0,1.0,2.0
9,10159,31,71.599998,10,2219.600098,Shipped,4,10,2003,Motorcycles,...,,Small,0.0,86.0,1.0,0.0,4.0,1.0,1.0,2.0


### Data Prep

In [21]:
# drop some columns that I have no doubt will not be useful for modeling
sales = sales.drop('CONTACTLASTNAME', 'ADDRESSLINE2', 'CUSTOMERNAME', 'CONTACTFIRSTNAME', 'ADDRESSLINE1', 'ORDERDATE')

In [30]:
''' handle missing values '''
# presumably important columns (for modeling)
importantCols = ['STATE', 'POSTALCODE', 'ORDERNUMBER', 'TERRITORY', 'COUNTRY',
                 'CITY', 'PRODUCTCODE', 'QUANTITYORDERED', 'MSRP', 'PRODUCTLINE',
                 'YEAR_ID', 'MONTH_ID', 'QTR_ID', 'STATUS', 'SALES', 'ORDERLINENUMBER',
                 'PRICEEACH', 'DEALSIZE']

# check for missing values
nullCounts = {colm:sales.select(colm).where(col(colm).isNull()).count() for colm in sales.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in sales.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])
nullCountsDF['Important'] = [c in importantCols for c in nullCountsDF['Column']]

# talk
display(nullCountsDF)

# fill null states with 'NA' as they are ex-US
sales = sales.fillna(value='NA', subset='STATE').fillna(value='90001_96162', subset='POSTALCODE')

# talk some more
print('%d records'%sales.count())
display(sales.limit(10).toPandas())

Unnamed: 0,Column,Freq.,Rel. Freq.,Type,Important
0,PHONE,2589.0,0.917109,IntegerType,False
1,STATE,1486.0,0.52639,StringType,True
2,POSTALCODE,76.0,0.026922,StringType,True
3,ORDERNUMBER,0.0,0.0,IntegerType,True
4,MSRP,0.0,0.0,FloatType,True
5,TERRITORY,0.0,0.0,StringType,True
6,COUNTRY,0.0,0.0,StringType,True
7,CITY,0.0,0.0,StringType,True
8,PRODUCTCODE,0.0,0.0,StringType,True
9,PRODUCTLINE,0.0,0.0,StringType,True


2823 records


Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,MSRP,PRODUCTCODE,PHONE,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,DEALSIZE
0,10107,30,95.699997,2,2871.0,Shipped,1,2,2003,Motorcycles,95.0,S10_1678,2125558000.0,NYC,NY,10022,USA,,Small
1,10121,34,81.349998,5,2765.899902,Shipped,2,5,2003,Motorcycles,95.0,S10_1678,,Reims,,51100,France,EMEA,Small
2,10134,41,94.739998,2,3884.340088,Shipped,3,7,2003,Motorcycles,95.0,S10_1678,,Paris,,75508,France,EMEA,Medium
3,10145,45,83.260002,6,3746.699951,Shipped,3,8,2003,Motorcycles,95.0,S10_1678,,Pasadena,CA,90003,USA,,Medium
4,10159,49,100.0,14,5205.27002,Shipped,4,10,2003,Motorcycles,95.0,S10_1678,,San Francisco,CA,90001_96162,USA,,Medium
5,10168,36,96.660004,1,3479.76001,Shipped,4,10,2003,Motorcycles,95.0,S10_1678,,Burlingame,CA,94217,USA,,Medium
6,10180,29,86.129997,9,2497.77002,Shipped,4,11,2003,Motorcycles,95.0,S10_1678,,Lille,,59000,France,EMEA,Small
7,10188,48,100.0,1,5512.319824,Shipped,4,11,2003,Motorcycles,95.0,S10_1678,,Bergen,,N 5804,Norway,EMEA,Medium
8,10201,22,98.57,2,2168.540039,Shipped,4,12,2003,Motorcycles,95.0,S10_1678,,San Francisco,CA,90001_96162,USA,,Small
9,10211,41,100.0,14,4708.439941,Shipped,1,1,2004,Motorcycles,95.0,S10_1678,,Paris,,75016,France,EMEA,Medium


In [33]:
''' see some value counts '''
for colm in ['STATUS', 'PRODUCTLINE', 'PRODUCTCODE', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'DEALSIZE']:
    print(colm)
    sales.select(colm).groupBy(colm).count().show()

STATUS
+----------+-----+
|    STATUS|count|
+----------+-----+
|   Shipped| 2617|
|   On Hold|   44|
| Cancelled|   60|
|  Resolved|   47|
|In Process|   41|
|  Disputed|   14|
+----------+-----+

PRODUCTLINE
+----------------+-----+
|     PRODUCTLINE|count|
+----------------+-----+
|     Motorcycles|  331|
|    Vintage Cars|  607|
|           Ships|  234|
|Trucks and Buses|  301|
|    Classic Cars|  967|
|          Trains|   77|
|          Planes|  306|
+----------------+-----+

PRODUCTCODE
+-----------+-----+
|PRODUCTCODE|count|
+-----------+-----+
|   S18_4600|   27|
|   S18_1749|   22|
|   S12_3891|   26|
|   S18_2248|   22|
|  S700_1138|   26|
|   S32_1268|   27|
|   S12_1099|   25|
|   S18_2795|   26|
|   S24_1937|   25|
|   S32_3522|   27|
|   S18_1097|   28|
|   S18_1662|   26|
|   S12_1666|   28|
|   S24_3969|   22|
|   S24_1578|   26|
|   S24_4048|   26|
|   S18_3320|   26|
|   S24_3816|   26|
|   S18_3136|   27|
|   S32_2509|   28|
+-----------+-----+
only showing top 20 ro

In [31]:
''' index the string columns '''
# set the string columns to index
strCols = ['STATUS', 'PRODUCTLINE', 'PRODUCTCODE', 'STATE', 'POSTALCODE', 'COUNTRY', 'TERRITORY', 'DEALSIZE']
featCols = [c+'_str' for c in strCols]
# do the indexing
indxr = StringIndexer(inputCols=strCols, outputCols=featCols)
sales = indxr.fit(sales).transform(sales)
# talk
display(sales.limit(10).toPandas())

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,...,TERRITORY,DEALSIZE,STATUS_str,PRODUCTCODE_str,STATE_str,COUNTRY_str,POSTALCODE_str,DEALSIZE_str,TERRITORY_str,PRODUCTLINE_str
0,10107,30,95.699997,2,2871.0,Shipped,1,2,2003,Motorcycles,...,,Small,0.0,32.0,3.0,0.0,2.0,1.0,1.0,2.0
1,10121,34,81.349998,5,2765.899902,Shipped,2,5,2003,Motorcycles,...,EMEA,Small,0.0,32.0,0.0,2.0,17.0,1.0,0.0,2.0
2,10134,41,94.739998,2,3884.340088,Shipped,3,7,2003,Motorcycles,...,EMEA,Medium,0.0,32.0,0.0,2.0,60.0,0.0,0.0,2.0
3,10145,45,83.260002,6,3746.699951,Shipped,3,8,2003,Motorcycles,...,,Medium,0.0,32.0,1.0,0.0,33.0,0.0,1.0,2.0
4,10159,49,100.0,14,5205.27002,Shipped,4,10,2003,Motorcycles,...,,Medium,0.0,32.0,1.0,0.0,4.0,0.0,1.0,2.0
5,10168,36,96.660004,1,3479.76001,Shipped,4,10,2003,Motorcycles,...,,Medium,0.0,32.0,1.0,0.0,3.0,0.0,1.0,2.0
6,10180,29,86.129997,9,2497.77002,Shipped,4,11,2003,Motorcycles,...,EMEA,Small,0.0,32.0,0.0,2.0,59.0,1.0,0.0,2.0
7,10188,48,100.0,1,5512.319824,Shipped,4,11,2003,Motorcycles,...,EMEA,Medium,0.0,32.0,0.0,7.0,36.0,0.0,0.0,2.0
8,10201,22,98.57,2,2168.540039,Shipped,4,12,2003,Motorcycles,...,,Small,0.0,32.0,1.0,0.0,4.0,1.0,1.0,2.0
9,10211,41,100.0,14,4708.439941,Shipped,1,1,2004,Motorcycles,...,EMEA,Medium,0.0,32.0,0.0,2.0,38.0,0.0,0.0,2.0


In [None]:
''' prepare the features '''
# get the features
features = featCols + ['QUANTITYORDERED', 'SALES', 'QTR_ID', 'MONTH_ID', 'YEAR_ID']

# create & scale the features vector
assr = VectorAssembler(inputCols=features, outputCol='features_raw')
scalr = MinMaxScaler(inputCol='features_raw', outputCol='features')
pipe = Pipeline(stages=[assr, scalr]).fit(sales)
sales = pipe.transform(sales).drop('features_raw')
sales = assr.transform(sales)

# talk
display(sales.limit(10).toPandas())
sales.select('features').take(1)
print('First row features = %s'%sales.select('features').take(1)[0])

In [None]:
# check for multicollinearity
corr = Correlation.corr(sales, column='features', method='pearson')
corrdf = pd.DataFrame(index=features, data=corr.collect()[0][0].toArray(), columns=features)
display(corrdf)

## Modeling

In [None]:
# split for cross-val
trainPerc = 0.7
randSeed = 42
tran, test = cc.select('CUST_ID', 'features').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# talk
print('Training Cases')
tran.select('CUST_ID').show()
print('Testing Cases')
test.select('CUST_ID').show()

In [None]:
''' evaluate different clustering cardinalities - kmeans '''
# setup range that will be tried
kMax = 41
xs = list(range(2, kMax))
kCost = np.ones(kMax)*np.inf
kSil = np.ones(kMax)*-1

# iterate over k
models = [None]*kMax
for k in range(2, kMax):
    print('Trying k = %d'%k)
    # fit the kmeans model on the training set
    kmeans = KMeans(k=k, seed=randSeed, featuresCol='features')
    models[k] = kmeans.fit(tran)
    # eval the model on the training set
    kCost[k] = models[k].summary.trainingCost
    print('\tTraining Cost = %0.3f'%kCost[k])
    # eval the model on the testing set 
    testPred = models[k].transform(test)
    kSil[k] = ClusteringEvaluator().evaluate(testPred)
    print('\tSilhouette score = %0.3f'%kSil[k])
    
# show the scree plot
fig = plysub.make_subplots(rows=2, cols=1, print_grid=False, subplot_titles=('Train SSE vs. k', 'Test Silhouette vs. k'))
fig.add_trace(go.Scatter(x=xs, y=kCost[2:], mode='markers+lines'), 1, 1)
fig.add_trace(go.Scatter(x=xs, y=kSil[2:], mode='markers+lines'), 2, 1)
fig['layout']['title'] = 'Kmeans Results'
plyoff.plot(fig)

# find the min
bestK = np.argmin(kCost)
print('Best model has %d clusters, with a cost of %0.3f'%(bestK, kCost[bestK]))

In [None]:
''' evaluate different clustering cardinalities  - bisecting kmeans '''
# setup range that will be tried
bkCost = np.ones(kMax)*np.inf
bkSil = np.ones(kMax)*-1

# iterate over k
bmodels = [None]*kMax
for k in range(2, kMax):
    print('Trying k = %d'%k)
    # fit the bisecting kmeans model on the training set
    kmeans = BisectingKMeans(k=k, seed=randSeed, featuresCol='features')
    bmodels[k] = kmeans.fit(tran)
    # eval the model on the training set
    bkCost[k] = bmodels[k].summary.trainingCost
    print('\tTraining Cost = %0.3f'%bkCost[k])
    # eval the model on the testing set 
    testPred = bmodels[k].transform(test)
    bkSil[k] = ClusteringEvaluator().evaluate(testPred)
    print('\tSilhouette score = %0.3f'%bkSil[k])
    
# show the scree plot
fig = plysub.make_subplots(rows=2, cols=1, print_grid=False, subplot_titles=('Train SSE vs. k', 'Test Silhouette vs. k'))
fig.add_trace(go.Scatter(x=xs, y=bkCost[2:], mode='markers+lines'), 1, 1)
fig.add_trace(go.Scatter(x=xs, y=bkSil[2:], mode='markers+lines'), 2, 1)
fig['layout']['title'] = 'Bisecting Kmeans Results'
plyoff.plot(fig)

# find the min
bestBK = np.argmin(bkCost)
print('Best model has %d clusters, with a cost of %0.3f'%(bestBK, bkCost[bestBK]))

In [None]:
''' Evaluate best model on test set '''
# get the best
bestK = int(input('Enter the "best" k'))
bestMod = input('Enter the best model ("k" or bk")')
if bestMod == 'k':
    # kmeans
    bestModel = models[bestK]
    cst = kCost[bestK]
else:
    # bisecting kmeans
    bestModel = bmodels[bestK]
    cst = bkCost[bestK]
print('Best %s model has %d clusters, with a cost of %0.3f'%(bestMod, bestK, cst))

# predict
testPred = bestModel.transform(test)
# eval
evalSil = ClusteringEvaluator()
silhouette = evalSil.evaluate(testPred)
print('Silhouette score for %s model with %d cluster = [-1, %0.3f, 1]'%(bestMod, bestK, silhouette))
# get the centers
cents = pd.DataFrame(index=list(range(bestK)), data=bestModel.clusterCenters(), columns=features)
display(cents)

In [None]:
# add predictions to entire dataset
ccpred = bestModel.transform(cc)
for feat in features:
    ccpred.groupBy('prediction').agg(min(col(feat)), mean(col(feat)), max(col(feat))).show()

In [None]:
sc.stop()