In [21]:
import pandas as pd
import numpy as np
import datetime as dt
from itertools import chain

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import Bucketizer, StringIndexer, QuantileDiscretizer
from pyspark.ml.fpm import *
from pyspark.ml import Pipeline

In [2]:
# initialize
# Go through the session builder so that (a) the application name is part of
# the configuration the context is started with -- assigning
# `sparkContext.appName` after construction only changes a Python attribute --
# and (b) re-running this cell reuses the live session instead of failing
# while trying to start a second SparkContext.
spark = SparkSession.builder.appName('fpminingassign').getOrCreate()
sc = spark.sparkContext
# show the number of cores
# NOTE(review): this counts the entries of the executor memory-status map, which
# is not necessarily the number of cores -- confirm this is the intended figure.
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [42]:
''' get the data '''
# load the data
fil = '../../data/Mall_Customers.csv'
# explicit schema: every column except Gender is read as an integer
schem = StructType([StructField('CustomerID', IntegerType()), StructField('Gender', StringType()),
                    StructField('Age', IntegerType()), StructField('Annual Income (k)', IntegerType()),
                    StructField('Spending Score (1-100)', IntegerType())])
mall = spark.read.format('csv').options(header=True).schema(schem).load(fil)

# rename stupid columns
# The schema spells the key 'CustomerID'; alias it explicitly to the 'CustomerId'
# spelling used downstream instead of relying on case-insensitive column resolution.
mall = mall.select(col('CustomerID').alias('CustomerId'), 'Gender', 'Age',
                   col('Annual Income (k)').alias('Income'),
                   col('Spending Score (1-100)').alias('Score'))

# talk
cnt = mall.count()
print('%d records'%cnt)
mall.show(truncate=False)

200 records
+----------+------+---+------+-----+
|CustomerId|Gender|Age|Income|Score|
+----------+------+---+------+-----+
|1         |Male  |19 |15    |39   |
|2         |Male  |21 |15    |81   |
|3         |Female|20 |16    |6    |
|4         |Female|23 |16    |77   |
|5         |Female|31 |17    |40   |
|6         |Female|22 |17    |76   |
|7         |Female|35 |18    |6    |
|8         |Female|23 |18    |94   |
|9         |Male  |64 |19    |3    |
|10        |Female|30 |19    |72   |
|11        |Male  |67 |19    |14   |
|12        |Female|35 |19    |99   |
|13        |Female|58 |20    |15   |
|14        |Female|24 |20    |77   |
|15        |Male  |37 |20    |13   |
|16        |Male  |22 |20    |79   |
|17        |Female|35 |21    |35   |
|18        |Male  |20 |21    |66   |
|19        |Male  |52 |23    |29   |
|20        |Female|35 |23    |98   |
+----------+------+---+------+-----+
only showing top 20 rows



### Data Prep

In [13]:
''' handle missing values '''
# check for missing values
# One aggregation over the frame instead of one Spark job per column:
# when(...) yields NULL for non-null cells, and count() skips NULLs, so each
# aggregate is the number of null cells in that column.
nullRow = mall.select([count(when(col(colm).isNull(), 1)).alias(colm) for colm in mall.columns]).first()
# column -> (null count, share of all rows); guard the ratio for an empty frame
nullCounts = {colm:(nullRow[colm], nullRow[colm]/cnt if cnt else 0.0) for colm in mall.columns}

# one row per column, most-null columns first; counts stay integers
nullCountsDF = pd.DataFrame([(colm, ncnt, relf) for (colm, (ncnt, relf)) in nullCounts.items()],
                            columns=['Column', 'Freq.', 'Rel. Freq.'])
nullCountsDF = nullCountsDF.sort_values('Rel. Freq.', ascending=False)
# attach each column's Spark data type
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in mall.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,CustomerId,0.0,0.0,IntegerType
1,Gender,0.0,0.0,StringType
2,Income,0.0,0.0,IntegerType
3,Score,0.0,0.0,IntegerType


In [43]:
''' prepare the features '''
# Columns this cell derives. They are dropped up front (a no-op on the first run)
# so the cell can be re-run: the transformers refuse to write to an output column
# that already exists, and the offsets below would otherwise be applied twice.
derivedCols = ['Score_dec', 'Income_quart', 'Age_dec', 'Gender_Int', 'items']
mallBase = mall.drop(*derivedCols)

# make an integer gender column
indxr = StringIndexer(inputCol='Gender', outputCol='Gender_Int')
# bucket the score into fixed-width bins of 10 points (0-10, 10-20, ..., 90-100)
decs = list(range(0, 101, 10))
buckr = Bucketizer(splits=decs, inputCol='Score', outputCol='Score_dec')
# bucket the income by quartiles
qantr = QuantileDiscretizer(numBuckets=4, inputCol='Income', outputCol='Income_quart')
# bucket the age by deciles
declr = QuantileDiscretizer(numBuckets=10, inputCol='Age', outputCol='Age_dec')

# run the pipeline of transforms
pipe = Pipeline(stages=[buckr, qantr, declr, indxr]).fit(mallBase)
mall = pipe.transform(mallBase)

# scale up income, age, and score so they can't overlap
# (gender stays at 0/1, income becomes 10+, age 100+, score 1000+), which keeps
# every item within a row distinct when the four codes are put into one array
mall = mall.withColumn('Income_quart', col('Income_quart')+10).withColumn('Age_dec', col('Age_dec')+100).withColumn('Score_dec', col('Score_dec')+1000)

# features
features = ['Gender_Int', 'Income_quart', 'Age_dec', 'Score_dec']

# check out balance
for feat in features:
    print('%s tabulation'%feat)
    mall.select(feat).groupBy(feat).count().orderBy(feat).show()

# add the array of features
mall = mall.withColumn('items', array(*features))

# talk
mall.show(truncate=False)

Gender_Int tabulation
+----------+-----+
|Gender_Int|count|
+----------+-----+
|       0.0|  112|
|       1.0|   88|
+----------+-----+

Age_dec tabulation
+-------+-----+
|Age_dec|count|
+-------+-----+
|  100.0|   17|
|  101.0|   21|
|  102.0|   17|
|  103.0|   15|
|  104.0|   28|
|  105.0|   18|
|  106.0|   21|
|  107.0|   18|
|  108.0|   21|
|  109.0|   24|
+-------+-----+

Income_quart tabulation
+------------+-----+
|Income_quart|count|
+------------+-----+
|        10.0|   46|
|        11.0|   52|
|        12.0|   50|
|        13.0|   52|
+------------+-----+

Score_dec tabulation
+---------+-----+
|Score_dec|count|
+---------+-----+
|   1000.0|   14|
|   1001.0|   20|
|   1002.0|   12|
|   1003.0|   13|
|   1004.0|   39|
|   1005.0|   37|
|   1006.0|   11|
|   1007.0|   24|
|   1008.0|   14|
|   1009.0|   16|
+---------+-----+

+----------+------+---+------+-----+---------+------------+-------+----------+--------------------------+
|CustomerId|Gender|Age|Income|Score|Score_dec|

## Modeling

In [44]:
# hold out part of the customers: seeded random train/test split of the item sets
trainPerc = 0.7
randSeed = 42
splitWeights = [trainPerc, 1.0 - trainPerc]
tran, test = mall.select('CustomerId', 'items').randomSplit(splitWeights, seed=randSeed)

# talk
# print the customer ids that landed in each part, training first
for label, part in (('Training Cases', tran), ('Testing Cases', test)):
    print(label)
    part.select('CustomerId').show()

Training Cases
+----------+
|CustomerId|
+----------+
|         1|
|         2|
|         4|
|         5|
|         6|
|         8|
|        11|
|        12|
|        13|
|        17|
|        18|
|        19|
|        21|
|        23|
|        26|
|        27|
|        28|
|        32|
|        34|
|        37|
+----------+
only showing top 20 rows

Testing Cases
+----------+
|CustomerId|
+----------+
|         3|
|         7|
|         9|
|        10|
|        14|
|        15|
|        16|
|        20|
|        22|
|        24|
|        25|
|        29|
|        30|
|        31|
|        33|
|        35|
|        36|
|        40|
|        43|
|        44|
+----------+
only showing top 20 rows



In [47]:
''' fit frequent pattern growth model'''
# fit on training set
supp = 0.2   # minimum support passed to FPGrowth
conf = 0.4   # minimum confidence passed to FPGrowth
fpg = FPGrowth(itemsCol='items', minSupport=supp, minConfidence=conf)
fpgmod = fpg.fit(tran)

# predict on testing set
preds = fpgmod.transform(test)
preds.show(n=20, truncate=False)

# frequent item sets mined - ordered by most frequent
# 'freq' is counted over the training rows the model was fit on, so the relative
# frequency is normalised by the training size rather than by the size of all of mall
tranCnt = tran.count()
answerpop = fpgmod.freqItemsets
answerpop = answerpop.withColumn('rel freq', col('freq')/tranCnt)
answerpop.orderBy(col('freq').desc()).show(n=20, truncate=False)

# association rules - ordered by most confidence
assocrule = fpgmod.associationRules
assocrule.orderBy(col('confidence').desc()).show(n=20, truncate=False)

+----------+--------------------------+----------+
|CustomerId|items                     |prediction|
+----------+--------------------------+----------+
|3         |[0.0, 100.0, 10.0, 1000.0]|[]        |
|7         |[0.0, 104.0, 10.0, 1000.0]|[]        |
|9         |[1.0, 109.0, 10.0, 1000.0]|[]        |
|10        |[0.0, 103.0, 10.0, 1007.0]|[]        |
|14        |[0.0, 101.0, 10.0, 1007.0]|[]        |
|15        |[1.0, 105.0, 10.0, 1001.0]|[]        |
|16        |[1.0, 101.0, 10.0, 1007.0]|[]        |
|20        |[0.0, 104.0, 10.0, 1009.0]|[]        |
|22        |[1.0, 101.0, 10.0, 1007.0]|[]        |
|24        |[1.0, 103.0, 10.0, 1007.0]|[]        |
|25        |[0.0, 108.0, 10.0, 1001.0]|[]        |
|29        |[0.0, 106.0, 10.0, 1003.0]|[]        |
|30        |[0.0, 101.0, 10.0, 1008.0]|[]        |
|31        |[1.0, 109.0, 10.0, 1000.0]|[]        |
|33        |[1.0, 108.0, 10.0, 1000.0]|[]        |
|35        |[0.0, 107.0, 10.0, 1001.0]|[]        |
|36        |[0.0, 101.0, 10.0, 

In [None]:
# shut down the SparkContext held in `sc` once the analysis is finished
sc.stop()