In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from itertools import chain

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, MinMaxScaler, StringIndexer
from pyspark.ml.fpm import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'fpmining'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


In [3]:
''' get the data '''
# load the data
fil = '../data/data-final.csv'
schem = StructType([StructField('EXT1', IntegerType()), StructField('EXT2', IntegerType()), StructField('EXT3', IntegerType()),
StructField('EXT4', IntegerType()), StructField('EXT5', IntegerType()), StructField('EXT6', IntegerType()),
StructField('EXT7', IntegerType()), StructField('EXT8', IntegerType()), StructField('EXT9', IntegerType()),
StructField('EXT10', IntegerType()), StructField('EST1', IntegerType()), StructField('EST2', IntegerType()),
StructField('EST3', IntegerType()), StructField('EST4', IntegerType()), StructField('EST5', IntegerType()),
StructField('EST6', IntegerType()), StructField('EST7', IntegerType()), StructField('EST8', IntegerType()),
StructField('EST9', IntegerType()), StructField('EST10', IntegerType()), StructField('AGR1', IntegerType()),
StructField('AGR2', IntegerType()), StructField('AGR3', IntegerType()), StructField('AGR4', IntegerType()),
StructField('AGR5', IntegerType()), StructField('AGR6', IntegerType()), StructField('AGR7', IntegerType()),
StructField('AGR8', IntegerType()), StructField('AGR9', IntegerType()), StructField('AGR10', IntegerType()),
StructField('CSN1', IntegerType()), StructField('CSN2', IntegerType()), StructField('CSN3', IntegerType()),
StructField('CSN4', IntegerType()), StructField('CSN5', IntegerType()), StructField('CSN6', IntegerType()),
StructField('CSN7', IntegerType()), StructField('CSN8', IntegerType()), StructField('CSN9', IntegerType()),
StructField('CSN10', IntegerType()), StructField('OPN1', IntegerType()), StructField('OPN2', IntegerType()),
StructField('OPN3', IntegerType()), StructField('OPN4', IntegerType()), StructField('OPN5', IntegerType()),
StructField('OPN6', IntegerType()), StructField('OPN7', IntegerType()), StructField('OPN8', IntegerType()),
StructField('OPN9', IntegerType()), StructField('OPN10', IntegerType()), StructField('EXT1_E', FloatType()),
StructField('EXT2_E', FloatType()), StructField('EXT3_E', FloatType()), StructField('EXT4_E', FloatType()),
StructField('EXT5_E', FloatType()), StructField('EXT6_E', FloatType()), StructField('EXT7_E', FloatType()),
StructField('EXT8_E', FloatType()), StructField('EXT9_E', FloatType()), StructField('EXT10_E', FloatType()),
StructField('EST1_E', FloatType()), StructField('EST2_E', FloatType()), StructField('EST3_E', FloatType()),
StructField('EST4_E', FloatType()), StructField('EST5_E', FloatType()), StructField('EST6_E', FloatType()),
StructField('EST7_E', FloatType()), StructField('EST8_E', FloatType()), StructField('EST9_E', FloatType()),
StructField('EST10_E', FloatType()), StructField('AGR1_E', FloatType()), StructField('AGR2_E', FloatType()),
StructField('AGR3_E', FloatType()), StructField('AGR4_E', FloatType()), StructField('AGR5_E', FloatType()),
StructField('AGR6_E', FloatType()), StructField('AGR7_E', FloatType()), StructField('AGR8_E', FloatType()),
StructField('AGR9_E', FloatType()), StructField('AGR10_E', FloatType()), StructField('CSN1_E', FloatType()),
StructField('CSN2_E', FloatType()), StructField('CSN3_E', FloatType()), StructField('CSN4_E', FloatType()),
StructField('CSN5_E', FloatType()), StructField('CSN6_E', FloatType()), StructField('CSN7_E', FloatType()),
StructField('CSN8_E', FloatType()), StructField('CSN9_E', FloatType()), StructField('CSN10_E', FloatType()),
StructField('OPN1_E', FloatType()), StructField('OPN2_E', FloatType()), StructField('OPN3_E', FloatType()),
StructField('OPN4_E', FloatType()), StructField('OPN5_E', FloatType()), StructField('OPN6_E', FloatType()),
StructField('OPN7_E', FloatType()), StructField('OPN8_E', FloatType()), StructField('OPN9_E', FloatType()),
StructField('OPN10_E', FloatType()), StructField('dateload', TimestampType()), StructField('screenw', IntegerType()),
StructField('screenh', IntegerType()), StructField('introelapse', IntegerType()), StructField('testelapse', FloatType()),
StructField('endelapse', FloatType()), StructField('IPC', IntegerType()), StructField('country', StringType()),
StructField('lat_appx_lots_of_err', FloatType()), StructField('long_appx_lots_of_err', FloatType())])
bigfive = spark.read.format('csv').options(header=True, delimiter='\t', timeStampFormat='yyyy-MM-dd HH:mm:ss').schema(schem).load(fil)

# add an ID - don't actually care if it's monotonic; also filter for IP count is 1
bigfive = bigfive.where(col('IPC') == 1).select(monotonically_increasing_id().alias('id'), '*')

# talk
cnt = bigfive.count()
print('%d records'%cnt)
bigfive.show(truncate=False)
#bigfive.printSchema()

696845 records
+---+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+----+----+----+----+----+----+----+----+----+-----+--------+------+-------+------+------+-------+------+-------+-------+-------+-------+------+-------+--------+------+------+-------+------+-------+-------+-------+-------+-------+------+------+-------+-------+-------+------+-------+-------+-------+------+-------+------+------+-------+-------+------+-------+-------+------+-------+------+------+-------+-------+-------+------+-------+-------------------+-------+-------+-----------+----------+---------+---+-------+--------------------+---------------------+
|id |EXT1|EXT2|EXT3|EXT4|EXT5|EXT6|EXT7|EXT8|EXT9|EXT10|EST1|EST2|EST3|EST4|EST5|EST6|EST7|EST8|EST9|EST10|AGR1|AGR2|AGR3|AGR4|AGR5|AGR6|AGR7|AGR8|AGR9|AGR10|CSN1|CSN2|CSN3|CSN4|CSN5|CSN6|CSN7|CSN8|CSN9|CSN10|OPN1|OPN

### Data Prep

In [4]:
''' handle missing values '''
# check for missing values
nullCounts = {colm:bigfive.select(colm).where(col(colm).isNull()).count() for colm in bigfive.columns}
nullCounts = {colm:(ncnt, ncnt/cnt) for (colm, ncnt) in nullCounts.items()}
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in bigfive.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

Unnamed: 0,Column,Freq.,Rel. Freq.,Type
0,long_appx_lots_of_err,9614.0,0.013796,FloatType
1,lat_appx_lots_of_err,9614.0,0.013796,FloatType
2,introelapse,1205.0,0.001729,IntegerType
3,screenh,1205.0,0.001729,IntegerType
4,screenw,1205.0,0.001729,IntegerType
...,...,...,...,...
106,dateload,0.0,0.000000,TimestampType
107,endelapse,0.0,0.000000,FloatType
108,IPC,0.0,0.000000,IntegerType
109,country,0.0,0.000000,StringType


In [5]:
# remove nulls - only small amounts of nulls, so dropping
bigfive = bigfive.dropna(how='any')

# talk some more
print('%d records'%bigfive.count())

685349 records


In [34]:
''' prepare the features '''
# sum the answers by "type"
qTypes = {'EXT':[[1,3,5,7,9], [2,4,6,8,10]],
          'EST':[[1,3,5,6,7,8,9,10], [2,4]],
          'AGR':[[1,3,4,7,9], [2,4,6,8,10]],
          'CSN':[[1,3,5,7,9,10], [2,4,6,8]],
          'OPN':[[1,3,5,7,8,9,10], [2,4,6]]}
# iterate over question types
features = list(chain.from_iterable([['%s_E'%q, '%s_0'%q, '%s_1'%q] for q in qTypes.keys()]))
for (indx, q) in enumerate(qTypes.keys()):
    allQs = ['%s%d_E'%(q, cnt) for cnt in range(1, 11)]
    q0 = ['%s%d'%(q, cnt) for cnt in qTypes[q][0]]
    q1 = ['%s%d'%(q, cnt) for cnt in qTypes[q][1]]
    # sum the elapsed times, 0s, 1s
    expr0 = 'cast(' + '+'.join(q0) + ' as string)'
    expr1 = 'cast(' + '+'.join(q1) + ' as string)'
    exprE = 'cast(' + '+'.join(allQs) + ' as string)'
    bigfive = bigfive.withColumn('%s_E'%q, expr(exprE)).withColumn('%s_0'%q, expr(expr0)).withColumn('%s_1'%q, expr(expr1))
    # update the q columns to be unique
    bigfive = bigfive.withColumn('%s_0'%q, concat_ws('_', lit('%s0'%q), '%s_0'%q)).withColumn('%s_1'%q, concat_ws('_', lit('%s1'%q), '%s_1'%q)).\
        withColumn('%s_E'%q, concat_ws('_', lit('%sE'%q), '%s_E'%q))

# add the screen area as a feature
bigfive = bigfive.withColumn('screen_area', col('screenw')*col('screenh'))
features.append('screen_area')

# finally add the rest of the features
features.extend(['introelapse', 'testelapse', 'endelapse', 'country'])

# add the array of features
bigfive = bigfive.withColumn('items', array(*features))

# talk
bigfive.select('id', 'country', 'dateload', 'items').show(truncate=False)

+---+-------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |country|dateload           |items                                                                                                                                                                                         |
+---+-------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |GB     |2016-03-03 02:01:01|[EXTE_46568.0, EXT0_23, EXT1_7, ESTE_47699.0, EST0_18, EST1_6, AGRE_52419.0, AGR0_13, AGR1_20, CSNE_49929.0, CSN0_20, CSN1_12, OPNE_35571.0, OPN0_30, OPN1_3, 786432, 9, 234.0, 6.0, GB]      |
|1  |MY     |2016-03-03 02:01:20|[EXTE_37527.0, EXT0_12, EXT1_22, ESTE_34479.0, EST0_17, EST1_4, AGR

## Modeling

In [35]:
# split for cross-val
trainPerc = 0.7
randSeed = 42
tran, test = bigfive.select('id', 'items').randomSplit([trainPerc, 1.0 - trainPerc], seed=randSeed)

# talk
print('Training Cases')
tran.select('id').show()
print('Testing Cases')
test.select('id').show()

Training Cases
+---+
| id|
+---+
|  0|
|  1|
|  3|
|  4|
|  5|
|  7|
| 10|
| 11|
| 12|
| 16|
| 17|
| 18|
| 20|
| 22|
| 25|
| 26|
| 27|
| 31|
| 33|
| 36|
+---+
only showing top 20 rows

Testing Cases
+---+
| id|
+---+
|  2|
|  6|
|  8|
|  9|
| 13|
| 14|
| 15|
| 19|
| 21|
| 23|
| 24|
| 28|
| 29|
| 30|
| 32|
| 34|
| 35|
| 39|
| 42|
| 43|
+---+
only showing top 20 rows



In [None]:
''' fit frequent pattern growth model'''
# fit on training set
supp = 0.3
conf = 0.6
fpg = FPGrowth(itemsCol='items', minSupport=supp, minConfidence=conf)
fpgmod = fpg.fit(tran)

# predict on testing set
fpgmod.transform(test)

In [None]:
answerpop = fpgmod.freqItemSets

In [None]:
sc.stop()