In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from itertools import chain

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, MinMaxScaler, StringIndexer
from pyspark.ml.fpm import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# initialize
# Build the session through the builder so the application name is part of the
# configuration the context is created with. Assigning `appName` on an already
# running SparkContext only changes the Python-side attribute; the application
# itself keeps its default name. getOrCreate() also makes this cell safe to
# re-run: it returns the live session instead of failing because a
# SparkContext already exists (in that case the existing app name is kept).
spark = SparkSession.builder.appName('fpmining').getOrCreate()
# keep the context handle under `sc`; the final cell of the notebook stops it
sc = spark.sparkContext
# show the number of cores
# defaultParallelism is the local core count in local mode and the total executor
# cores on a cluster (unless spark.default.parallelism is overridden). The size
# of getExecutorMemoryStatus() counts block managers (driver/executors), not cores.
print('%d cores'%sc.defaultParallelism)
spark

In [None]:
''' get the data '''
# path of the tab-delimited export that is read below
fil = '../data/data-final.csv'

# Column layout of the file, in file order:
#   1. <group><1..10>    integer columns for the five question groups
#   2. <group><1..10>_E  float columns, same groups and numbering
#   3. session-level columns (load time, screen size, elapsed times, IP count, location)
schem = StructType(
    [StructField('%s%d'%(grp, num), IntegerType())
     for grp in ('EXT', 'EST', 'AGR', 'CSN', 'OPN') for num in range(1, 11)] +
    [StructField('%s%d_E'%(grp, num), FloatType())
     for grp in ('EXT', 'EST', 'AGR', 'CSN', 'OPN') for num in range(1, 11)] +
    [StructField('dateload', TimestampType()),
     StructField('screenw', IntegerType()),
     StructField('screenh', IntegerType()),
     StructField('introelapse', IntegerType()),
     StructField('testelapse', FloatType()),
     StructField('endelapse', FloatType()),
     StructField('IPC', IntegerType()),
     StructField('country', StringType()),
     StructField('lat_appx_lots_of_err', FloatType()),
     StructField('long_appx_lots_of_err', FloatType())])

# read the file with the explicit schema (first row is a header, fields are tab separated)
reader = spark.read.format('csv')
reader = reader.option('header', True).option('delimiter', '\t').option('timeStampFormat', 'yyyy-MM-dd HH:mm:ss')
bigfive = reader.schema(schem).load(fil)

# keep only rows whose IPC value is 1 ...
bigfive = bigfive.where(col('IPC') == 1)
# ... and put a generated id in front of all original columns (it only has to be unique, not consecutive)
bigfive = bigfive.select(monotonically_increasing_id().alias('id'), '*')

# report the size and preview the rows
cnt = bigfive.count()
print('%d records'%cnt)
bigfive.show(truncate=False)
#bigfive.printSchema()

### Data Prep

In [None]:
''' handle missing values '''
# check for missing values
# All per-column null counts are computed in ONE aggregation job instead of one
# count() job per column: when(...) without otherwise() yields NULL for rows that
# do not match, and count() ignores NULLs, so each aggregate is the number of
# null cells in that column.
nullRow = bigfive.select([count(when(col(colm).isNull(), 1)).alias(colm) for colm in bigfive.columns]).first().asDict()
# (absolute, relative) null frequency per column; `cnt` is the record count from
# the loading cell - guard against an empty frame so the ratio cannot divide by zero
nullCounts = {colm:(ncnt, ncnt/cnt if cnt else 0.0) for (colm, ncnt) in nullRow.items()}
# one row per column, sorted by relative frequency (positional column 1), highest first
nullCountsDF = pd.DataFrame(nullCounts).T.reset_index(drop=False).sort_values(1, ascending=False)
nullCountsDF.columns = ['Column', 'Freq.', 'Rel. Freq.']
# attach each column's Spark data type for reference
nullCountsDF = nullCountsDF.merge(pd.DataFrame([[colm.name, colm.dataType] for colm in bigfive.schema], columns=['Column', 'Type']),
                                how='inner', on=['Column'])

# talk
display(nullCountsDF)

In [None]:
# discard every record that has a null in any column - nulls are rare, so little is lost
bigfive = bigfive.na.drop(how='any')

# report how many records remain
print('%d records'%bigfive.count())

In [None]:
''' Data Prep & Modeling with FP Growth '''

In [None]:
''' prepare the features '''
# average the answers by "type"
# For every question group the ten question numbers are split into two lists;
# the answers in list 0 and in list 1 are averaged separately below.
# AGR used to list question 4 in both groups and leave out question 5; the first
# list now reads 1,3,5,7,9 so the two lists partition 1..10 like the other groups.
# NOTE(review): confirm the group assignment of each question (e.g. AGR9) against
# the questionnaire's keying - it cannot be verified from this notebook.
qTypes = {'EXT':[[1,3,5,7,9], [2,4,6,8,10]],
          'EST':[[1,3,5,6,7,8,9,10], [2,4]],
          'AGR':[[1,3,5,7,9], [2,4,6,8,10]],
          'CSN':[[1,3,5,7,9,10], [2,4,6,8]],
          'OPN':[[1,3,5,7,8,9,10], [2,4,6]]}
# iterate over question types - skipping elapsed times because can't get the features I want
features = list(qTypes.keys())
for q in qTypes.keys():
    print('Processing %s questions...'%q)
    # every question must be in exactly one of the two groups
    assert sorted(qTypes[q][0] + qTypes[q][1]) == list(range(1, 11)), 'question groups for %s do not cover 1..10 exactly once'%q
    allQs = ['%s%d_E'%(q, num) for num in range(1, 11)]
    q0 = ['%s%d'%(q, num) for num in qTypes[q][0]]
    q1 = ['%s%d'%(q, num) for num in qTypes[q][1]]
    # average the 0s, 1s scores, and sum the elapsed times (will use the latter later)
    expr0 = '(' + '+'.join(q0) + ')/%d'%len(q0)
    q0Nam = '%s_0'%q
    expr1 = '(' + '+'.join(q1) + ')/%d'%len(q1)
    q1Nam = '%s_1'%q
    # Label the row with whichever group has the higher average, "<q>_neut" on a tie.
    # The second branch must compare group 1 against group 0; comparing a column
    # with itself is never true and would label every non-"_0" row as neutral.
    exprC = "(case when %s > %s then '%s' when %s > %s then '%s' else '%s_neut' end)"%(q0Nam, q1Nam, q0Nam, q1Nam, q0Nam, q1Nam, q)
    exprE = '+'.join(allQs)
    bigfive = bigfive.withColumn(q0Nam, expr(expr0)).withColumn(q1Nam, expr(expr1)).\
        withColumn(q, expr(exprC)).withColumn('%s_E'%q, expr(exprE))
    # An earlier version kept the average scores and elapsed times as separate
    # string items (prefixed with the group name); that data was probably too
    # granular to give a result, so only the categorical label is used as an item.


# TODO: derive the question types on which the most and least time was spent;
# an attempt with array_position(array_max(...)) over the '<q>_E' columns did not work.


# add the screen area as a feature
bigfive = bigfive.withColumn('screen_area', col('screenw')*col('screenh'))
features.append('screen_area')

# add total elapse as a feature
bigfive = bigfive.withColumn('totalelapse', col('introelapse')+col('endelapse')+col('testelapse'))

# finally add the rest of the features - ignoring introelapse & endelapse as they can be identical
features.extend(['totalelapse', 'testelapse', 'country'])

# add the array of features
# The features mix string and numeric columns, so cast each one to string
# explicitly instead of relying on implicit array type coercion.
# FPGrowth rejects a transaction that contains the same item twice (e.g. when
# totalelapse equals testelapse), hence array_distinct.
bigfive = bigfive.withColumn('items', array_distinct(array(*[col(feat).cast('string') for feat in features])))

# talk
bigfive.select('id', 'country', 'dateload', 'items').show(truncate=False)

## Modeling

In [None]:
# hold-out split: a seeded random split of the (id, items) pairs into a training and a testing part
trainPerc = 0.7
randSeed = 42
splitWeights = [trainPerc, 1.0 - trainPerc]
tran, test = bigfive.select('id', 'items').randomSplit(splitWeights, seed=randSeed)

# preview the ids that landed in each part
for heading, part in (('Training Cases', tran), ('Testing Cases', test)):
    print(heading)
    part.select('id').show()

In [None]:
''' fit frequent pattern growth model'''
# fit on training set
supp = 0.4   # minimum support passed to FPGrowth
conf = 0.6   # minimum confidence passed to FPGrowth
fpg = FPGrowth(itemsCol='items', minSupport=supp, minConfidence=conf)
fpgmod = fpg.fit(tran)

# predict on testing set
preds = fpgmod.transform(test)
preds.show(n=20, truncate=False)

# frequent item sets mined - ordered by most frequent
# `freq` is counted on the training split only, so the relative frequency has to
# be taken against the training size; dividing by the full dataset count would
# understate every value by roughly the training fraction.
tranCnt = tran.count()
answerpop = fpgmod.freqItemsets
answerpop = answerpop.withColumn('rel freq', col('freq')/tranCnt)
answerpop.orderBy(col('freq').desc()).show(n=20, truncate=False)

# association rules - ordered by most confidence
assocrule = fpgmod.associationRules
assocrule.orderBy(col('confidence').desc()).show(n=20, truncate=False)

In [None]:
''' Data Prep & Modeling with Prefix Span'''

In [None]:
''' data prep '''
# prep the elapsed times by question type
# elapseMn is the rounded mean of the five per-type elapsed totals; each <type>E
# flag is true when that type's total lies above this mean.
# (`round` here is pyspark.sql.functions.round from the star import, not the builtin.)
bigfive = bigfive.withColumn('elapseMn', round(expr('(' + '+'.join(['EXT_E', 'EST_E', 'AGR_E', 'CSN_E', 'OPN_E']) + ')/5'), 0)).\
    withColumn('extE', (col('EXT_E')-col('elapseMn'))>0).withColumn('estE', (col('EST_E')-col('elapseMn'))>0).\
    withColumn('agrE', (col('AGR_E')-col('elapseMn'))>0).withColumn('csnE', (col('CSN_E')-col('elapseMn'))>0).\
    withColumn('opnE', (col('OPN_E')-col('elapseMn'))>0)

# get pairs of odd-even question answers - the prefix span model expects an array of arrays
# The pairs (1,2), (3,4), (5,6), (7,8), (9,10) are generated for every question
# type so that each pair occurs exactly once; the hand-written list repeated the
# (5,6) pair of every type.
# NOTE(review): the items are the raw answer values, so equal answers to different
# questions are the same item to PrefixSpan - confirm this is intended.
answerPairs = [array('%s%d'%(qType, odd), '%s%d'%(qType, odd + 1))
               for qType in ('EXT', 'EST', 'AGR', 'CSN', 'OPN')
               for odd in range(1, 10, 2)]
# last itemset of the sequence: the five "above mean elapsed time" flags as 0/1
elapseFlags = array(*[col(flag).cast(IntegerType()) for flag in ('extE', 'estE', 'agrE', 'csnE', 'opnE')])
bfps = bigfive.select('id', 'country', 'dateload', array(*answerPairs, elapseFlags).alias('sequence'))
# talk
bfps.show(truncate=False)

In [None]:
''' fit prefix span model'''
# mining thresholds: minimum support and the maximum pattern length
supp = 0.3
patt = 10
# configure the miner on the 'sequence' column and run it over the prepared sequences
ps = PrefixSpan(minSupport=supp, maxPatternLength=patt, sequenceCol='sequence')
seqs = ps.findFrequentSequentialPatterns(bfps)

In [None]:
# get the frequent sequences
# share of all input sequences in which the pattern occurs
seqs = seqs.withColumn('rel freq', col('freq')/bfps.count())
# size of the largest itemset inside the pattern
seqs = seqs.withColumn('maxsizes', array_max(expr('transform(sequence, x ->size(x))')))
# keep only patterns that contain at least one itemset with more than one item
seqs = seqs.where(col('maxsizes') > 1)
# most frequent patterns first
seqs.orderBy(col('freq').desc()).show(n=20, truncate=False)

In [None]:
# shut down the SparkContext that was set up in the initialization cell
sc.stop()