<a href="https://colab.research.google.com/github/ZenMachina16/MLP-implentation/blob/main/mouse_dynamics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook shell magic: install PySpark into the current runtime.
!pip install pyspark

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sn
import hashlib
import binascii
import math
from sklearn.decomposition import PCA as skPCA
from pyspark.ml.feature import PCA as spPCA
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, TrainValidationSplit, ParamGridBuilder, TrainValidationSplitModel
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import StructType, StringType, FloatType, LongType, IntegerType, StructField
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.functions import countDistinct, array_distinct, col, isnan, when, count, lit, array

# Turn on matplotlib's automatic layout adjustment for every figure created below.
plt.rcParams["figure.autolayout"] = True

In [None]:
# Reuse the active Spark session if one exists, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Column layout of the raw mouse-event CSV: (name, Spark type, nullable).
# Only user_id is declared nullable.
schemaMousePos = StructType([
    StructField(columnName, columnType, isNullable)
    for columnName, columnType, isNullable in (
        ('uid', StringType(), False),
        ('session_id', StringType(), False),
        ('user_id', StringType(), True),
        ('timestamp', LongType(), False),
        ('event_type', IntegerType(), False),
        ('screen_x', FloatType(), False),
        ('screen_y', FloatType(), False),
    )
])

# Load the training events with the explicit schema (the file has a header row).
trainDs = spark.read.csv(
    '/content/Train_Mouse.csv',
    schema=schemaMousePos,
    header=True,
)
trainDs.printSchema()

In [None]:
# Sanity check: count the distinct user_ids inside each session, then show the
# maximum of those counts over all sessions.
trainDs.groupBy('session_id').agg(countDistinct('user_id').alias('distinct_uids_per_session')).agg({'distinct_uids_per_session':'max'}).show()

In [None]:
# Number of distinct sessions recorded for each user_id.
trainDs.groupBy('user_id').agg(countDistinct('session_id')).show()

In [None]:
# For every column, count the rows whose value is NaN or null.
trainDs.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in trainDs.columns]).show()

In [None]:
plt.rcParams["figure.figsize"] = [5, 5]

# event code -> human readable name
eventMap = {1: 'release', 2: 'move', 3: 'wheel', 4: 'drag', 5: 'click'}

# every ordered (from, to) pair of event codes
transitions = [(i, j) for i in range(1, 6) for j in range(1, 6)]

# one colour per transition, derived from an md5 digest of the "<from>-<to>" label
colors = [
    '#' + hashlib.md5(('{}-{}'.format(i, j) * 16).encode()).hexdigest()[:6]
    for i, j in transitions
]

# colour -> readable description of the transition; used for legends
colorsRevDict = {
    colour: '{} -> {}'.format(eventMap[i], eventMap[j])
    for colour, (i, j) in zip(colors, transitions)
}

# draw one arrow per transition: from (0, i) to (1, j)
soa = np.array([[0, i, 1, j - i] for i, j in transitions])
X, Y, U, V = zip(*soa)
ax = plt.figure().gca()
ax.quiver(
    X, Y, U, V,
    angles='xy',
    scale_units='xy',
    scale=1,
    color=colors,
    linewidth=0.3,
)
ax.set_xlim([-1, 2])
ax.set_ylim([0, 6])
plt.draw()
plt.show()

print(eventMap)
print(colorsRevDict)

In [None]:
plt.rcParams["figure.figsize"] = [30, 15]
# Pull every event into pandas, ordered by time, so consecutive rows of a
# session are consecutive mouse events.
df = trainDs.toPandas().sort_values('timestamp')
# usersEncoder maps each distinct user_id to a small integer code
usersEncoder = {k:i for i,k in enumerate(trainDs.select('user_id').rdd.flatMap(lambda x: x).distinct().collect())}
# ((min_x, max_x), (min_y, max_y)) over the whole data set, so every plot shares the same axes
screenDims = ((df['screen_x'].min(),df['screen_x'].max()), (df['screen_y'].min(),df['screen_y'].max()))
for userId in usersEncoder:
    portionDf = df[df['user_id']==userId]
    print(usersEncoder[userId], userId)
    for session in portionDf['session_id'].unique():
        portionDfSession = portionDf[portionDf['session_id']==session]
        if len(portionDfSession) < 2:
            # A displacement arrow needs two points; nothing to draw here.
            continue

        # Positions and event codes as arrays (one pass instead of repeated iterrows()).
        XYs = portionDfSession[['screen_x', 'screen_y']].to_numpy(dtype=float)
        evs = portionDfSession['event_type'].to_numpy()
        # Displacement between each pair of consecutive positions.
        steps = np.diff(XYs, axis=0)

        # Same colour scheme as colorsRevDict: md5 of the "<from>-<to>" event pair.
        colors = [
            '#'+hashlib.md5((('{}-{}'.format(int(prev), int(nxt)))*16).encode()).hexdigest()[:6]
            for prev, nxt in zip(evs[:-1], evs[1:])
        ]

        plt.figure()
        ax = plt.gca()
        ax.quiver(XYs[:-1, 0], XYs[:-1, 1], steps[:, 0], steps[:, 1],
                  angles='xy', scale_units='xy', scale=1, color=colors, width=0.001)
        ax.set_xlim([screenDims[0][0]-100,screenDims[0][1]+100])
        ax.set_ylim([screenDims[1][0]-100,screenDims[1][1]+100])

        # Build the distinct colours once (first-appearance order) so legend
        # handles and labels are guaranteed to line up.
        usedColors = list(dict.fromkeys(colors))
        custom_lines = [Line2D([0], [0], color=c, lw=4) for c in usedColors]
        ax.legend(custom_lines, [colorsRevDict[c] for c in usedColors])

        plt.draw()
        plt.show()

In [None]:

plt.rcParams["figure.figsize"] = [30, 15]
# For each session draw the event code over time twice: against the raw
# timestamp and against a log(log(.)) of the time elapsed since the first event.
for userId, userCode in usersEncoder.items():
    userRows = df[df['user_id'] == userId]
    print(userCode, userId)
    for session in userRows['session_id'].unique():
        sessionRows = userRows[userRows['session_id'] == session]
        evs = [int(code) for code in sessionRows['event_type']]  # events
        tss = [int(stamp) for stamp in sessionRows['timestamp']]  # timestamps
        # log-log of the elapsed time squeezes the long breaks some users take;
        # the +10000 offset keeps the inner log positive
        tss1 = [math.log(math.log(10000 + stamp - tss[0])) for stamp in tss]
        fig, (rawAxis, logAxis) = plt.subplots(1, 2, sharey=True)
        rawAxis.plot(tss, evs)   # base
        logAxis.plot(tss1, evs)  # loglog
        plt.show()

In [None]:
# Schema of the per-session feature table. The column order must match the
# tuple returned by featurize().
schemaFeatures = StructType(
    # identifiers and label (user_id / user_enc may be missing)
    [
        StructField('session_id', StringType(), False),
        StructField('user_id', StringType(), True),
        StructField('user_enc', FloatType(), True),
    ]
    # required float features
    + [
        StructField(floatFeature, FloatType(), False)
        for floatFeature in (
            'center_x', 'center_y',
            'center_click_x', 'center_click_y',
            'first_x', 'first_y',
            'radius', 'slope', 'narrow',
            'ev1', 'ev2', 'ev3', 'ev4', 'ev5',
        )
    ]
    # required integer features
    + [
        StructField(intFeature, IntegerType(), False)
        for intFeature in ('stress', 'chill', 'nbpoints')
    ]
)

In [None]:
def featurize(recordsIn):
    """Summarize one (session, user) group of mouse events as a feature row.

    Args:
        recordsIn: a ``((session_id, user_id), records)`` pair, e.g. one
            element of ``rdd.groupBy(...)``. Every record must support lookup
            by 'screen_x', 'screen_y', 'event_type' and 'timestamp'. The group
            is expected to contain at least one record.

    Returns:
        A tuple in the column order of ``schemaFeatures``:
        (session_id, user_id, user_enc, center_x, center_y, center_click_x,
        center_click_y, first_x, first_y, radius, slope, narrow,
        ev1..ev5, stress, chill, nbpoints).

    Raises:
        KeyError: if ``user_id`` is truthy but missing from ``usersEncoder``.
    """
    session_id, user_id = recordsIn[0]
    # Work in chronological order so "first" really is the earliest event,
    # whatever order the grouping delivered the records in.
    records = sorted(recordsIn[1], key=lambda record: record['timestamp'])

    def center(axisList):
        # arithmetic mean of the coordinates along one axis
        return sum(axisList) / len(axisList)

    def maxRadius(xc, yc, xList, yList):
        # distance from (xc, yc) to the farthest point
        return max(math.sqrt((xi - xc) ** 2 + (yi - yc) ** 2) for xi, yi in zip(xList, yList))

    def eventRatio(evKey, allEvents):
        # share of events whose code equals evKey
        return sum(1 for e in allEvents if e == evKey) / len(allEvents)

    def interEventGaps(timestamps):
        # durations between consecutive (already ordered) timestamps
        return [later - earlier for earlier, later in zip(timestamps, timestamps[1:])]

    def slope(xList, yList):  # the overall curve direction
        # least-squares slope of y against x
        x_avg = sum(xList) / len(xList)
        y_avg = sum(yList) / len(yList)
        u = sum((xi - x_avg) * (yi - y_avg) for xi, yi in zip(xList, yList))
        d = sum((xi - x_avg) ** 2 for xi in xList)
        # No horizontal spread (single point or vertical line): the slope is
        # undefined, so fall back to 0.0 instead of dividing by zero.
        return u / d if d else 0.0

    def narrow_sklearn(xList, yList):
        # How compact the curve is (line-like vs. spread out): the share of
        # variance explained by the first principal component.
        if len(xList) < 2:
            # variance is undefined for a single point
            return 0.0
        X = np.array([[xi, yi] for xi, yi in zip(xList, yList)])
        pca = skPCA(n_components=1)
        pca.fit(X)
        ratio = float(pca.explained_variance_ratio_[0])
        # Zero total variance (all points identical) gives NaN; report 0.0 so
        # the feature stays a valid number for later stages.
        return 0.0 if math.isnan(ratio) else ratio

    xList = [record['screen_x'] for record in records]
    yList = [record['screen_y'] for record in records]
    # barycenter of all registered mouse positions
    centerX = center(xList)
    centerY = center(yList)

    # clicks (event_type 5) mark interesting spots; use their barycenter,
    # or 0 when the session has no click
    clicks = [record for record in records if record['event_type'] == 5]
    centerClickX = center([record['screen_x'] for record in clicks] or [0])
    centerClickY = center([record['screen_y'] for record in clicks] or [0])

    # position of the earliest event of the session
    firstX = xList[0]
    firstY = yList[0]

    # how much of the screen the user covers (radius of a circle around the
    # barycenter that contains every point)
    tangentCircleRadius = maxRadius(centerX, centerY, xList, yList)

    # overall direction and compactness of the trajectory
    slop = slope(xList, yList)
    nar = narrow_sklearn(xList, yList)

    allEvents = [record['event_type'] for record in records]
    # frequency of each event code 1..5
    ev1, ev2, ev3, ev4, ev5 = [eventRatio(i, allEvents) for i in range(1, 6)]

    # shortest / longest pause between consecutive move events (event_type 2);
    # 0 when there are fewer than two moves and hence no pause to measure
    moveGaps = interEventGaps([record['timestamp'] for record in records if record['event_type'] == 2])
    stress = min(moveGaps) if moveGaps else 0
    chill = max(moveGaps) if moveGaps else 0  # maybe apply a log here, since some users take long breaks

    # some users use the mouse more often than others
    nbpoints = len(xList)

    # TODO: maybe add more temporal features later, like time center of actions, speed, acceleration, etc.

    if user_id:
        userEnc = float(usersEncoder[user_id])
    else:
        userEnc = None  # unknown user: this is the value to be predicted
    return (
        session_id, user_id, userEnc,
        centerX, centerY, centerClickX, centerClickY,
        firstX, firstY, tangentCircleRadius, slop, nar,
        ev1, ev2, ev3, ev4, ev5,
        stress, chill, nbpoints,
    )

In [None]:
# Group the raw events by (session_id, user_id), reduce every group to one
# feature tuple with featurize(), and load the result under schemaFeatures.
featuresDataframe = spark.createDataFrame(
    data=trainDs.rdd.groupBy(lambda event: (event['session_id'], event['user_id'])).map(featurize),
    schema=schemaFeatures,
)
featuresDataframe.show()

In [None]:
# One scatter plot per user: the five event ratios of each of the user's
# sessions, drawn on the same axes.
df_featued = featuresDataframe.toPandas()
eventCodes = [1, 2, 3, 4, 5]
eventColumns = ['ev1', 'ev2', 'ev3', 'ev4', 'ev5']
for userId, userCode in usersEncoder.items():
    userFeatures = df_featued[df_featued['user_id'] == userId]
    for session in userFeatures['session_id'].unique():
        sessionFeatures = userFeatures[userFeatures['session_id'] == session]
        plt.scatter(eventCodes, sessionFeatures[eventColumns])
    print(userCode, userId)
    plt.show()

In [None]:
# Columns that make up the model's input vector.
in_col = [
    'center_x', 'center_y',
    'center_click_x', 'center_click_y',
    'first_x', 'first_y',
    'radius', 'slope', 'narrow',
    'ev1', 'ev2', 'ev3', 'ev4', 'ev5',
    'stress', 'chill', 'nbpoints',
]
# Number of distinct encoded users; used later as the number of classes.
nbusers = featuresDataframe.select('user_enc').distinct().count()

# Pack the feature columns into a single vector column.
assemble = VectorAssembler(
    inputCols=in_col,
    outputCol='assembled_features',
    handleInvalid='error',
)
a_data = assemble.transform(featuresDataframe)

# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, before the train/test
# split done later in the notebook - confirm this is intended.
scaler = MinMaxScaler(
    min=0.0,
    max=1.0,
    inputCol='assembled_features',
    outputCol='features',
)
fittedScaler = scaler.fit(a_data)
s_data = fittedScaler.transform(a_data)

In [None]:
# 80/20 train-test split with a fixed seed.
train_df, test_df = s_data.select('user_enc', 'features').randomSplit(weights=[0.80, 0.20], seed=89)
# How many distinct users ended up in each split.
for splitDf in (train_df, test_df):
    print(splitDf.select('user_enc').distinct().count())

# Perceptron with one hidden layer of 40 units: one input per feature,
# one output per user.
mlpc = MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='user_enc',
    layers=[len(in_col), 40, nbusers],
    maxIter=30000,
    blockSize=8,
    seed=7,
    solver='gd',
)
ann = mlpc.fit(train_df)

20
20


In [None]:
# F1 evaluator comparing the 'prediction' column against the 'user_enc' label.
evaluator = MulticlassClassificationEvaluator(labelCol='user_enc',predictionCol='prediction',metricName='f1')

In [None]:
def pltConfusion(pred):
    """Draw the confusion matrix of a prediction DataFrame as a heatmap.

    Args:
        pred: object whose ``collect()`` yields rows with a 'user_enc' entry
            (true label) and a 'prediction' entry (predicted label); both are
            converted to int and used as indices below ``nbusers``.

    Rows of the heatmap are true labels, columns are predicted labels.
    """
    counts = np.zeros((nbusers, nbusers), int)
    for row in pred.collect():
        actual = int(row['user_enc'])
        predicted = int(row['prediction'])
        counts[actual, predicted] += 1

    labels = range(nbusers)
    df_cm = pd.DataFrame(counts, index=labels, columns=labels)

    plt.figure(figsize=(10, 7))
    sn.set(font_scale=1.4)
    sn.heatmap(df_cm, annot=True, annot_kws={"size": 16})
    plt.show()

In [None]:
# Score the fitted model on the training split: F1 plus confusion matrix.
pred = ann.transform(train_df)
ann_f1 = evaluator.evaluate(pred)
print('Train F1 =', ann_f1)
pltConfusion(pred)

In [None]:
# Score the fitted model on the held-out test split: F1 plus confusion matrix.
pred = ann.transform(test_df)
ann_f1 = evaluator.evaluate(pred)
print('Test F1 =', ann_f1)
pltConfusion(pred)