# Importación de librerías

In [1]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score,accuracy_score
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, KNeighborsRegressor

# Lectura de archivo

In [2]:
# Load the keystroke records from SQLite and expand every `vector` string
# into one numeric row: user_id followed by ft_1 ... ft_60.
conn = sqlite3.connect('./data/grey/keystroke.db')
try:
    # Only the columns used below are fetched.
    raw_df = pd.read_sql_query(
        'select user_id, password, vector from keystroke_datas', conn
    )
finally:
    # Release the database handle even if the query raises.
    conn.close()

# Keep only the samples whose password is 'greyc laboratory'.
raw_df = raw_df[raw_df['password'] == 'greyc laboratory']

columns = ["user_id"] + ["ft_" + str(k) for k in range(1, 61)]
n_features = len(columns) - 1

# `vector` is a whitespace-separated string of integers; samples that do not
# contain exactly `n_features` values are discarded.
tempData = []
for user_id, raw_vector in zip(raw_df["user_id"], raw_df["vector"]):
    vector = raw_vector.split()
    if len(vector) == n_features:
        tempData.append([user_id] + list(map(int, vector)))

df = pd.DataFrame(tempData, columns=columns)

# Dataset procesado

In [3]:
# Show the processed dataset: one row per sample, user_id plus ft_1..ft_60.
df

Unnamed: 0,user_id,ft_1,ft_2,ft_3,ft_4,ft_5,ft_6,ft_7,ft_8,ft_9,...,ft_51,ft_52,ft_53,ft_54,ft_55,ft_56,ft_57,ft_58,ft_59,ft_60
0,1,2203168,600864,1101584,1602304,801152,2303312,1001440,2603744,1402016,...,3304752,2002880,3605184,2002880,2804032,2403456,2904176,2403456,2303312,2103024
1,1,2103024,500720,2703888,1602304,2103024,3404896,1402016,2603744,1402016,...,4105904,2303312,3505040,2103024,2203168,2603744,2804032,2603744,3104464,2103024
2,1,2203168,701008,1402016,1301872,2303312,3304752,901296,3004320,1502160,...,4005760,1902736,3905616,2203168,2303312,3104464,2603744,2203168,2203168,2603744
3,1,2103024,701008,1902736,1702448,1602304,2603744,1502160,3004320,1502160,...,3404896,2703888,4005760,2203168,2303312,2403456,3104464,2103024,2904176,2403456
4,1,2002880,600864,1201728,1602304,1502160,3004320,1101584,2904176,1602304,...,3605184,2203168,3905616,2303312,2603744,3104464,2603744,1902736,2503600,2203168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7543,125,3905616,3104464,4105904,3004320,3605184,3505040,3404896,3505040,3605184,...,4706768,4606624,4706768,4806912,5207488,4105904,4706768,4706768,7510800,4506480
7544,125,3304752,3004320,4907056,3505040,3605184,5307632,3304752,4806912,3905616,...,6208928,4206048,5608064,4907056,4506480,4606624,4506480,5407776,4105904,5007200
7545,125,3805472,2804032,3605184,3805472,3505040,3805472,3505040,3605184,3805472,...,4806912,4506480,4606624,4806912,5507920,4005760,5207488,4506480,4206048,4206048
7546,125,3705328,2904176,12718288,4005760,4506480,4206048,3805472,5107344,3705328,...,5107344,4606624,6008640,4706768,4806912,6809792,6509360,4606624,4606624,5107344


# Separación de la data de entrenamiento y de prueba
Se toma de forma aleatoria el 80% de los registros de cada usuario como data de entrenamiento, mientras que el 20% restante pasa a formar la data de prueba.

In [4]:
# Per-user hold-out split: 80% of each user's rows go to training and the
# remaining rows to testing, so every user appears in both sets.
subjects = df["user_id"].unique()

train_users = []
test_users = []

for subject in subjects:
    genuine_user_data = df.loc[df.user_id == subject, :]

    # A fixed random_state keeps the shuffle reproducible across runs.
    train, test_genuine = train_test_split(
        genuine_user_data, train_size=0.8, random_state=43, shuffle=True
    )

    train_users.append(train)
    test_users.append(test_genuine)

# NOTE: despite their names, X is the complete training frame and y the
# complete test frame; both still contain the user_id column.
X = pd.concat(train_users)
y = pd.concat(test_users)

# Column 0 is user_id (the label); the remaining columns are the features.
X_train = X.iloc[:, 1:]
y_train = X["user_id"]

X_test = y.iloc[:, 1:]
y_test = y["user_id"]

# Entrenamiento del modelo KNeighbors Classifier

In [5]:
# k-nearest-neighbours classifiers, one per distance metric.
# The neighbour count is set to the number of distinct users.
knn_k = len(subjects)

# Euclidean distance
clf_euclidean = KNeighborsClassifier(n_neighbors=knn_k, metric='euclidean').fit(X_train, y_train)

# Manhattan distance
clf_manhattan = KNeighborsClassifier(n_neighbors=knn_k, metric='manhattan').fit(X_train, y_train)

# Predicted user_id for every test sample, per metric.
y_pred_euclidean = clf_euclidean.predict(X_test)
y_pred_manhattan = clf_manhattan.predict(X_test)

# Resultados del modelo KNeighbors Classifier

In [6]:
# Report test-set accuracy, as a percentage rounded to two decimals, for the
# predictions currently held in y_pred_euclidean / y_pred_manhattan.
acc_euclidean = accuracy_score(y_test, y_pred_euclidean)
acc_manhattan = accuracy_score(y_test, y_pred_manhattan)
print("Accuracy euclidean:", round(acc_euclidean * 100, 2), "%")
print("Accuracy manhattan:", round(acc_manhattan * 100, 2), "%")

Accuracy euclidean: 44.41 %
Accuracy manhattan: 59.25 %


# Entrenamiento del modelo NearestCentroid

In [7]:
# Nearest-centroid classifiers, one per distance metric.
# NOTE: this rebinds clf_euclidean / clf_manhattan and y_pred_euclidean /
# y_pred_manhattan, replacing the values assigned by the KNeighborsClassifier cell.
clf_euclidean = NearestCentroid(metric='euclidean')
clf_manhattan = NearestCentroid(metric='manhattan')

clf_euclidean.fit(X_train, y_train)
clf_manhattan.fit(X_train, y_train)

y_pred_euclidean = clf_euclidean.predict(X_test)
y_pred_manhattan = clf_manhattan.predict(X_test)

# Resultados del modelo NearestCentroid

In [8]:
# Print the test-set accuracy (percentage, two decimals) for each metric's
# current predictions.
for label, predicted in (
    ("Accuracy euclidean:", y_pred_euclidean),
    ("Accuracy manhattan:", y_pred_manhattan),
):
    print(label, round(accuracy_score(y_test, predicted) * 100, 2), "%")

Accuracy euclidean: 52.06 %
Accuracy manhattan: 75.71 %
