In [2]:
import pandas as pd
import statistics
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import metrics
import pickle
import joblib

def main():
    
    # loading original dataset from csv file into pandas dataframe
    
    main_df = pd.read_csv("Train_keystroke.csv") 
    print(main_df.iloc[0,2])

    # creating empty dataframe for new dataset
    
    df = pd.DataFrame({'user': pd.Series(dtype='int'), 
                   'mean_HT': pd.Series(dtype='float'),
                   'mean_PPT': pd.Series(dtype='float'),
                   'mean_RRT': pd.Series(dtype='float'),
                   'mean_RPT': pd.Series(dtype='float'),
                   'sd_HT': pd.Series(dtype='float'),
                   'sd_PPT': pd.Series(dtype='float'),
                   'sd_RRT': pd.Series(dtype='float'),
                   'sd_RPT': pd.Series(dtype='float')})
    print(df)

    # filling new dataframe as training set with mean and stdev values of HT, PPT, RRT, and RPT
    
    for i in range(len(main_df)): 
        j = 0
        HT_values_list = []
        PPT_values_list = []
        RRT_values_list = []
        RPT_values_list = []
        while j < len(main_df.columns) - 3:
            HT_values_list.append(main_df.iloc[i,j+1] - main_df.iloc[i,j])
            PPT_values_list.append(main_df.iloc[i,j+2] - main_df.iloc[i,j])
            RRT_values_list.append(main_df.iloc[i,j+3] - main_df.iloc[i,j+1])
            RPT_values_list.append(main_df.iloc[i,j+2] - main_df.iloc[i,j+1])
 
            if j == len(main_df.columns) - 4:
                HT_values_list.append(main_df.iloc[i,j+3] - main_df.iloc[i,j+2])
            
            j += 2
    
        mean_HT = statistics.mean(HT_values_list)
        mean_PPT = statistics.mean(PPT_values_list)
        mean_RRT = statistics.mean(RRT_values_list)
        mean_RPT = statistics.mean(RPT_values_list)

        sd_HT = statistics.stdev(HT_values_list)
        sd_PPT = statistics.stdev(PPT_values_list)
        sd_RRT = statistics.stdev(RRT_values_list)
        sd_RPT = statistics.stdev(RPT_values_list)

        df = df.append({'user':main_df.iloc[i,0],
                        'mean_HT':mean_HT,
                        'mean_PPT':mean_PPT,
                        'mean_RRT':mean_RRT,
                        'mean_RPT':mean_RPT,
                        'sd_HT':sd_HT,
                        'sd_PPT':sd_PPT,
                        'sd_RRT':sd_RRT,
                        'sd_RPT':sd_RPT},ignore_index=True)

    print(df)
    
    # shuffling or sampling dataset
    
    df = df.sample(frac=1)

    # splitting X and y from training set 
    
    X_train = df.drop(["user"], axis=1)
    y_train = df["user"]
    
    # creating and training SVM model

    clf = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo')
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)

    print(y_pred_train)
    print("Accuracy:",metrics.accuracy_score(y_train, y_pred_train))

    # saving the SVM model

    joblib.dump(clf, open('SVM.pkl', 'wb'))
    
    # loading the SVM model

    clf = joblib.load(open('SVM.pkl', 'rb'))
    y_pred_test = clf.predict(X_train)
    print(y_pred_test)
    print("Accuracy:",metrics.accuracy_score(y_train, y_pred_test))            
            
    # creating and training RF model

    clf_rf = RandomForestClassifier(n_estimators=100)
    clf_rf.fit(X_train,y_train)
    y_pred_train = clf_rf.predict(X_train)

    print(y_pred_train)
    print("Accuracy:",metrics.accuracy_score(y_train, y_pred_train))      

    # saving the RF model

    joblib.dump(clf_rf, open('RF.pkl', 'wb'))

    # loading the RF model

    clf_rf = joblib.load(open('RF.pkl', 'rb'))
    y_pred_test = clf_rf.predict(X_train)
    print(y_pred_test)
    print("Accuracy:",metrics.accuracy_score(y_train, y_pred_test))

    # creating and training XGBoost model

    clf_xg = XGBClassifier()
    for i in range(len(y_train)): y_train.iloc[i] -= 1
    clf_xg.fit(X_train,y_train)
    y_pred_train = clf_xg.predict(X_train)
    for i in range(len(y_pred_train)): y_pred_train[i] += 1
    for i in range(len(y_train)): y_train.iloc[i] += 1    

    print(y_pred_train)
    print("Accuracy:",metrics.accuracy_score(y_train, y_pred_train))      

    # saving the XGBoost model

    joblib.dump(clf_xg, open('XGB.pkl', 'wb'))

    # loading the XGBoost model

    clf_xg = joblib.load(open('XGB.pkl', 'rb'))
    y_pred_test = clf_xg.predict(X_train)
    for i in range(len(y_pred_test)): y_pred_test[i] += 1
    print(y_pred_test)
    print("Accuracy:",metrics.accuracy_score(y_train, y_pred_test))


if __name__ == "__main__":
    main()

120
Empty DataFrame
Columns: [user, mean_HT, mean_PPT, mean_RRT, mean_RPT, sd_HT, sd_PPT, sd_RRT, sd_RPT]
Index: []
      user  mean_HT  mean_PPT  mean_RRT  mean_RPT       sd_HT      sd_PPT  \
0      1.0    147.0     219.0     229.0      72.0   81.565924   78.898669   
1      1.0    103.0     175.0     185.0      72.0   56.964901   47.518417   
2      1.0    146.0     229.0     239.0      82.0  140.929060  152.440152   
3      1.0    251.0     323.0     331.0      72.0  367.597606  357.145629   
4      1.0     93.0     166.0     173.0      73.0   55.982140   50.239427   
..     ...      ...       ...       ...       ...         ...         ...   
875  110.0     70.0     173.0     190.0     102.0  106.390789  114.520740   
876  110.0     95.0     195.0     213.0      99.0  141.499117  146.386475   
877  110.0     51.0     173.0     191.0     121.0   82.885463   96.239285   
878  110.0     59.0     164.0     181.0     105.0  104.144131  120.938001   
879  110.0     60.0     165.0     183

[ 69.  13.  45.  81.  32.  26.  27.  68.  24.  45.  80.  18.  40.  55.
  15.  86.  77.  86.  90.  98.  62.  40.  85. 109.  67.   3.  71.  79.
  81.  79. 100.  94.  99. 107. 104.  19.  55.  81.  85.  85.  89.  30.
   6.  53.  24. 109.   5.  43.  13.  95.   1.  23.  69.  32.  83.  32.
  66.  58.   3.  93.   6.  45.  10.  35.  50.   6.  73.  27.  86.  83.
  20.  89.  58. 109.  24.  90. 110.  72.  40.  65.  47.  21.  44.  39.
  65.  93.  97.  38.  83.  86. 101. 100.  94.  61.   3.  20.  79.  59.
   5.  13.  62. 110. 110.  89. 110.  42.  90.  82.  91.  32.  74.  73.
  59.  84.  88.   6.  26.  21.  64.  23.  98.  97.  28.  96.   4.  21.
  31.  65.  28.  88.  29.  18.  88.   4.   2.  61.  19.  84.  74.  65.
  52.  54.  57. 103.  13.  45.  36.  49.  14.  95.  67.  19.  46.   8.
  71.  97. 102.  62.   4.  94. 107.  22.  53.  90.  75.  26.  74.  72.
  33.  95.  15.   2.  56.  33.  52.  59.  50.  49.  17.  17.  66.   9.
  52.  88.  23.  82.  60.  45.  91.  62.  80.   9.  78.  10.  38.   8.
 103. 

[ 69  13  45  81  32  26  27  68  24  45  80  18  40  55  15  86  77  86
  90  98  62  40  85 109  67   3  71  79  81  79 100  94  99 107 104  19
  55  81  85  85  89  30   6  53  24 109   5  43  13  95   1  23  69  32
  83  32  66  58   3  93   6  45  10  35  50   6  73  27  86  83  20  89
  58 109  24  90 110  72  40  65  47  21  44  39  65  93  97  38  83  86
 101 100  94  61   3  20  79  59   5  13  62 110 110  89 110  42  90  82
  91  32  74  73  59  84  88   6  26  21  64  23  98  97  28  96   4  21
  31  65  28  88  29  18  88   4   2  61  19  84  74  65  52  54  57 103
  13  45  36  49  14  95  67  19  46   8  71  97 102  62   4  94 107  22
  53  90  75  26  74  72  33  95  15   2  56  33  52  59  50  49  17  17
  66   9  52  88  23  82  60  45  91  62  80   9  78  10  38   8 103  51
  25  29  80  41  35  35  74  75 107  92  46   4  43  28  35 106 108  80
  35  20  64   7   1 105  56   3  87  11  22  49  85  51   8  44 107   7
  45  79  50  68  21  35  64  77  61  82  56  56  5