In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from mlxtend.plotting import plot_decision_regions
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [22]:
dataset = pd.read_csv(
    './data/ConfLongDemo_JSI.csv',
    sep=',',
    names=["seq", "tag", "timestamp", "date", "x", "y", "z", "activity"])

In [23]:
# Encoding seq
dataset['seq'] = dataset['seq'].astype('category')
dataset['seq'] = dataset['seq'].cat.codes
# Encoding activity
dataset['activity'] = dataset['activity'].astype('category')
dataset['activity'] = dataset['activity'].cat.codes
# Encoding tag
dataset['tag'] = dataset['tag'].astype('category')
dataset['tag'] = dataset['tag'].cat.codes
del dataset['timestamp']
# normalizing seq
dataset['seq'] = (dataset['seq'] -
                  dataset['seq'].mean()) / dataset['seq'].std()
# normalizing tag
dataset['tag'] = (dataset['tag'] -
                  dataset['tag'].mean()) / dataset['tag'].std()
# normalizing x
dataset['x'] = (dataset['x'] - dataset['x'].mean()) / dataset['x'].std()
# normalizing y
dataset['y'] = (dataset['y'] - dataset['y'].mean()) / dataset['y'].std()
# normalizing z
dataset['z'] = (dataset['z'] - dataset['z'].mean()) / dataset['z'].std()

print(dataset)

             seq       tag                     date         x         y  \
0      -1.774225 -1.302171  27.05.2009 14:03:25:127  1.366019  0.412769   
1      -1.774225  1.427813  27.05.2009 14:03:25:183  1.615982  0.177858   
2      -1.774225  0.517818  27.05.2009 14:03:25:210  1.689269  0.273506   
3      -1.774225 -1.302171  27.05.2009 14:03:25:237  1.393201  0.386521   
4      -1.774225 -0.392177  27.05.2009 14:03:25:263  1.651463  0.792755   
...          ...       ...                      ...       ...       ...   
164855  1.501425 -0.392177  27.05.2009 11:50:41:957  0.434528  0.733890   
164856  1.501425 -1.302171  27.05.2009 11:50:41:983  0.628153  0.649793   
164857  1.501425  1.427813  27.05.2009 11:50:42:010  0.412067  0.461913   
164858  1.501425 -0.392177  27.05.2009 11:50:42:063  0.369800  0.494516   
164859  1.501425 -1.302171  27.05.2009 11:50:42:090  0.435095  0.512274   

               z  activity  
0       0.235318        10  
1       2.443221        10  
2       1.45

In [24]:
s = dataset['date']
print(s,type(s))
s = s.tolist()
# this proves that date is same. only time differs
print(len(s))
r = ''
for i in range(len(s)):
    r = (s[i].split())[1]
    r = r.split(':')
    t = r[0]+r[1]+r[2]+r[3]
    s[i] = int(t)
del dataset['date']
dataset['date'] = s
dataset['date'] = (dataset['date'] - dataset['date'].mean()) / dataset['date'].std()

0         27.05.2009 14:03:25:127
1         27.05.2009 14:03:25:183
2         27.05.2009 14:03:25:210
3         27.05.2009 14:03:25:237
4         27.05.2009 14:03:25:263
                   ...           
164855    27.05.2009 11:50:41:957
164856    27.05.2009 11:50:41:983
164857    27.05.2009 11:50:42:010
164858    27.05.2009 11:50:42:063
164859    27.05.2009 11:50:42:090
Name: date, Length: 164860, dtype: object <class 'pandas.core.series.Series'>
164860


In [25]:
print(dataset)
target = dataset['activity']
del dataset['activity']

             seq       tag         x         y         z  activity      date
0      -1.774225 -1.302171  1.366019  0.412769  0.235318        10  0.634624
1      -1.774225  1.427813  1.615982  0.177858  2.443221        10  0.634628
2      -1.774225  0.517818  1.689269  0.273506  1.452321        10  0.634630
3      -1.774225 -1.302171  1.393201  0.386521  0.128646        10  0.634631
4      -1.774225 -0.392177  1.651463  0.792755  0.184251        10  0.634633
...          ...       ...       ...       ...       ...       ...       ...
164855  1.501425 -0.392177  0.434528  0.733890 -0.937181        10 -1.046730
164856  1.501425 -1.302171  0.628153  0.649793 -0.060796        10 -1.046729
164857  1.501425  1.427813  0.412067  0.461913  2.465883        10 -1.046727
164858  1.501425 -0.392177  0.369800  0.494516 -0.957927        10 -1.046723
164859  1.501425 -1.302171  0.435095  0.512274 -0.140938        10 -1.046722

[164860 rows x 7 columns]


In [26]:
x_train, x_test, y_train, y_test = train_test_split(dataset, target, test_size=0.25, random_state=0)

In [27]:
clf = LogisticRegression(penalty='l2', solver='saga',multi_class='multinomial')

In [28]:
clf.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
score = clf.score(x_test,y_test)
print(score)

0.4085405798859639


In [30]:
clf = LogisticRegression(penalty='l1', solver='saga',multi_class='multinomial')
clf.fit(x_train,y_train)
score = clf.score(x_test,y_test)
print(score)

0.4084920538638845


In [31]:
clf = LogisticRegression(penalty='l2', solver='newton-cg',multi_class='multinomial')
clf.fit(x_train,y_train)
score = clf.score(x_test,y_test)
print(score)

0.4085648428970035


In [32]:
clf = LogisticRegression(penalty='l2', solver='lbfgs',multi_class='multinomial')
clf.fit(x_train,y_train)
score = clf.score(x_test,y_test)
print(score)

0.4085405798859639
