In [39]:
import glob
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from statistics import mean, median
import pickle

In [2]:
# Read feature csv files and concate as single dataframe
full_path = r'D:/School/IOT/CA2 Assignment/'

path_f = full_path + 'features' # use your path
all_feature_files = glob.glob(path_f + "/*.csv")

df_f = pd.DataFrame()
df_l = pd.DataFrame()

li = []

for filename in all_feature_files:
    df = pd.read_csv(filename, index_col=None, header=0, dtype={'bookingID' : 'object'})
    li.append(df)

df_f = pd.concat(li, axis=0, ignore_index=True)

# Read label csv file
df_l = pd.read_csv(full_path + 'label/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv', index_col=None, header=0, dtype={'bookingID' : 'object'})

In [3]:
# sort according to bookingID & seconds
df_f = df_f.sort_values(['bookingID', 'second']).reset_index(drop=True)
df_l = df_l.sort_values(by='bookingID', ascending=True)

In [4]:
# Data cleaning
# identify all bookingID with conflicting labels (multiple labels) and remove
conflict_trips = df_l.loc[df_l.bookingID.duplicated(), 'bookingID']
df_f = df_f[~df_f['bookingID'].isin(conflict_trips)]
df_l = df_l[~df_l['bookingID'].isin(conflict_trips)]

# filter out GPS data with low accuracy and speed equals -1
df_f = df_f.loc[(df_f.Accuracy <= np.percentile(df_f.Accuracy, 95)) & (df_f.Speed != -1)]

In [5]:
# column names
COL_ACCE = ('acceleration_x', 'acceleration_y', 'acceleration_z')
COL_GYRO = ('gyro_x', 'gyro_y', 'gyro_z')

In [6]:
# Data transformation
# calculate magnitude of acceleration
df_f['acceleration'] = np.sqrt((df_f.loc[:, COL_ACCE] ** 2).sum(axis=1))

# transform gyro readings using Principal Component Analysis
pca_gyro = PCA().fit(df_f.loc[:, COL_GYRO])
df_f['gyro'] = 0 # initialize the column first (the next line of code won't work if not initialized)
df_f['gyro'] = pca_gyro.transform(df_f.loc[:, COL_GYRO])

In [7]:
# drop unused columns
df_f.drop(['Accuracy', 'Bearing', 'second'], axis=1, inplace=True)

In [8]:
df_f.head()

Unnamed: 0,bookingID,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,Speed,acceleration,gyro
0,0,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,3.442991,10.176551,-0.089178
1,0,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,0.228454,10.059553,-0.076696
2,0,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,0.228454,9.503762,-0.032301
3,0,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,0.228454,9.83032,0.002385
4,0,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,0.228454,9.967466,-0.045045


In [9]:
# Join dataframes, use inner join for model building. Use left join to keep bookings with no label as testing file.

df = pd.DataFrame()
df = pd.merge(left=df_f, right=df_l, how='inner')

In [10]:
feature_cols = ['Speed', 'acceleration', 'gyro', 'label', 'acceleration_x', 'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z']

In [11]:
df_avg = pd.DataFrame()

In [12]:
df_avg = df.groupby('bookingID', as_index=False)[feature_cols].mean()

In [13]:
df_avg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19934 entries, 0 to 19933
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bookingID       19934 non-null  object 
 1   Speed           19934 non-null  float64
 2   acceleration    19934 non-null  float64
 3   gyro            19934 non-null  float64
 4   label           19934 non-null  int64  
 5   acceleration_x  19934 non-null  float64
 6   acceleration_y  19934 non-null  float64
 7   acceleration_z  19934 non-null  float64
 8   gyro_x          19934 non-null  float64
 9   gyro_y          19934 non-null  float64
 10  gyro_z          19934 non-null  float64
dtypes: float64(9), int64(1), object(1)
memory usage: 1.8+ MB


In [30]:
y = df_avg['label'].values
X = df_avg.drop(['label'], axis=1).values

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [32]:
lr = LogisticRegression()

In [33]:
lr.fit(X_train, y_train)

LogisticRegression()

In [34]:
test = lr.predict(X_test)

In [37]:
lr.score(X_test, y_test)

0.7562076749435666

In [40]:
pickle.dump(lr, open('model.dat', 'wb'))