## Objective:
## • Practice Naive Bayes algorithm based classification.
## • Identify the predictors that can be of influence by experiment.

In [1]:
## Importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### 1.Load the kinematics dataset as measured on mobile sensors from the file “run_or_walk.csv”. List out the columns in the dataset.

In [2]:
## Reading data from file into dataframe

df=pd.read_csv(r"./datasets/run_or_walk.csv")

In [3]:
df.head()

Unnamed: 0,date,time,username,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
0,2017-6-30,13:51:15:847724020,viktor,0,0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
1,2017-6-30,13:51:16:246945023,viktor,0,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2,2017-6-30,13:51:16:446233987,viktor,0,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
3,2017-6-30,13:51:16:646117985,viktor,0,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
4,2017-6-30,13:51:16:846738994,viktor,0,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [7]:
## List of all the columns in the dataframe
print(df.columns.to_list())

['date', 'time', 'username', 'wrist', 'activity', 'acceleration_x', 'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z']


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88588 entries, 0 to 88587
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            88588 non-null  object 
 1   time            88588 non-null  object 
 2   username        88588 non-null  object 
 3   wrist           88588 non-null  int64  
 4   activity        88588 non-null  int64  
 5   acceleration_x  88588 non-null  float64
 6   acceleration_y  88588 non-null  float64
 7   acceleration_z  88588 non-null  float64
 8   gyro_x          88588 non-null  float64
 9   gyro_y          88588 non-null  float64
 10  gyro_z          88588 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 7.4+ MB


In [9]:
df.describe()

Unnamed: 0,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
count,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0,88588.0
mean,0.52217,0.500801,-0.074811,-0.562585,-0.313956,0.00416,0.037203,0.022327
std,0.499511,0.500002,1.009299,0.658458,0.486815,1.253423,1.198725,1.914423
min,0.0,0.0,-5.3505,-3.299,-3.7538,-4.4306,-7.4647,-9.48
25%,0.0,0.0,-0.3818,-1.0335,-0.376,-0.9207,-0.644825,-1.345125
50%,1.0,1.0,-0.0595,-0.7591,-0.221,0.0187,0.0393,0.0069
75%,1.0,1.0,0.3555,-0.241775,-0.0859,0.8888,0.7337,1.3982
max,1.0,1.0,5.6033,2.668,1.6403,4.8742,8.498,11.2662


In [15]:
df["activity"].value_counts()

1    44365
0    44223
Name: activity, dtype: int64

### 2. Let the target variable ‘y’ be the activity and assign all the columns after it to ‘x’.

In [16]:
y_col = ["activity"]
x_col = ['wrist','acceleration_x', 'acceleration_y', 'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z']

In [18]:
X = df[x_col]
Y = df[y_col]

In [19]:
Y = np.squeeze(Y)

### 3.Using Scikit-learn fit a Gaussian Naive Bayes model and observe the accuracy. Generate a classification report using scikit learn.

In [36]:
## importing test train split package
from sklearn.model_selection import train_test_split

In [78]:
## Function to perform test rain split
def do_test_train_split(df,y_col,x_col):
    X = df[x_col]
    Y = df[y_col]
    Y = np.squeeze(Y)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20,random_state=48, stratify = Y)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    return (X_train, X_test, y_train, y_test)

In [79]:
X_train, X_test, y_train, y_test = do_test_train_split(df,y_col,x_col)

(70870, 3) (17718, 3) (70870,) (17718,)


In [80]:
## Apply Standardization 
from sklearn.preprocessing import StandardScaler

In [81]:
## Function to apply standardization on dataset
def apply_standardization(X_train,X_test):

    scaler = StandardScaler()
    # Fit on training set only.
    scaler.fit(X_train)
    # Apply transform to both the training set and the test set.
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    return (X_train,X_test)

In [82]:
X_train,X_test = apply_standardization(X_train,X_test)

In [83]:
## importing Gsussina Model
from sklearn.naive_bayes import GaussianNB

In [84]:
## Function to train Naive Bayes Model
def train_model(X_train, y_train):
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    return gnb

In [85]:
gnb = train_model(X_train, y_train)

In [86]:
y_pred = gnb.predict(X_test)

In [87]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [88]:
## function to geneerate model metrics
def generate_metrics(model,X_train, X_test, y_train, y_test,y_pred):
    ## Training Score
    print(f"Accuracy score of Model on Training Data = {round(model.score(X_train, y_train),3)} \n")
    
    print(f"Accuracy score of Model on Test Data = {round(accuracy_score(y_test,y_pred),3)} \n\n")
    
    print(classification_report(y_test, y_pred))
    

In [89]:
generate_metrics(gnb,X_train, X_test, y_train, y_test,y_pred)

Accuracy score of Model on Training Data = 0.957 

Accuracy score of Model on Test Data = 0.959 


              precision    recall  f1-score   support

           0       0.93      0.99      0.96      8845
           1       0.99      0.92      0.96      8873

    accuracy                           0.96     17718
   macro avg       0.96      0.96      0.96     17718
weighted avg       0.96      0.96      0.96     17718



## Accuracy of the model is 0.96

### 4.Repeat the model once using only the acceleration values as predictors and then using only the gyro values as predictors. Comment on the difference in accuracy between both the models.

#### if x columns are only accelerations

In [90]:
## if x columns are only accelerations
y_col = ["activity"]
x_col = ['acceleration_x', 'acceleration_y', 'acceleration_z']

In [91]:
## Perfromiing test rain split
X_train, X_test, y_train, y_test = do_test_train_split(df,y_col,x_col)

(70870, 3) (17718, 3) (70870,) (17718,)


In [92]:
## Performing Standardization on data
X_train,X_test = apply_standardization(X_train,X_test)

In [93]:
## training the model
gnb_acc = train_model(X_train, y_train)

In [94]:
## predicting on test dataset
y_pred = gnb_acc.predict(X_test)

In [95]:
generate_metrics(gnb_acc,X_train, X_test, y_train, y_test, y_pred)

Accuracy score of Model on Training Data = 0.957 

Accuracy score of Model on Test Data = 0.959 


              precision    recall  f1-score   support

           0       0.93      0.99      0.96      8845
           1       0.99      0.92      0.96      8873

    accuracy                           0.96     17718
   macro avg       0.96      0.96      0.96     17718
weighted avg       0.96      0.96      0.96     17718



#### if x columns are only accelerations

In [96]:
y_col = ["activity"]
x_col = ['gyro_x', 'gyro_y', 'gyro_z']

In [97]:
## Perfromiing test rain split
X_train, X_test, y_train, y_test = do_test_train_split(df,y_col,x_col)

(70870, 3) (17718, 3) (70870,) (17718,)


In [98]:
## Performing Standardization on data
X_train,X_test = apply_standardization(X_train,X_test)

In [99]:
## training the model
gnb_gyro = train_model(X_train, y_train)

In [100]:
## predicting on test dataset
y_pred = gnb_gyro.predict(X_test)

In [102]:
generate_metrics(gnb_gyro,X_train, X_test, y_train, y_test, y_pred)

Accuracy score of Model on Training Data = 0.649 

Accuracy score of Model on Test Data = 0.649 


              precision    recall  f1-score   support

           0       0.62      0.75      0.68      8845
           1       0.69      0.55      0.61      8873

    accuracy                           0.65     17718
   macro avg       0.66      0.65      0.65     17718
weighted avg       0.66      0.65      0.65     17718



## The accuracy of model containing all Acceleration columns is greater than the accuracy of model contaning all Gyro columns.
## In the first basic Naive Bayes model we could have removed all Gyro columns and the accuracy would have been equivalent to Acceleartion column model.