<a href="https://colab.research.google.com/github/arushi-lu/deep_learning/blob/main/CNN_wisdm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
!pip install pandas



In [32]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from scipy import stats

In [46]:
# Open and read the raw data file
file = open('/content/drive/MyDrive/WISDM_ar_v1.1_raw.txt')
lines = file.readlines()

processedList = []

# Process each line in the raw data file
for i, line in enumerate(lines):
    try:
        # Split each line into components
        line = line.split(',')
        # Extract and clean the z-axis value
        last = line[5].split(';')[0].strip()
        if last == '':
            break
        # Append the cleaned data to the list
        temp = [line[0], line[1], line[2], line[3], line[4], last]
        processedList.append(temp)
    except:
        print('Error at line number: ', i)

# Define column names and create a DataFrame
columns = ['user', 'activity', 'time', 'x', 'y', 'z']
data = pd.DataFrame(data=processedList, columns=columns)

Error at line number:  281873
Error at line number:  281874
Error at line number:  281875


In [47]:
# Display the first few rows and basic information about the dataset
print(data.head())

  user activity            time            x          y            z
0   33  Jogging  49105962326000   -0.6946377  12.680544   0.50395286
1   33  Jogging  49106062271000     5.012288  11.264028   0.95342433
2   33  Jogging  49106112167000     4.903325  10.882658  -0.08172209
3   33  Jogging  49106222305000  -0.61291564  18.496431    3.0237172
4   33  Jogging  49106332290000   -1.1849703  12.108489     7.205164


In [48]:
print(data.shape)

(343416, 6)


In [49]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343416 entries, 0 to 343415
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user      343416 non-null  object
 1   activity  343416 non-null  object
 2   time      343416 non-null  object
 3   x         343416 non-null  object
 4   y         343416 non-null  object
 5   z         343416 non-null  object
dtypes: object(6)
memory usage: 15.7+ MB
None


In [50]:
print(data.isnull().sum())

user        0
activity    0
time        0
x           0
y           0
z           0
dtype: int64


In [51]:
# Convert x, y, z columns to float
data['x'] = data['x'].astype('float')
data['y'] = data['y'].astype('float')
data['z'] = data['z'].astype('float')

# Sampling frequency (not used here but could be used later)
Fs = 20

# Get the list of activities
activities = data['activity'].value_counts().index
print(activities)

Index(['Walking', 'Jogging', 'Upstairs', 'Downstairs', 'Sitting', 'Standing'], dtype='object', name='activity')


In [52]:
# Drop user and time columns for further processing
df = data.drop(['user', 'time'], axis=1).copy()
print(df.head())

  activity         x          y         z
0  Jogging -0.694638  12.680544  0.503953
1  Jogging  5.012288  11.264028  0.953424
2  Jogging  4.903325  10.882658 -0.081722
3  Jogging -0.612916  18.496431  3.023717
4  Jogging -1.184970  12.108489  7.205164


In [53]:
# Display the count of each activity
print(df['activity'].value_counts())

activity
Walking       137375
Jogging       129392
Upstairs       35137
Downstairs     33358
Sitting         4599
Standing        3555
Name: count, dtype: int64


In [54]:
# Balance the dataset by sampling an equal number of records for each activity
Walking = df[df['activity'] == 'Walking'].head(3555).copy()
Jogging = df[df['activity'] == 'Jogging'].head(3555).copy()
Upstairs = df[df['activity'] == 'Upstairs'].head(3555).copy()
Downstairs = df[df['activity'] == 'Downstairs'].head(3555).copy()
Sitting = df[df['activity'] == 'Sitting'].head(3555).copy()
Standing = df[df['activity'] == 'Standing'].copy()

balanced_data = pd.DataFrame()
balanced_data = pd.concat([balanced_data, Walking, Jogging, Upstairs, Downstairs, Sitting, Standing])

# Display the shape and value counts of the balanced dataset
print(balanced_data.shape)

(21330, 4)


In [55]:
print(balanced_data['activity'].value_counts())


activity
Walking       3555
Jogging       3555
Upstairs      3555
Downstairs    3555
Sitting       3555
Standing      3555
Name: count, dtype: int64


In [56]:
print(balanced_data.head())

    activity         x          y         z
597  Walking  0.844462   8.008764  2.792171
598  Walking  1.116869   8.621680  3.786457
599  Walking -0.503953  16.657684  1.307553
600  Walking  4.794363  10.760075 -1.184970
601  Walking -0.040861   9.234595 -0.694638


In [57]:
# Encode the activity labels
label = LabelEncoder()
balanced_data['label'] = label.fit_transform(balanced_data['activity'])
print(balanced_data.head())

    activity         x          y         z  label
597  Walking  0.844462   8.008764  2.792171      5
598  Walking  1.116869   8.621680  3.786457      5
599  Walking -0.503953  16.657684  1.307553      5
600  Walking  4.794363  10.760075 -1.184970      5
601  Walking -0.040861   9.234595 -0.694638      5


In [40]:
# Display the classes of the labels
print(label.classes_)

['Downstairs' 'Jogging' 'Sitting' 'Standing' 'Upstairs' 'Walking']


In [58]:
# Extract features and labels
X = balanced_data[['x', 'y', 'z']]
y = balanced_data['label']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Create a DataFrame for scaled features
scaled_X = pd.DataFrame(data=X, columns=['x', 'y', 'z'])
scaled_X['label'] = y.values
print(scaled_X.head())

          x         y         z  label
0  0.000503 -0.099190  0.337933      5
1  0.073590  0.020386  0.633446      5
2 -0.361275  1.588160 -0.103312      5
3  1.060258  0.437573 -0.844119      5
4 -0.237028  0.139962 -0.698386      5


In [59]:
scaled_X

Unnamed: 0,x,y,z,label
0,0.000503,-0.099190,0.337933,5
1,0.073590,0.020386,0.633446,5
2,-0.361275,1.588160,-0.103312,5
3,1.060258,0.437573,-0.844119,5
4,-0.237028,0.139962,-0.698386,5
...,...,...,...,...
21325,-0.470217,0.178084,0.261019,3
21326,-0.542658,0.193692,0.248875,3
21327,-0.628514,0.197593,0.261019,3
21328,-0.781444,0.049322,0.155768,3


In [65]:
# Define frame and hop sizes
Fs = 20
frame_size = Fs * 4  # 80 samples
hop_size = Fs * 2    # 40 samples

def get_frames(df, frame_size, hop_size):
    N_FEATURES = 3

    frames = []
    labels = []
    for i in range(0, len(df) - frame_size, hop_size):
        x = df['x'].values[i: i + frame_size]
        y = df['y'].values[i: i + frame_size]
        z = df['z'].values[i: i + frame_size]

        # Retrieve the most frequent label in the frame
        label = stats.mode(df['label'][i: i + frame_size])[0]
        # Convert the LabelEncoder object to a Series

        frames.append([x, y, z])
        labels.append(label)

    # Reshape frames to be compatible with CNN input
    frames = np.asarray(frames).reshape(-1, frame_size, N_FEATURES)
    labels = np.asarray(labels)

    return frames, labels

# Segment the data into frames
X, y = get_frames(scaled_X, frame_size, hop_size)
print(X.shape, y.shape)


(532, 80, 3) (532,)


In [66]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split

# Custom Dataset class for PyTorch
class WISDMDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, label

# Convert to PyTorch Datasets
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)
dataset = WISDMDataset(X_tensor, y_tensor)

# Split into training and testing datasets
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [70]:
print("Training dataset size:", len(train_dataset))
print("Testing dataset size:", len(test_dataset))

Training dataset size: 425
Testing dataset size: 107


In [71]:
# Print shapes of batches
for batch in train_loader:
    X_batch, y_batch = batch
    print("Training batch - Input shape:", X_batch.shape, "Label shape:", y_batch.shape)
    break  # Print only the first batch for demonstration

for batch in test_loader:
    X_batch, y_batch = batch
    print("Testing batch - Input shape:", X_batch.shape, "Label shape:", y_batch.shape)
    break  # Print only the first batch for demonstration

Training batch - Input shape: torch.Size([64, 80, 3]) Label shape: torch.Size([64])
Testing batch - Input shape: torch.Size([64, 80, 3]) Label shape: torch.Size([64])


In [72]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [73]:
# Define the CNN model
class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=3, out_channels=16, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=5, stride=1, padding=2)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(32 * 20, 128)  # Calculate input size based on your frame_size
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(-1, 32 * 20)  # Flatten the output of conv layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [74]:
# Initialize the model
model = SimpleCNN(num_classes=6)  # Assuming 6 classes (activities)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Function to calculate accuracy
def accuracy(outputs, labels):
    _, predicted = torch.max(outputs, 1)
    correct = (predicted == labels).sum().item()
    return correct / labels.size(0)

In [79]:
num_epochs = 200
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs.permute(0, 2, 1))  # Permute input dimensions for Conv1d
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Track the accuracy and loss
        running_loss += loss.item()
        correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()
        total_predictions += labels.size(0)

    # Calculate train accuracy and loss
    train_accuracy = correct_predictions / total_predictions
    avg_loss = running_loss / len(train_loader)

    # Print statistics every epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

Epoch [1/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [2/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [3/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [4/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [5/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [6/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [7/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [8/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [9/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [10/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [11/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [12/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [13/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [14/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [15/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [16/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch [17/200], Train Loss: 0.0001, Train Accuracy: 1.0000
Epoch 

In [80]:
# Evaluation on the test set
model.eval()  # Set the model to evaluation mode
test_correct = 0
test_total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.permute(0, 2, 1))
        _, predicted = torch.max(outputs, 1)
        test_correct += (predicted == labels).sum().item()
        test_total += labels.size(0)

# Calculate test accuracy
test_accuracy = test_correct / test_total
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Accuracy: 0.9720
