# **Name : Zaraar Malik**
# **Roll No # 21i-2705**
# **Section : BS-AI-A**


# **Data Exploration Process**

In [1]:
import os
import cv2
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader,Dataset

warnings.filterwarnings('ignore')

encoder=LabelEncoder()
os.mkdir('trained_Vision_Transformer')

---
# **Finding Total Number of the Classes**
---


In [2]:
path='/kaggle/input/hmdb-human-activity-recognition/HMDB_dataset'
classes=os.listdir(path)
print('Total Number of Classes : ',len(os.listdir(path)))

Total Number of Classes :  51


---
# **Finding Total Number of :**

* Video Samples
* Training Samples
* Testing Samples
---

In [3]:
path='/kaggle/input/hmdb-human-activity-recognition/HMDB_dataset'
dic={}
training=[]
testing=[]
dic['Class Names']=os.listdir(path)
dic['Total Video Samples']=[len(os.listdir(path+'/'+i)) for i in classes]
class_path=[path+'/'+i for i in classes]
for i in classes:
    train=0
    test=0
    for j in os.listdir(path+'/'+i):        
        if 'training' in j:
            train=train+1
        else:
            test=test+1
    training.append(train)
    testing.append(test)
dic['Total Training Samples']=training
dic['Total Testing Samples']=testing
dic['Class Path']=class_path
frame=pd.DataFrame(dic)
frame.head()

Unnamed: 0,Class Names,Total Video Samples,Total Training Samples,Total Testing Samples,Class Path
0,kick_ball,128,103,25,/kaggle/input/hmdb-human-activity-recognition/...
1,catch,102,72,30,/kaggle/input/hmdb-human-activity-recognition/...
2,shoot_ball,131,91,40,/kaggle/input/hmdb-human-activity-recognition/...
3,climb_stairs,112,82,30,/kaggle/input/hmdb-human-activity-recognition/...
4,punch,126,98,28,/kaggle/input/hmdb-human-activity-recognition/...


---
# **Preparing the Training, Validation and Testing Datasets**
---

In [4]:
path='/kaggle/input/hmdb-human-activity-recognition/HMDB_dataset'
train_dict={}
val_dict={}
training=[]
validation=[]
train_label=[]
val_label=[]
class_path=[path+'/'+i for i in classes]
for i in classes:
    for j in os.listdir(path+'/'+i):        
        if 'training' in j:
            training.append(path+'/'+i+'/'+j)
            train_label.append(i)
        else:
            validation.append(path+'/'+i+'/'+j)
            val_label.append(i)            
train_dict['Class Name']=train_label
train_dict['File Path']=training
train_dataset=pd.DataFrame(train_dict)
train_dataset = train_dataset.sample(frac=1).reset_index(drop=True)


val_dict['Class Name']=val_label
val_dict['File Path']=validation
val_dataset=pd.DataFrame(val_dict)

val_data, test_data = train_test_split(val_dataset, test_size=0.5, random_state=42)

validation_dataset = pd.DataFrame(val_data)
test_dataset = pd.DataFrame(test_data)


In [5]:
train_dataset.head()

Unnamed: 0,Class Name,File Path
0,walk,/kaggle/input/hmdb-human-activity-recognition/...
1,walk,/kaggle/input/hmdb-human-activity-recognition/...
2,walk,/kaggle/input/hmdb-human-activity-recognition/...
3,eat,/kaggle/input/hmdb-human-activity-recognition/...
4,stand,/kaggle/input/hmdb-human-activity-recognition/...


In [6]:
validation_dataset.head()

Unnamed: 0,Class Name,File Path
541,dive,/kaggle/input/hmdb-human-activity-recognition/...
1340,draw_sword,/kaggle/input/hmdb-human-activity-recognition/...
480,sword,/kaggle/input/hmdb-human-activity-recognition/...
165,smoke,/kaggle/input/hmdb-human-activity-recognition/...
248,somersault,/kaggle/input/hmdb-human-activity-recognition/...


In [7]:
test_dataset.head()

Unnamed: 0,Class Name,File Path
1520,shoot_bow,/kaggle/input/hmdb-human-activity-recognition/...
621,climb,/kaggle/input/hmdb-human-activity-recognition/...
352,push,/kaggle/input/hmdb-human-activity-recognition/...
353,push,/kaggle/input/hmdb-human-activity-recognition/...
513,kick,/kaggle/input/hmdb-human-activity-recognition/...


---
# **Encode Classes Using Label Encoder**
---

In [8]:
encoded_classes=encoder.fit_transform(classes)
vector_to_label={}
label_to_vector={}
for label,vector in zip(classes,encoded_classes):
    vector_to_label[int(vector)]=label
    label_to_vector[label]=int(vector)

---
# **Sample Processing Code for Video**
---

In [9]:
def video_to_limited_frames(video_path, num_frames=10):
    cap = cv2.VideoCapture(video_path)

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)

    frame_list=[]
    frame_count=0
    saved_frames=0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % interval == 0 and saved_frames < num_frames:
            frame_list.append(frame)
            saved_frames+=1
        frame_count+=1
    cap.release()
    return frame_list

In [10]:
frame_list=video_to_limited_frames('/kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/brush_hair/testing_10.avi',num_frames=10)
print('Total Number of Extracted Frames : ',len(frame_list))

Total Number of Extracted Frames :  10


---
# **Custom Dataset Class**
---

In [11]:
import torch
from torch.utils.data import Dataset
import cv2
import numpy as np
from transformers import ViTFeatureExtractor

class VideoDataset(Dataset):
    def __init__(self, dataframe, label_dict, feature_extractor, num_frames=40):
        """
        Args:
            dataframe (pd.DataFrame): A dataframe with paths to video files and corresponding labels.
            label_dict (dict): A dictionary to map labels to vectors.
            feature_extractor (ViTFeatureExtractor): A ViT feature extractor for preprocessing.
            num_frames (int): Number of frames to extract per video.
            transform (callable, optional): Optional transform to be applied on a frame.
        """
        self.frame = dataframe
        self.label_dict = label_dict
        self.feature_extractor = feature_extractor
        self.num_frames = num_frames


    def __len__(self):
        return len(self.frame)
        
    def _extract_frames(self, video_path):
        """
        Extract a fixed number of frames from a video file.
        """
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        interval = max(total_frames // self.num_frames, 1)
        
        frame_list=[]
        frame_count=0
        saved_frames=0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % interval == 0 and saved_frames < self.num_frames:
                frame_list.append(frame)
                saved_frames+=1
            frame_count+=1
        cap.release()
        return frame_list
        
    def __getitem__(self, idx):
        video_path = self.frame.iloc[idx]['File Path']
        label = self.frame.iloc[idx]['Class Name']
        label = self.label_dict[label]

        if label is None:
            raise ValueError(f"Label '{label}' not found in label_dict")

        frames = self._extract_frames(video_path)

        # Apply feature extraction to frames, resizing as necessary for ViT
        processed_frames = [ self.feature_extractor(images=frame, return_tensors="pt")["pixel_values"].squeeze(0)  for frame in frames ]

        # Stack frames into a single tensor of shape (num_frames, C, H, W)
        frames_tensor = torch.stack(processed_frames)
        label_vector=[torch.tensor(label,dtype=torch.long) for i in range(len(frames_tensor))]
        label_vector=torch.tensor(label_vector)
        
        one_hot_labels = F.one_hot(label_vector, num_classes=len(self.label_dict.keys()))
        label_tensor=torch.tensor(one_hot_labels,dtype=torch.float32,requires_grad=True)
        
        return frames_tensor, label_tensor

---
# **Importing the Vision Transformer**
# **Importing the Feature Exractor for the Vision Transformer**
---

In [None]:
from transformers import ViTForImageClassification, ViTImageProcessor

model_name = "google/vit-base-patch16-224"  # or other ViT model

# Load the feature extractor and model
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name)

num_classes = len(classes)  # Adjust based on your dataset
model.classifier = nn.Sequential(

    nn.Linear(model.classifier.in_features, 256),  # Second additional layer
    nn.ReLU(),  # Activation function
    
    nn.Linear(256, num_classes),  # Final linear layer to output the cla1ss scores
)
for name, param in model.named_parameters():
    if not name.startswith('classifier'):
        param.requires_grad = False
    else:
        param.requires_grad = True  # Only classifier layers should require gradients


---
# **Load Model**
---

In [22]:
PATH='/kaggle/input/latest_model_final/pytorch/default/1/ViT_Epoch_2.pth'
model.load_state_dict(torch.load(PATH, weights_only=True), strict=False)

<All keys matched successfully>

---
# **Checking the Classifier Layers in the Model**
---

In [23]:
model.classifier

Sequential(
  (0): Linear(in_features=768, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=51, bias=True)
)

---
# **Checking Total Number of Trainable Parameters**
---

In [24]:
num_params = sum([p.numel() for p in model.parameters()])
trainable_params = sum([p.numel() for p in model.parameters() if p.requires_grad])

print("Total Model Parameters :  "+"{:,}".format(num_params),'   |   ',"Total Trainable Parameters : "+"{:,}".format(trainable_params))

Total Model Parameters :  86,008,627    |    Total Trainable Parameters : 209,971


---
# **Datasets and Dataloaders**
---

In [25]:
batch_size = 1 
TRAIN_DATASET = VideoDataset(dataframe=train_dataset, label_dict=label_to_vector, feature_extractor=feature_extractor,num_frames=40)
VAL_DATASET   = VideoDataset(dataframe=validation_dataset, label_dict=label_to_vector, feature_extractor=feature_extractor,num_frames=40)
TEST_DATASET  = VideoDataset(dataframe=test_dataset, label_dict=label_to_vector, feature_extractor=feature_extractor,num_frames=40)

TRAIN_LOADER = DataLoader(TRAIN_DATASET, batch_size=batch_size, shuffle=True)
VAL_LOADER   = DataLoader(VAL_DATASET, batch_size=batch_size, shuffle=False)
TEST_LOADER  = DataLoader(TEST_DATASET, batch_size=batch_size, shuffle=False)

In [26]:
# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Define optimizer and loss function
optimizer = optim.Adam(model.classifier.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for classification

# **Experimentation**

In [30]:
# Training parameters
num_epochs = 5
SAVE_EPOCH_LOSS=[]
SAVE_VAL_LOSS=[]

model_name='ViT_Epoch'
PATH='/kaggle/working/trained_Vision_Transformer/'

for epoch in range(num_epochs):
    model.train()  
    running_loss = 0.0
    epoch_loss=0.0
    print('-------------TRAINING-------------')
    for i, (frames, labels) in enumerate(TRAIN_LOADER):

        frames = frames.to(device) 
        labels = labels.to(device)

 
        batch_size, num_frames, C, H, W = frames.shape
        frames = frames.view(batch_size * num_frames, C, H, W)

   
        optimizer.zero_grad()
        outputs = model(frames)
        logits = outputs.logits.view(batch_size, num_frames, -1)
        
        logits=logits.squeeze(0)
        labels=labels.squeeze(0)

        loss = criterion(logits, labels)  
 
        loss.backward() 
        optimizer.step() 
        
        running_loss += loss.item()

        if (i + 1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(TRAIN_LOADER)}], Loss: {running_loss / 100:.4f}")
            epoch_loss=epoch_loss+running_loss
            running_loss = 0.0
    print('-------------VAlIDAION-------------')
    model.eval()
    val_loss=0.0
    for i, (frames, labels) in enumerate (VAL_LOADER):
        
        frames = frames.to(device) 
        labels = labels.to(device)

        batch_size, num_frames, C, H, W = frames.shape
        frames = frames.view(batch_size * num_frames, C, H, W)

        outputs = model(frames)
        logits = outputs.logits.view(batch_size, num_frames, -1)
        
        logits=logits.squeeze(0)
        labels=labels.squeeze(0)

        loss = criterion(logits, labels)  
        running_loss += loss.item()

        if (i + 1) % 50 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(TRAIN_LOADER)}], Loss: {running_loss / 50:.4f}")
            val_loss=val_loss+running_loss
            running_loss = 0.0
    print('\n')
    print(f"Epoch [{epoch+1}/{num_epochs}] , Training Loss: {epoch_loss / len(TRAIN_LOADER):.4f},  Validation Loss:  , {val_loss/len(VAL_LOADER):.4f}")
    print('Saving Model')
    print('\n')
    saving_path=PATH+model_name+'_'+str(epoch)+'.pth'
    torch.save(model.state_dict(), saving_path)

    SAVE_EPOCH_LOSS.append(epoch_loss/len(TRAIN_LOADER))
    SAVE_VAL_LOSS.append(val_loss/len(VAL_LOADER))
    

-------------TRAINING-------------
Epoch [1/5], Step [100/5215], Loss: 0.7832
Epoch [1/5], Step [200/5215], Loss: 0.6472
Epoch [1/5], Step [300/5215], Loss: 0.6902
Epoch [1/5], Step [400/5215], Loss: 0.7593
Epoch [1/5], Step [500/5215], Loss: 0.6833
Epoch [1/5], Step [600/5215], Loss: 0.7291
Epoch [1/5], Step [700/5215], Loss: 0.6583
Epoch [1/5], Step [800/5215], Loss: 0.6489
Epoch [1/5], Step [900/5215], Loss: 0.8157
Epoch [1/5], Step [1000/5215], Loss: 0.7639
Epoch [1/5], Step [1100/5215], Loss: 0.7728
Epoch [1/5], Step [1200/5215], Loss: 0.6944
Epoch [1/5], Step [1300/5215], Loss: 0.6959
Epoch [1/5], Step [1400/5215], Loss: 0.8189
Epoch [1/5], Step [1500/5215], Loss: 0.7746
Epoch [1/5], Step [1600/5215], Loss: 0.6438
Epoch [1/5], Step [1700/5215], Loss: 0.7088
Epoch [1/5], Step [1800/5215], Loss: 0.6219
Epoch [1/5], Step [1900/5215], Loss: 0.7965
Epoch [1/5], Step [2000/5215], Loss: 0.7561
Epoch [1/5], Step [2100/5215], Loss: 0.8366
Epoch [1/5], Step [2200/5215], Loss: 0.8094
Epoch 

---
# **Save the model**
---

In [31]:
model_name='ViT_Epoch_4.pth'
PATH='/kaggle/working/trained_Vision_Transformer/'+model_name
torch.save(model.state_dict(), PATH)

---
# **Test the Model**
---

In [32]:
model_name = "google/vit-base-patch16-224"  # or other ViT model
MODEL = ViTForImageClassification.from_pretrained(model_name)

num_classes = len(classes) 

MODEL.classifier = nn.Sequential(
    nn.Linear(MODEL.classifier.in_features, 256), 
    nn.ReLU(), 
    nn.Linear(256, num_classes),  
)

MODEL.load_state_dict(torch.load(PATH, weights_only=True))


<All keys matched successfully>

In [53]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

MODEL.eval()  # Set the model to evaluation mode

test_loss = 0.0
running_loss = 0.0

all_preds = []  # To store all predictions
all_labels = []  # To store all true labels

for i, (frames, labels) in enumerate(TEST_LOADER):
    frames = frames.to(device)
    labels = labels.to(device)

    batch_size, num_frames, C, H, W = frames.shape
    frames = frames.view(batch_size * num_frames, C, H, W)

    with torch.no_grad():  # Disable gradient computation during evaluation
        outputs = model(frames)
        logits = outputs.logits.view(batch_size, num_frames, -1)

    logits = logits.squeeze(0)  # Adjust dimensions
    labels = labels.squeeze(0)

    # Compute loss
    loss = criterion(logits, labels)
    test_loss += loss.item()
    running_loss += loss.item()

    # Convert logits to predicted classes
    preds = torch.argmax(logits, dim=1)

# Convert one-hot encoded labels to class indices
    labels = torch.argmax(labels, dim=1)

    # Convert logits to predicted classes
    preds = torch.argmax(logits, dim=1)

    # Compute loss
    loss = criterion(logits, labels)
    test_loss += loss.item()
    running_loss += loss.item()

    # Store predictions and labels for metrics calculation
    all_preds.extend(preds.cpu().numpy())
    all_labels.extend(labels.cpu().numpy())

    if (i + 1) % 50 == 0:
        print(f"[{i+1}], Loss: {running_loss / 50:.4f}")
        running_loss = 0.0

# Calculate average test loss
average_test_loss = test_loss / len(TEST_LOADER)

# Calculate metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average="weighted")
precision = precision_score(all_labels, all_preds, average="weighted")
recall = recall_score(all_labels, all_preds, average="weighted")

# Print results
print('\n')
print("Test Loss: ",average_test_loss)
print("Accuracy: ",np.round((accuracy*100),2))
print("F1 Score: ",np.round((f1*100),2))
print("Precision: ",np.round((precision*100),2))
print("Recall: ",np.round((recall*100),2))

[50], Loss: 0.5637
[100], Loss: 0.6513
[150], Loss: 0.5793
[200], Loss: 0.7420
[250], Loss: 0.7880
[300], Loss: 0.6474
[350], Loss: 0.9655
[400], Loss: 0.7393
[450], Loss: 1.4583
[500], Loss: 0.7041
[550], Loss: 1.0194
[600], Loss: 0.7351
[650], Loss: 0.5980
[700], Loss: 0.7031
[750], Loss: 0.6289


Test Loss:  0.7796184924486174
Accuracy:  89.56
F1 Score:  89.24
Precision:  89.73
Recall:  89.56
