In [16]:
import torch
import torchvision
from torchvision import transforms
from torchvision.io import read_video
from PIL import Image
import os
import networkx as nx
from torch_geometric.data import Data
import random
from tqdm import tqdm
import numpy as np

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Frame preprocessing: resize every frame to 256x256, then convert it to a tensor
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

# Load the Faster R-CNN detector with its default weights, place it on the
# selected device, and switch it to inference mode
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT
).to(device)
model.eval()

# Default parameters
num_classes_to_select = 1  # (alternative: 10) number of classes to be selected from each dataset
num_videos_per_class = 1  # (alternative: 8) number of videos to choose from each class
frames_per_second = 5
temporal_threshold = 0.5  # Time threshold (seconds)
spatial_threshold = 20.0  # Spatial distance threshold (in pixels)

# Videos shorter than this many seconds are filtered out
min_video_length = 2

# First of all, download the related open-source datasets and then provide the proper data paths.
# You can combine different datasets as shown below or choose one of them based on your needs.
datasets = {
    'Kinetics': '.../kinetics400_5per/train',
    'UCF-100': '.../UCF101/UCF-101',
    'HMDB51': '.../hmdb51_org/HMDB51',
}

def calculateEuclideanDistance(box1, box2):
    """Return the Euclidean distance between the centers of two boxes.

    Each box is indexed as ``[x_min, y_min, x_max, y_max]``; only the first
    four entries are read. The center of a box is the midpoint of its two
    corner points.
    """
    # Center of the first box
    cx_a = (box1[0] + box1[2]) / 2
    cy_a = (box1[1] + box1[3]) / 2
    # Center of the second box
    cx_b = (box2[0] + box2[2]) / 2
    cy_b = (box2[1] + box2[3]) / 2

    dx = cx_a - cx_b
    dy = cy_a - cy_b
    return np.sqrt(dx ** 2 + dy ** 2)

def processVideo(video_path, model, selected_classes, transform, temporal_threshod, spatial_threshold, activity_label):
    """Build a spatio-temporal object graph from a single video.

    A random subset of frames is run through the detector. Every detection
    that passes the confidence and class filters becomes a node, and two
    nodes are connected when they are close both in time and in space.

    Relies on the module-level ``device``, ``min_video_length`` and
    ``frames_per_second`` settings.

    Args:
        video_path: Path of the video file handed to ``read_video``.
        model: Detection model. It is called with a batch holding one frame
            and must return a list whose first item is a dict with the keys
            'boxes', 'labels' and 'scores'.
        selected_classes: Detector label ids to keep; other detections are
            dropped.
        transform: Callable applied to each PIL frame before detection.
        temporal_threshod: Largest timestamp difference (seconds) for which
            two nodes may be connected. The misspelt name is kept so that
            existing keyword callers keep working.
        spatial_threshold: Largest distance between box centers for which
            two nodes may be connected, measured in the coordinates of the
            transformed frame.
        activity_label: Class id stored as the ``label`` of every node.

    Returns:
        A ``(graph, node_count, edge_count)`` tuple, or ``(None, 0, 0)`` when
        the video is skipped (no usable frame rate, or shorter than
        ``min_video_length`` seconds).
    """
    video, _, info = read_video(video_path)

    # The metadata dict is not guaranteed to carry a frame rate (e.g. a file
    # without a video stream); skip such files instead of crashing.
    fps = info.get('video_fps')
    if not fps:
        print(f"Skipping {video_path}: no video frame rate available")
        return None, 0, 0

    num_frames = video.size(0)
    video_duration = num_frames / fps

    # Video duration control to filter shorter videos
    if video_duration < min_video_length:
        print(f"Skipping {video_path} due to short duation ({video_duration:.2f} seconds)")
        return None, 0, 0

    # NOTE(review): despite its name, frames_per_second acts as a multiplier
    # on fps here, i.e. at most int(fps * frames_per_second) frames are
    # sampled from the whole video -- confirm this is the intended budget.
    sample_size = min(num_frames, int(fps * frames_per_second))
    frame_indices = sorted(random.sample(range(num_frames), sample_size))

    G = nx.Graph()
    node_idx = 0
    nodes_data = []  # (node id, box, timestamp), in non-decreasing time order

    score_threshold = 0.5
    allowed_labels = set(selected_classes)  # O(1) membership tests

    for idx in frame_indices:
        # The decoded video is already laid out as (frame, H, W, C), which is
        # what PIL expects, so the frame is used directly from CPU memory;
        # only the transformed frame is sent to the device.
        frame = Image.fromarray(video[idx].numpy().astype('uint8'), 'RGB')
        batch = transform(frame).unsqueeze(0).to(device)
        with torch.no_grad():
            predictions = model(batch)

        # Detected objects and their properties, converted to plain Python
        # values once per frame
        detections = predictions[0]
        boxes = detections['boxes'].cpu().tolist()
        labels = detections['labels'].cpu().tolist()
        scores = detections['scores'].cpu().tolist()

        timestamp = idx / fps

        # Keep confident detections of the requested classes and add each one
        # as a node
        for box, label, score in zip(boxes, labels, scores):
            if score > score_threshold and label in allowed_labels:
                G.add_node(node_idx, label=activity_label, features=box, timestamp=timestamp)
                nodes_data.append((node_idx, box, timestamp))
                node_idx += 1

    # Adding edges (temporal + spatial knowledge)
    node_count = len(nodes_data)
    for i in range(node_count):
        node_i, box_i, timestamp_i = nodes_data[i]
        for j in range(i + 1, node_count):
            node_j, box_j, timestamp_j = nodes_data[j]

            time_diff = abs(timestamp_i - timestamp_j)
            if time_diff > temporal_threshod:
                # Nodes were appended frame by frame in ascending frame order,
                # so every later node is at least this far away in time.
                break

            spatial_distance = calculateEuclideanDistance(box_i, box_j)
            if spatial_distance <= spatial_threshold:
                edge_weight = time_diff + spatial_distance
                G.add_edge(node_i, node_j, weight=edge_weight)

    return G, node_count, G.number_of_edges()

graphs = []  # one graph per accepted video
labels = []  # class id of each graph in `graphs`
original_class_labels = []  # class ids of the accepted classes
classSource_info = []  # (class id, dataset name, class directory) of each accepted class
videoCounter = 0  # running count of videos handed to processVideo
uniqueClass_id = 0  # next unused class id, shared across all datasets

# Operations for each dataset
for dataset_name, video_dir in datasets.items():
    classDirs = [d for d in os.listdir(video_dir) if os.path.isdir(os.path.join(video_dir, d))]

    # Visit the classes in random order. The first num_classes_to_select
    # entries are the initial picks; later entries act as replacements for
    # classes that turn out not to be usable. Each class is visited at most
    # once, so an accepted class can never be added twice.
    candidateClasses = random.sample(classDirs, len(classDirs))
    selectedClass_dirs = []  # classes tried so far for this dataset
    validClasses = []

    for class_dir in candidateClasses:
        if len(validClasses) >= num_classes_to_select:
            break
        if len(selectedClass_dirs) >= num_classes_to_select:
            print(f"Adding more classes from {dataset_name} to meet the required number of valid classes.")
        selectedClass_dirs.append(class_dir)

        print(f"\n{dataset_name}: Processing class: {class_dir}")
        classVideos = [f for f in os.listdir(os.path.join(video_dir, class_dir)) if f.endswith(('.avi', '.mp4', '.mkv'))]
        activity_label = uniqueClass_id  # id this class receives if it is accepted
        processedVideos = 0
        validGraphs = []
        totalNodes, totalEdges = 0, 0

        for video_file in classVideos:
            if processedVideos >= num_videos_per_class:
                break

            videoCounter += 1
            print(f"Processing video {videoCounter}: {video_file}")
            videoPath = os.path.join(video_dir, class_dir, video_file)
            G, num_nodes, num_edges = processVideo(videoPath, model, list(range(1, 11)), transform, temporal_threshold, spatial_threshold, activity_label)
            if G is not None and num_nodes > 0 and num_edges > 0:
                validGraphs.append(G)
                processedVideos += 1
                totalNodes += num_nodes
                totalEdges += num_edges
            else:
                print(f"Skipping video {video_file} due to processing error or short duration")

        # Only accept the class when enough of its videos produced usable graphs
        if processedVideos == num_videos_per_class and totalNodes > 0 and totalEdges > 0:
            graphs.extend(validGraphs)
            labels.extend([activity_label] * processedVideos)
            original_class_labels.append(activity_label)
            classSource_info.append((activity_label, dataset_name, class_dir))
            validClasses.append(class_dir)
            uniqueClass_id += 1
        else:
            print(f"Class {class_dir} skipped due to insufficient valid videos (found {processedVideos}/{num_videos_per_class})")

    # Every class of the dataset has been tried and there are still too few
    if len(validClasses) < num_classes_to_select:
        print(f"Warning: {dataset_name} provided only {len(validClasses)}/{num_classes_to_select} valid classes.")

if not graphs:
    raise ValueError("No video processing and graph generation possible. Please check the parameters and data.")

# Combine all per-video graphs into a single graph
final_graph = nx.disjoint_union_all(graphs)
# Report the number of nodes and edges
print(f"\nFinal Graph: Total nodes = {final_graph.number_of_nodes()}, Total edges = {final_graph.number_of_edges()}")

# Edge tensors. The empty case is built explicitly so that edge_index always
# has shape (2, E) and edge_attr always has shape (E, 1).
# NOTE(review): each undirected edge is stored once (one direction only);
# confirm whether downstream PyG code expects both directions to be listed.
edge_list = list(final_graph.edges)
if edge_list:
    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous().to(device)
    edge_attr = torch.tensor([final_graph.edges[edge]['weight'] for edge in edge_list], dtype=torch.float).unsqueeze(1).to(device)
else:
    edge_index = torch.empty((2, 0), dtype=torch.long, device=device)
    edge_attr = torch.empty((0, 1), dtype=torch.float, device=device)

# Node features (bounding boxes) and node labels, in node iteration order
# (assumes iteration order matches the integer node ids used in edge_index)
x = torch.tensor([final_graph.nodes[i]['features'] for i in final_graph], dtype=torch.float).to(device)
node_labels = [final_graph.nodes[i]['label'] for i in final_graph]
y = torch.tensor(node_labels, dtype=torch.long).to(device)

# Renumbering class labels (from 0 to num_classes-1). The mapping is applied
# to the plain Python labels so no per-element tensor reads are needed.
uniqueClasses = torch.tensor(original_class_labels).unique(sorted=True).to(device)
numClasses = len(uniqueClasses)
classMapping = {class_id: i for i, class_id in enumerate(uniqueClasses.tolist())}
y_mapped = torch.tensor([classMapping[class_id] for class_id in node_labels], dtype=torch.long).to(device)

print("\nUnique classes with their original dataset information:")
for class_id, source_dataset, class_name in classSource_info:
    mapped_label = classMapping[class_id]
    print(f"Class {class_name} (ID: {class_id}) from {source_dataset} mapped to label {mapped_label}")

print("\nSummary:")
print("Unique classes:", uniqueClasses)
print("Number of classes:", numClasses)
print("Max class label:", uniqueClasses.max().item())
print("Min class label:", uniqueClasses.min().item())

data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y_mapped)

# Convert the tensors to Python lists once, then write the text files
edge_pairs = edge_index.t().tolist()
edge_weights = edge_attr.squeeze(1).tolist()

with open('.../node_labels.txt', 'w') as f:
    for i, label in enumerate(y_mapped.tolist()):
        f.write(f'{i} {label}\n')
with open('.../node_features.txt', 'w') as f:
    for i, features in enumerate(x.tolist()):
        f.write(f'{i} {" ".join(map(str, features))}\n')
with open('.../edges.txt', 'w') as f:
    for source, target in edge_pairs:
        f.write(f'{source} {target}\n')
with open('.../edge_features.txt', 'w') as f:
    for (source, target), weight in zip(edge_pairs, edge_weights):
        f.write(f'{source} {target} {weight}\n')



Kinetics: Processing class: doing laundry
Processing video 1: fPRo-LAbQ0I.mp4

Final Graph: Total nodes = 299, Total edges = 1736

Unique classes with their original dataset information:
Class doing laundry (ID: 0) from Kinetics mapped to label 0

Summary:
Unique classes: tensor([0], device='cuda:0')
Number of classes: 1
Max class label: 0
Min class label: 0
