In [47]:
import numpy as np
import cv2
from google.colab.patches import cv2_imshow
import torch
import torchvision.transforms as transforms
from torchvision import models
from PIL import Image
import os
from segment_anything import SamPredictor, sam_model_registry
import matplotlib.pyplot as plt
from scenedetect import detect, AdaptiveDetector, split_video_ffmpeg
from google.cloud import speech_v1p1beta1 as speech
from google.oauth2 import service_account
from pydub import AudioSegment

In [None]:
# Install third-party dependencies into the running kernel's environment.
# On a fresh runtime this cell has to run before the import cell, which
# imports scenedetect, segment_anything, google.cloud.speech and pydub.
%pip install -q scenedetect supervision
%pip install -q segment-anything ultralytics
%pip install -q google-cloud-speech google-auth pydub

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be
# reached from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project checkout on Drive; later cells use relative file names.
%cd /content/drive/MyDrive/Github/cis5810

/content/drive/MyDrive/Github


In [36]:
"""
Function to determine camera shot movement (pan, tilt, zoom, or static)
"""

def detect_camera_movement(video_path, motion_threshold=0.5, show=True):
    """Classify the camera movement between consecutive frames of a video.

    Dense Farneback optical flow is computed for every pair of consecutive
    frames and each pair is labelled as pan, tilt, zoom or static. The label
    is printed per frame pair and the per-category counts are printed at the
    end.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        motion_threshold: Minimum mean displacement (pixels per frame) for a
            frame pair to count as moving; anything weaker is "static".
            Heuristic value, tune per footage.
        show: When True (default), display each frame and its colour-coded
            flow map with ``cv2_imshow``.

    Returns:
        A dict with the number of frame pairs per category
        (``'pan'``, ``'tilt'``, ``'zoom'``, ``'static'``), or ``None`` when
        the first frame cannot be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        ret, prev_frame = cap.read()
        if not ret:
            print("Error reading video file")
            return

        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
        # The frame size is constant, so the radial directions are computed once.
        radial_unit = _radial_unit_vectors(*prev_gray.shape)

        movement = {'pan': 0, 'tilt': 0, 'zoom': 0, 'static': 0}

        while True:
            ret, next_frame = cap.read()
            if not ret:
                break

            next_gray = cv2.cvtColor(next_frame, cv2.COLOR_BGR2GRAY)

            flow = cv2.calcOpticalFlowFarneback(prev_gray, next_gray, None,
                                                0.5, 3, 15, 3, 5, 1.2, 0)

            category, movement_type = _classify_flow(flow, radial_unit,
                                                     motion_threshold)
            movement[category] += 1
            print(f"Detected Movement: {movement_type}")

            if show:
                # Flow map: hue encodes direction, brightness encodes magnitude.
                magnitude, angle = cv2.cartToPolar(flow[..., 0], flow[..., 1],
                                                   angleInDegrees=True)
                hsv = np.zeros_like(prev_frame)
                hsv[..., 1] = 255
                hsv[..., 0] = angle / 2  # OpenCV hue range is 0-179
                hsv[..., 2] = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX)
                flow_map = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
                cv2_imshow(next_frame)
                cv2_imshow(flow_map)

            prev_gray = next_gray
    finally:
        # Release the capture on every exit path, including errors.
        # No waitKey/destroyAllWindows: cv2_imshow renders inline, so no
        # HighGUI window (and no key press) ever exists in the notebook.
        cap.release()

    print("Overall Movement Detected: ", movement)
    return movement


def _radial_unit_vectors(height, width):
    """Return (unit_x, unit_y): per-pixel unit vectors pointing away from the image centre."""
    ys, xs = np.mgrid[0:height, 0:width].astype(np.float32)
    dx = xs - (width - 1) / 2.0
    dy = ys - (height - 1) / 2.0
    dist = np.hypot(dx, dy)
    dist[dist == 0] = 1.0  # centre pixel: avoid dividing by zero
    return dx / dist, dy / dist


def _classify_flow(flow, radial_unit, motion_threshold):
    """Map one dense flow field of shape (H, W, 2) to a (category, label) pair.

    Translation is measured by the mean horizontal/vertical flow; zoom is
    measured by the mean flow component along the outward radial direction,
    which is positive when image content spreads away from the centre.
    """
    flow_x = flow[..., 0]
    flow_y = flow[..., 1]
    mean_flow_x = float(np.mean(flow_x))
    mean_flow_y = float(np.mean(flow_y))

    unit_x, unit_y = radial_unit
    mean_radial = float(np.mean(flow_x * unit_x + flow_y * unit_y))

    translation = max(abs(mean_flow_x), abs(mean_flow_y))

    if max(translation, abs(mean_radial)) < motion_threshold:
        return 'static', "Static/No significant movement"
    if abs(mean_radial) > translation:
        return 'zoom', ("Zoom In" if mean_radial > 0 else "Zoom Out")
    if abs(mean_flow_x) >= abs(mean_flow_y):
        return 'pan', ("Pan" if mean_flow_x > 0 else "Reverse Pan")
    return 'tilt', ("Tilt" if mean_flow_y > 0 else "Reverse Tilt")

In [None]:
# Find scene boundaries in the source video with AdaptiveDetector, then let
# ffmpeg cut the video at those boundaries. Later cells read the resulting
# 'tomJerry-Scene-001.mp4' clip.
scene_list = detect('tomJerry.mp4', AdaptiveDetector())
split_video_ffmpeg('tomJerry.mp4', scene_list)

In [None]:

# Load VGG16 with its ImageNet weights. The `weights=` enum replaces the
# `pretrained=True` flag that torchvision deprecated in 0.13; IMAGENET1K_V1 is
# the checkpoint `pretrained=True` used to select.
vgg16 = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)
vgg16.eval()  # Inference mode (turns off dropout in the classifier head)

# Candidate camera-shot labels. These are placeholders: VGG16 itself predicts
# ImageNet classes, not shot types.
class_labels = ['Wide Shot', 'Medium Shot', 'Close-Up', 'Over-the-Shoulder', 'POV', 'Cut-In', 'Establishing Shot']

# Preprocessing expected by the ImageNet-trained network: 224x224 input,
# tensor conversion, and per-channel ImageNet mean/std normalisation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def classify_image(image):
    """Return a placeholder camera-shot label for one image.

    The image is preprocessed with the module-level ``transform`` and run
    through the module-level ``vgg16`` network.

    NOTE: the network predicts an ImageNet class index; folding that index
    onto ``class_labels`` with a modulo is for demonstration only. A real
    shot classifier needs a model trained on shot types (for example a custom
    head on top of VGG16 features).

    Args:
        image: An image accepted by ``transform`` (``classify_video`` passes
            a PIL image).

    Returns:
        One of the strings in ``class_labels``.
    """
    batch = transform(image).unsqueeze(0)  # shape the single image as a batch

    with torch.no_grad():
        logits = vgg16(batch)

    top_index = logits.argmax(dim=1).item()
    return class_labels[top_index % len(class_labels)]

def classify_video(video_path, frame_interval=30):
    """Classify every ``frame_interval``-th frame of a video with ``classify_image``.

    Each sampled frame is converted from OpenCV's BGR layout to an RGB PIL
    image, classified, and reported on stdout.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        frame_interval: Sample one frame out of this many (frame 0 included).

    Returns:
        A list of ``(frame_index, label)`` tuples for the sampled frames;
        empty when the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    results = []
    try:
        if not cap.isOpened():
            # Report the failure instead of silently doing nothing.
            print("Error reading video file")
            return results

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                label = classify_image(image)
                print(f"Frame {frame_count}: Classified as: {label}")
                results.append((frame_count, label))

            frame_count += 1
    finally:
        # Release the capture even if classification raises. No
        # destroyAllWindows: this function never opens a HighGUI window.
        cap.release()

    return results

# Label one frame out of every 30 in the Scene-001 clip.
classify_video('tomJerry-Scene-001.mp4', frame_interval=30)

In [None]:
import cv2
import numpy as np
import supervision as sv
from segment_anything import SamPredictor, sam_model_registry
import matplotlib.pyplot as plt

# Build the ViT-B Segment Anything model from the checkpoint stored one
# directory above the working directory, and wrap it in a predictor.
sam = sam_model_registry["vit_b"](checkpoint="../sam_vit_b_01ec64.pth")
sam_predictor = SamPredictor(sam_model=sam)

def detect_objects_in_video(video_path, frame_interval=30):
    """Overlay SAM masks on every ``frame_interval``-th frame of a video.

    Each sampled frame is passed to the module-level ``sam_predictor``; the
    returned masks are blended onto a copy of the frame in random colours and
    the result is displayed with matplotlib.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        frame_interval: Sample one frame out of this many (frame 0 included).
    """
    cap = cv2.VideoCapture(video_path)
    frame_count = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                # OpenCV decodes frames as BGR, while set_image assumes RGB
                # unless told otherwise; declare the format so the predictor
                # reorders the channels itself.
                sam_predictor.set_image(frame, image_format="BGR")

                # NOTE(review): no point or box prompt is supplied, so the
                # returned mask is not tied to any particular object. If the
                # goal is whole-frame segmentation, SamAutomaticMaskGenerator
                # looks like the intended entry point; confirm intent.
                masks, _, _ = sam_predictor.predict(
                    point_coords=None,
                    point_labels=None,
                    box=None,
                    multimask_output=False
                )

                # Blend each mask onto the frame at 50% opacity.
                annotated_frame = frame.copy()
                for mask in masks:
                    # The upper bound is exclusive: 256 allows the full 0-255 range.
                    color = np.random.randint(0, 256, size=(3,), dtype=np.uint8)
                    mask = mask.astype(bool)
                    annotated_frame[mask] = cv2.addWeighted(annotated_frame, 0.5, np.full_like(annotated_frame, color), 0.5, 0)[mask]

                # matplotlib expects RGB, the annotated frame is still BGR.
                annotated_frame_rgb = cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB)

                plt.figure(figsize=(10, 6))
                plt.imshow(annotated_frame_rgb)
                plt.axis('off')
                plt.show()

            frame_count += 1
    finally:
        # Release the capture even if prediction or plotting raises.
        cap.release()

# Run the SAM overlay on one frame out of every 30 in the Scene-001 clip.
video_path = "tomJerry-Scene-001.mp4"
detect_objects_in_video(video_path, frame_interval=30)

In [None]:
import supervision as sv
from ultralytics import YOLO
import numpy as np

# Detector: YOLO loaded from the 'yolov8n.pt' weights file.
model = YOLO('yolov8n.pt')
# Tracker: ByteTrack fills in `tracker_id` on the detections it is given.
tracker = sv.ByteTrack()

# Annotators used by `callback` to draw boxes and text labels on each frame.
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

def callback(frame: np.ndarray, index: int) -> np.ndarray:
    """Detect, track and annotate the objects in one video frame.

    Runs the module-level YOLO ``model`` on the frame, feeds the detections
    to ``tracker``, and draws a box plus a ``#<tracker id>`` label for every
    tracked detection on a copy of the frame.

    Args:
        frame: The frame to process; it is not modified.
        index: Frame index supplied by the caller (unused here).

    Returns:
        The annotated copy of the frame.
    """
    prediction = model(frame)[0]
    tracked = tracker.update_with_detections(
        sv.Detections.from_ultralytics(prediction))

    id_labels = [f"#{track_id}" for track_id in tracked.tracker_id]

    boxed = box_annotator.annotate(scene=frame.copy(), detections=tracked)
    return label_annotator.annotate(
        scene=boxed, detections=tracked, labels=id_labels)

# Run `callback` over the Scene-001 clip and write the annotated result to
# 'tomJerry-Scene-001-tracked.mp4' in the working directory.
sv.process_video(
    source_path="tomJerry-Scene-001.mp4",
    target_path="tomJerry-Scene-001-tracked.mp4",
    callback=callback
)

In [None]:
# Extract the clip's audio track as a 44.1 kHz stereo WAV file
# (-vn drops the video stream, -ar sets the sample rate, -ac the channel count).
!ffmpeg -i hunger_games_scene_2.mp4 -ab 160k -ac 2 -ar 44100 -vn hunger_games_scene_2.wav

In [None]:

# Authenticate against Google Cloud Speech-to-Text with a service-account key
# file kept outside the repository checkout.
credentials = service_account.Credentials.from_service_account_file('/content/cis5810-speech-sa-key.json')
client = speech.SpeechClient(credentials=credentials)

# Input WAV and the mono copy that is sent to the API.
# NOTE(review): these paths are relative to the working directory and differ
# from where the ffmpeg cell writes 'hunger_games_scene_2.wav'; confirm that
# the file has been moved here.
audio_file = 'MyDrive/CIS5810/Videos/hunger_games_scene_2.wav'
mono_audio_file = 'MyDrive/CIS5810/Videos/hunger_games_scene_2_mono.wav'

# Down-mix stereo to mono using pydub before upload.
sound = AudioSegment.from_wav(audio_file)
sound = sound.set_channels(1)  # Convert to mono
sound.export(mono_audio_file, format="wav")

# Use a dedicated name for the file handle so `audio` only ever refers to the
# RecognitionAudio request object.
with open(mono_audio_file, 'rb') as wav_file:
    content = wav_file.read()

audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    # Take the rate from the audio itself so the request stays correct if the
    # extraction settings change.
    sample_rate_hertz=sound.frame_rate,
    language_code='en-US'
)

# NOTE(review): synchronous recognize() only accepts short inline audio
# (about one minute). Longer scenes need long_running_recognize with the
# audio uploaded to Cloud Storage.
response = client.recognize(config=config, audio=audio)

# Print the top-ranked transcript of every recognised segment.
for result in response.results:
    print(result.alternatives[0].transcript)

In [55]:
# Stage every change under the current directory.
# NOTE(review): `git add .` also stages generated media written by earlier
# cells (scene clips, tracked video, WAV files) unless they are ignored;
# check .gitignore.
! git add .

In [56]:
# Commit the staged changes to the local repository.
! git commit -m "added Tim's experiments"

On branch master
Your branch is up to date with 'cis5810/master'.

nothing to commit, working tree clean


In [None]:
! f