In [1]:
import cv2
import numpy as np
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# ================== Memory-Saving Parameters ==================
input_shape = (512, 512, 3)  # Reduced from 256x256
batch_size = 4             # Reduced from 32
latent_dim = 16              # More efficient compression
epochs = 1                  # Reduced epochs for initial testing

# ================== Data Generator ==================
class FrameGenerator:
    def __init__(self, video_path, batch_size):
        self.cap = cv2.VideoCapture(video_path)
        self.batch_size = batch_size
        self.frame_count = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
        
    def __len__(self):
        return self.frame_count // self.batch_size
    
    def __call__(self):
        while True:
            batch = []
            for _ in range(self.batch_size):
                ret, frame = self.cap.read()
                if not ret:
                    self.cap.release()
                    return
                frame = cv2.resize(frame, (input_shape[1], input_shape[0]))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                batch.append(frame.astype(np.float32) / 255.0)
            yield np.array(batch), np.array(batch)

# ================== Simplified Autoencoder with Skip Connections ==================
def build_autoencoder():
    inputs = Input(shape=input_shape)
    
    # Encoder
    x1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = MaxPooling2D((2, 2), padding='same')(x1)
    x2 = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), padding='same')(x2)
    encoded = Conv2D(latent_dim, (3, 3), activation='relu', padding='same')(x)
    
    # Decoder with skip connections
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(encoded)
    x = UpSampling2D((2, 2))(x)
    x = concatenate([x, x2])  # Skip connection
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    x = concatenate([x, x1])  # Skip connection
    decoded = Conv2D(3, (3, 3), activation='sigmoid', padding='same')(x)
    
    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer=Adam(learning_rate=1e-4), loss='mse')
    return autoencoder

# ================== Training with Generator ==================
def main():
    # Initialize generator
    generator = FrameGenerator('input.mp4', batch_size)
    
    # Build and train model
    autoencoder = build_autoencoder()
    autoencoder.fit(
        generator(),
        epochs=epochs,
        steps_per_epoch=len(generator),
        shuffle=True
    )
    
    # Reconstruct video
    process_video(autoencoder, './input.mp4', './output.mp4')

# ================== Memory-Efficient Video Processing ==================
def process_video(model, input_path, output_path):
    cap = cv2.VideoCapture(input_path)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, 30.0, (input_shape[1], input_shape[0]))
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
            
        # Process frame
        processed = cv2.resize(frame, (input_shape[1], input_shape[0]))
        processed = cv2.cvtColor(processed, cv2.COLOR_BGR2RGB)
        processed = np.expand_dims(processed, axis=0) / 255.0
        
        # Predict and write
        reconstructed = model.predict(processed)[0]
        reconstructed = (reconstructed * 255).astype(np.uint8)
        out.write(cv2.cvtColor(reconstructed, cv2.COLOR_RGB2BGR))
    
    cap.release()
    out.release()

if __name__ == "__main__":
    main()
    print("Processing complete! Output saved as output.mp4")

Processing complete! Output saved as output.mp4


# aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa

# ####################################

In [2]:
import cv2
import os

# Video file name
video_path = "./input.mp4"
output_folder = "./frames"

# Create the output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Open the video file
cap = cv2.VideoCapture(video_path)

frame_number = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break  # Stop if no more frames

    # Save frame as an image
    frame_filename = os.path.join(output_folder, f"frame_{frame_number:05d}.jpg")
    cv2.imwrite(frame_filename, frame)
    
    frame_number += 1

cap.release()
cv2.destroyAllWindows()

print(f"Extracted {frame_number} frames and saved in '{output_folder}'")


Extracted 374 frames and saved in './frames'


In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision.transforms import functional as F
from torchvision.io import read_image
from torchmetrics.image import PeakSignalNoiseRatio, StructuralSimilarityIndexMeasure
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

from PIL import Image

class PaddedImageDataset(Dataset):
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.image_files = [os.path.join(root_dir, f) for f in os.listdir(root_dir) 
                          if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        img_path = self.image_files[idx]
        try:
            # Use PIL to check if the image is valid
            with Image.open(img_path) as img:
                img.verify()  # Verify that the file is not corrupted
            image = read_image(img_path).float() / 255.0
            padded = F.pad(image, (0, 0, 0, 8), padding_mode='reflect')
            return padded, image
        except Exception as e:
            print(f"Error loading image {img_path}: {e}")
            # Return a dummy image or skip this file
            dummy_image = torch.zeros((3, 1088, 1920))  # Adjust dimensions as needed
            return dummy_image, dummy_image[:, :1080, :]

class CompressionAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 512, 3, stride=2, padding=1),
            nn.ReLU(),
        )
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(512, 256, 3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(256, 128, 3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 3, 3, stride=2, padding=1, output_padding=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

class AutoencoderSystem(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = CompressionAutoencoder()
        self.criterion = nn.MSELoss()
        self.psnr = PeakSignalNoiseRatio(data_range=1.0)
        self.ssim = StructuralSimilarityIndexMeasure(data_range=1.0)

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        outputs = self.model(inputs)
        outputs_cropped = outputs[:, :, :1080, :]  # Remove padding
        loss = self.criterion(outputs_cropped, targets)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        outputs = self.model(inputs)
        outputs_cropped = outputs[:, :, :1080, :]
        
        loss = self.criterion(outputs_cropped, targets)
        psnr = self.psnr(outputs_cropped, targets)
        ssim = self.ssim(outputs_cropped, targets)
        
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_psnr', psnr, prog_bar=True)
        self.log('val_ssim', ssim, prog_bar=True)
        return {'val_loss': loss, 'val_psnr': psnr, 'val_ssim': ssim}

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-3)

class ImageDataModule(pl.LightningDataModule):
    def __init__(self, data_dir='frames', batch_size=4):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

    def setup(self, stage=None):
        full_dataset = PaddedImageDataset(self.data_dir)
        train_size = int(0.8 * len(full_dataset))
        val_size = len(full_dataset) - train_size
        self.train_dataset, self.val_dataset = random_split(
            full_dataset, [train_size, val_size])

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, 
                         shuffle=True, num_workers=4, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size,
                         shuffle=False, num_workers=4, pin_memory=True)

def main():
    data_module = ImageDataModule(data_dir='./frames', batch_size=2)
    
    model = AutoencoderSystem()
    
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath='checkpoints',
        filename='autoencoder-{epoch:02d}-{val_loss:.2f}',
        save_top_k=3,
        mode='min',
    )
    
    trainer = pl.Trainer(
        max_epochs=50,
        callbacks=[checkpoint_callback],
        logger=TensorBoardLogger('logs/'),
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
        devices=1,
    )
    
    trainer.fit(model, datamodule=data_module)

if __name__ == '__main__':
    main()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name      | Type                             | Params | Mode 
-----------------------------------------------------------------------
0 | model     | CompressionAutoencoder           | 3.1 M  | train
1 | criterion | MSELoss                          | 0      | train
2 | psnr      | PeakSignalNoiseRatio             | 0      | train
3 | ssim      | StructuralSimilarityIndexMeasure | 0      | train
-----------------------------------------------------------------------
3.1 M     Trainable params
0         Non-trainable params
3.1 M     Total params
12.406    Total estimated model params size (MB)
22        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\anish\anaconda3\envs\tf_env\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:420: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


RuntimeError: DataLoader worker (pid(s) 33484, 29884, 30044, 29272) exited unexpectedly

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
import os

# Define a lighter autoencoder model
def build_autoencoder(input_shape=(512, 960, 3)):  # Reduced resolution
    """
    Builds a convolutional autoencoder with fewer filters for lower memory usage.
    
    Args:
        input_shape (tuple): Shape of input images (height, width, channels).
    
    Returns:
        tf.keras.Model: Compiled autoencoder model.
    """
    inputs = layers.Input(shape=input_shape)

    # Encoder
    x = layers.Conv2D(32, (3, 3), strides=2, padding='same', activation='relu')(inputs)  # 256x480x32
    x = layers.Conv2D(64, (3, 3), strides=2, padding='same', activation='relu')(x)      # 128x240x64
    x = layers.Conv2D(128, (3, 3), strides=2, padding='same', activation='relu')(x)     # 64x120x128
    latent = layers.Conv2D(16, (3, 3), strides=2, padding='same', activation='relu')(x)  # 32x60x16 (latent space)

    # Decoder
    x = layers.Conv2DTranspose(128, (3, 3), strides=2, padding='same', activation='relu')(latent)  # 64x120x128
    x = layers.Conv2DTranspose(64, (3, 3), strides=2, padding='same', activation='relu')(x)       # 128x240x64
    x = layers.Conv2DTranspose(32, (3, 3), strides=2, padding='same', activation='relu')(x)       # 256x480x32
    outputs = layers.Conv2DTranspose(3, (3, 3), strides=2, padding='same', activation='sigmoid')(x)  # 512x960x3

    model = models.Model(inputs, outputs, name='autoencoder')
    return model

# Load and preprocess the dataset
def load_dataset(folder_path, target_size=(512, 960), batch_size=1):
    """
    Loads and resizes images from the folder.
    
    Args:
        folder_path (str): Path to the folder containing images.
        target_size (tuple): Desired (height, width) for resized images.
        batch_size (int): Number of images per batch.
    
    Returns:
        tf.data.Dataset: Dataset yielding (input, target) pairs.
    """
    file_paths = tf.data.Dataset.list_files(os.path.join(folder_path, '*.jpg'), shuffle=True)

    def load_image(file_path):
        img = tf.io.read_file(file_path)
        img = tf.image.decode_image(img, channels=3, expand_animations=False)
        img = tf.image.resize(img, target_size, method='bilinear')  # Resize to target size
        img = tf.cast(img, tf.float32) / 255.0  # Normalize to [0, 1]
        return img

    dataset = file_paths.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda x: (x, x), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

# Main execution
if __name__ == '__main__':
    # Parameters
    INPUT_SHAPE = (512, 960, 3)  # Reduced resolution
    BATCH_SIZE = 1  # Reduced to fit in memory
    EPOCHS = 50
    LEARNING_RATE = 1e-4
    FOLDER_PATH = './frames'

    # Enable mixed precision (optional, if supported)
    tf.keras.mixed_precision.set_global_policy('mixed_float16')

    # Build and compile the model
    autoencoder = build_autoencoder(input_shape=INPUT_SHAPE)
    autoencoder.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='mse'
    )
    autoencoder.summary()

    # Load the dataset
    train_dataset = load_dataset(FOLDER_PATH, target_size=INPUT_SHAPE[:2], batch_size=BATCH_SIZE)

    # Train the model
    try:
        autoencoder.fit(
            train_dataset,
            epochs=EPOCHS,
            verbose=1
        )
    except tf.errors.ResourceExhaustedError as e:
        print(f"OOM Error: {e}. Try reducing batch size or input resolution further.")
        exit(1)

    # Save the model
    autoencoder.save('autoencoder_model.h5')
    print("Model saved as 'autoencoder_model.h5'")

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU, compute capability 8.6
Model: "autoencoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512, 960, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 256, 480, 32)      896       
                                                                 
 conv2d_1 (Conv2D)           (None, 128, 240, 64)      18496     
                                                                 
 conv2d_2 (Conv2D)           (None, 64, 120, 128)      73856     
                                                                 
 conv2d_3 (Conv2D)           (None, 32, 60, 16)        18448    

In [3]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os

# Load and preprocess the dataset
def load_dataset(folder_path, target_size=(512, 960), batch_size=1):
    """
    Loads and resizes images from the folder for visualization.
    
    Args:
        folder_path (str): Path to the folder containing images.
        target_size (tuple): Desired (height, width) for resized images.
        batch_size (int): Number of images per batch.
    
    Returns:
        tf.data.Dataset: Dataset yielding images.
    """
    file_paths = tf.data.Dataset.list_files(os.path.join(folder_path, '*.jpg'), shuffle=True)

    def load_image(file_path):
        img = tf.io.read_file(file_path)
        img = tf.image.decode_image(img, channels=3, expand_animations=False)
        img = tf.image.resize(img, target_size, method='bilinear')
        img = tf.cast(img, tf.float32) / 255.0  # Normalize to [0, 1]
        return img

    dataset = file_paths.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

# Visualization function
def visualize_reconstructions(model, dataset, num_images=3, save_path=None):
    """
    Visualizes original vs. reconstructed images.
    
    Args:
        model (tf.keras.Model): Trained autoencoder model.
        dataset (tf.data.Dataset): Dataset to draw images from.
        num_images (int): Number of image pairs to display.
        save_path (str, optional): Path to save the figure (e.g., 'reconstructions.png').
    """
    # Take a batch from the dataset
    for batch in dataset.take(1):
        original_images = batch  # Single batch of images
        reconstructed_images = model.predict(original_images)

    # Convert to numpy and ensure float32 type, then clip values to [0, 1]
    original_images = original_images.numpy().astype(np.float32)
    reconstructed_images = np.clip(reconstructed_images.astype(np.float32), 0, 1)

    # Plot original and reconstructed images
    fig, axes = plt.subplots(2, num_images, figsize=(num_images * 5, 10))
    for i in range(min(num_images, original_images.shape[0])):
        # Original image
        axes[0, i].imshow(original_images[i])
        axes[0, i].set_title(f"Original {i+1}")
        axes[0, i].axis('off')

        # Reconstructed image
        axes[1, i].imshow(reconstructed_images[i])
        axes[1, i].set_title(f"Reconstructed {i+1}")
        axes[1, i].axis('off')

    plt.tight_layout()

    # Save the figure if a path is provided
    if save_path:
        plt.savefig(save_path)
        print(f"Figure saved to {save_path}")
    else:
        plt.show()

    plt.close(fig)  # Close the figure to free memory

# Main execution
if __name__ == '__main__':
    # Parameters
    INPUT_SHAPE = (512, 960, 3)  # Must match the model's input shape
    FOLDER_PATH = 'frames'       # Folder with original images
    MODEL_PATH = 'autoencoder_model.h5'  # Path to the trained model
    NUM_IMAGES = 3               # Number of images to visualize
    SAVE_PATH = 'reconstructions.png'  # Optional: save the figure

    # Load the trained model
    try:
        autoencoder = tf.keras.models.load_model(MODEL_PATH)
        print(f"Model loaded from {MODEL_PATH}")
    except Exception as e:
        print(f"Error loading model: {e}")
        exit(1)

    # Load the dataset
    viz_dataset = load_dataset(FOLDER_PATH, target_size=INPUT_SHAPE[:2], batch_size=NUM_IMAGES)

    # Visualize original vs reconstructed images
    visualize_reconstructions(autoencoder, viz_dataset, num_images=NUM_IMAGES, save_path=SAVE_PATH)

Model loaded from autoencoder_model.h5
Figure saved to reconstructions.png


In [6]:
import os
import numpy as np
from PIL import Image
import cv2
from sklearn.decomposition import PCA

# Parameters
input_folder = "./frames"
output_folder = "compressed_frames"
output_video = "output_video.mp4"
n_components = 50  # Number of PCA components
subset_size = 50   # Number of frames to fit PCA (must be >= n_components)
frame_size = (1920, 1080)
fps = 30

# Create output folder if it doesn’t exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Load image files
image_files = sorted([f for f in os.listdir(input_folder) if f.endswith(('.png', '.jpg', '.jpeg'))])
n_frames = len(image_files)

# Step 1: Fit PCA on a subset of frames
subset_frames = []
for i in range(min(subset_size, n_frames)):
    img_path = os.path.join(input_folder, image_files[i])
    img = Image.open(img_path).convert('RGB')
    img_array = np.array(img).flatten()  # Shape: (1080*1920*3,)
    subset_frames.append(img_array)

subset_frames = np.array(subset_frames)  # Shape: (subset_size, 1080*1920*3)
pca = PCA(n_components=n_components)
pca.fit(subset_frames)  # Fit PCA on the subset

# Step 2: Process each frame one at a time
height, width, channels = 1080, 1920, 3
for i, file in enumerate(image_files):
    # Load frame
    img_path = os.path.join(input_folder, file)
    img = Image.open(img_path).convert('RGB')
    img_array = np.array(img)  # Shape: (1080, 1920, 3)
    
    # Flatten frame
    flattened_frame = img_array.reshape(1, height * width * channels)  # Shape: (1, 1080*1920*3)
    
    # Apply PCA
    compressed_frame = pca.transform(flattened_frame)  # Compress
    reconstructed_frame = pca.inverse_transform(compressed_frame)  # Reconstruct
    
    # Reshape and clip
    reconstructed_frame = reconstructed_frame.reshape(height, width, channels)
    reconstructed_frame = np.clip(reconstructed_frame, 0, 255).astype(np.uint8)
    
    # Save reconstructed frame
    output_path = os.path.join(output_folder, f"frame_{i:04d}.png")
    Image.fromarray(reconstructed_frame).save(output_path)
    
    print(f"Processed frame {i+1}/{n_frames}")

# Step 3: Create MP4 video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter(output_video, fourcc, fps, frame_size)

for i in range(n_frames):
    frame_path = os.path.join(output_folder, f"frame_{i:04d}.png")
    frame = cv2.imread(frame_path)
    video_writer.write(frame)

video_writer.release()
print(f"Video saved as {output_video}")

MemoryError: Unable to allocate 9.27 GiB for an array with shape (50, 24883200) and data type float64

In [6]:
import cv2

def process_video():
    # Hardcoded file names
    input_file = "input.mp4"
    output_file = "output.mp4"
    
    cap = cv2.VideoCapture(input_file)
    if not cap.isOpened():
        print("Error: Could not open", input_file)
        return
    
    # Get video properties.
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
    
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Processing {frame_count} frames from '{input_file}'...")

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Convert from BGR to YUV.
        yuv = cv2.cvtColor(frame, cv2.COLOR_BGR2YUV)
        y, u, v = cv2.split(yuv)
        
        # Reduce chrominance details by downscaling U and V channels
        u_small = cv2.resize(u, (width // 4, height // 4), interpolation=cv2.INTER_LINEAR)
        v_small = cv2.resize(v, (width // 4, height // 4), interpolation=cv2.INTER_LINEAR)
        u_restored = cv2.resize(u_small, (width, height), interpolation=cv2.INTER_LINEAR)
        v_restored = cv2.resize(v_small, (width, height), interpolation=cv2.INTER_LINEAR)
        
        # Merge the original luminance with the degraded chrominance channels.
        yuv_modified = cv2.merge([y, u_restored, v_restored])
        
        # Convert back to BGR and write the frame.
        frame_modified = cv2.cvtColor(yuv_modified, cv2.COLOR_YUV2BGR)
        out.write(frame_modified)
    
    cap.release()
    out.release()
    print(f"Processing complete. Output saved to '{output_file}'.")

# Run the video processing function.
process_video()


Processing 374 frames from 'input.mp4'...
Processing complete. Output saved to 'output.mp4'.


In [7]:
import cv2

def process_video_compression():
    input_file = "input.mp4"
    output_file = "output.mp4"
    
    cap = cv2.VideoCapture(input_file)
    if not cap.isOpened():
        print("Error: Could not open", input_file)
        return
    
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
    # Using mp4v codec; adjust if needed.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))
    
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Processing {frame_count} frames with simulated compression...")

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Compress the frame to JPEG format at low quality (simulate compression)
        ret_enc, compressed = cv2.imencode('.jpg', frame, [cv2.IMWRITE_JPEG_QUALITY, 30])
        if not ret_enc:
            print("Frame compression failed, skipping frame...")
            continue
        
        # Reconstruct the frame by decoding it back
        frame_reconstructed = cv2.imdecode(compressed, cv2.IMREAD_COLOR)
        
        # Write the reconstructed frame to the output video.
        out.write(frame_reconstructed)
    
    cap.release()
    out.release()
    print(f"Processing complete. Output saved to '{output_file}'.")

process_video_compression()


Processing 374 frames with simulated compression...
Processing complete. Output saved to 'output.mp4'.


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
import numpy as np
import os

# Define a simple VAE model for 64x64 RGB images.
class VAE(nn.Module):
    def __init__(self, latent_dim=32):
        super(VAE, self).__init__()
        self.latent_dim = latent_dim
        
        # Encoder
        self.enc_conv1 = nn.Conv2d(3, 16, 4, stride=2, padding=1)   # 64->32
        self.enc_conv2 = nn.Conv2d(16, 32, 4, stride=2, padding=1)  # 32->16
        self.enc_conv3 = nn.Conv2d(32, 64, 4, stride=2, padding=1)  # 16->8
        self.enc_conv4 = nn.Conv2d(64, 128, 4, stride=2, padding=1) # 8->4
        
        # Fully connected layers for mu and logvar (flattened feature size = 128*4*4 = 2048)
        self.fc_mu = nn.Linear(128 * 4 * 4, latent_dim)
        self.fc_logvar = nn.Linear(128 * 4 * 4, latent_dim)
        
        # Decoder
        self.fc_dec = nn.Linear(latent_dim, 128 * 4 * 4)
        self.dec_conv1 = nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1)  # 4->8
        self.dec_conv2 = nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1)   # 8->16
        self.dec_conv3 = nn.ConvTranspose2d(32, 16, 4, stride=2, padding=1)   # 16->32
        self.dec_conv4 = nn.ConvTranspose2d(16, 3, 4, stride=2, padding=1)    # 32->64

    def encode(self, x):
        h = F.relu(self.enc_conv1(x))
        h = F.relu(self.enc_conv2(h))
        h = F.relu(self.enc_conv3(h))
        h = F.relu(self.enc_conv4(h))
        h = h.view(h.size(0), -1)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h = F.relu(self.fc_dec(z))
        h = h.view(-1, 128, 4, 4)
        h = F.relu(self.dec_conv1(h))
        h = F.relu(self.dec_conv2(h))
        h = F.relu(self.dec_conv3(h))
        h = torch.sigmoid(self.dec_conv4(h))
        return h

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        x_recon = self.decode(z)
        return x_recon, mu, logvar

# Function to convert video frames to latent representations and save them.
def video_to_latent(vae, video_path, latent_save_path, frame_size=(64, 64), device='cpu'):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Cannot open video file:", video_path)
        return
    latents = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print("Processing video into latent space. Total frames:", frame_count)
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Resize frame and convert from BGR to RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_resized = cv2.resize(frame_rgb, frame_size)
        
        # Convert to tensor and normalize.
        frame_tensor = torch.from_numpy(frame_resized).permute(2, 0, 1).float() / 255.0
        frame_tensor = frame_tensor.unsqueeze(0).to(device)
        
        # Get latent representation (using mu).
        with torch.no_grad():
            mu, _ = vae.encode(frame_tensor)
        latents.append(mu.cpu().numpy())
    
    cap.release()
    latents = np.concatenate(latents, axis=0)  # shape: (num_frames, latent_dim)
    np.save(latent_save_path, latents)
    print("Latent representations saved to", latent_save_path)
    return latents

# Function to reconstruct video from saved latent representations.
def latent_to_video(vae, latent_file, output_video_path, frame_size=(64, 64), fps=25, device='cpu'):
    latents = np.load(latent_file)
    num_frames = latents.shape[0]
    print("Reconstructing video from latent representations. Total frames:", num_frames)
    
    # Set up the VideoWriter.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)
    
    for i in range(num_frames):
        latent = torch.tensor(latents[i:i+1], dtype=torch.float).to(device)
        with torch.no_grad():
            recon = vae.decode(latent)
        # Convert tensor to image format.
        recon_img = recon.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255
        recon_img = np.clip(recon_img, 0, 255).astype(np.uint8)
        # Convert RGB back to BGR.
        recon_img = cv2.cvtColor(recon_img, cv2.COLOR_RGB2BGR)
        out.write(recon_img)
    out.release()
    print("Reconstructed video saved to", output_video_path)

# Example usage.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
vae = VAE(latent_dim=32).to(device)

# NOTE: In a real project, you must train your VAE on video frames first.
# For this demonstration, we're using an untrained model (which means quality won't be good).

# Compress video into latent space.
latent_file = "latents.npy"
video_to_latent(vae, "input.mp4", latent_file, frame_size=(64, 64), device=device)

# Check the file size of the latent representations.
latent_size = os.path.getsize(latent_file)
print("Latent representation file size:", latent_size, "bytes")

# Reconstruct video from the latent representations.
latent_to_video(vae, latent_file, "reconstructed_output.mp4", frame_size=(64, 64), fps=25, device=device)


Processing video into latent space. Total frames: 374
Latent representations saved to latents.npy
Latent representation file size: 48000 bytes
Reconstructing video from latent representations. Total frames: 374
Reconstructed video saved to reconstructed_output.mp4


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader
import os

# Define the VAE architecture.
class VAE(nn.Module):
    def __init__(self, latent_dim=32):
        super(VAE, self).__init__()
        self.latent_dim = latent_dim
        
        # Encoder: downscaling from 64x64 to 4x4 feature maps.
        self.enc_conv1 = nn.Conv2d(3, 16, 4, stride=2, padding=1)   # 64->32
        self.enc_conv2 = nn.Conv2d(16, 32, 4, stride=2, padding=1)  # 32->16
        self.enc_conv3 = nn.Conv2d(32, 64, 4, stride=2, padding=1)  # 16->8
        self.enc_conv4 = nn.Conv2d(64, 128, 4, stride=2, padding=1) # 8->4
        
        # Fully connected layers to produce mu and logvar.
        self.fc_mu = nn.Linear(128 * 4 * 4, latent_dim)
        self.fc_logvar = nn.Linear(128 * 4 * 4, latent_dim)
        
        # Decoder: from latent vector back to image.
        self.fc_dec = nn.Linear(latent_dim, 128 * 4 * 4)
        self.dec_conv1 = nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1)  # 4->8
        self.dec_conv2 = nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1)   # 8->16
        self.dec_conv3 = nn.ConvTranspose2d(32, 16, 4, stride=2, padding=1)   # 16->32
        self.dec_conv4 = nn.ConvTranspose2d(16, 3, 4, stride=2, padding=1)    # 32->64

    def encode(self, x):
        h = F.relu(self.enc_conv1(x))
        h = F.relu(self.enc_conv2(h))
        h = F.relu(self.enc_conv3(h))
        h = F.relu(self.enc_conv4(h))
        h = h.view(h.size(0), -1)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h = F.relu(self.fc_dec(z))
        h = h.view(-1, 128, 4, 4)
        h = F.relu(self.dec_conv1(h))
        h = F.relu(self.dec_conv2(h))
        h = F.relu(self.dec_conv3(h))
        h = torch.sigmoid(self.dec_conv4(h))
        return h

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        x_recon = self.decode(z)
        return x_recon, mu, logvar

# Create a custom dataset that extracts frames from the video.
class VideoFrameDataset(Dataset):
    def __init__(self, video_path, frame_size=(64, 64)):
        self.frames = []
        self.frame_size = frame_size
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError("Cannot open video file: " + video_path)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Convert BGR to RGB and resize.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_resized = cv2.resize(frame_rgb, self.frame_size)
            # Normalize to [0, 1].
            frame_normalized = frame_resized.astype(np.float32) / 255.0
            self.frames.append(frame_normalized)
        cap.release()
    
    def __len__(self):
        return len(self.frames)
    
    def __getitem__(self, idx):
        frame = self.frames[idx]
        frame_tensor = torch.from_numpy(frame).permute(2, 0, 1)  # (H, W, C) -> (C, H, W)
        return frame_tensor

# Define the VAE loss: reconstruction (MSE) + KL divergence.
def loss_function(recon_x, x, mu, logvar):
    # Reconstruction loss (sum over all pixels).
    recon_loss = F.mse_loss(recon_x, x, reduction='sum')
    # KL divergence loss.
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + kl_loss

# Training loop for the VAE.
def train_vae(vae, dataloader, num_epochs=20, learning_rate=1e-3, device='cpu'):
    optimizer = optim.Adam(vae.parameters(), lr=learning_rate)
    vae.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            batch = batch.to(device)
            optimizer.zero_grad()
            recon_batch, mu, logvar = vae(batch)
            loss = loss_function(recon_batch, batch, mu, logvar)
            loss.backward()
            total_loss += loss.item()
            optimizer.step()
        avg_loss = total_loss / len(dataloader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Convert the video into latent representations (using the encoder) and save them.
def video_to_latent(vae, video_path, latent_save_path, frame_size=(64, 64), device='cpu'):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Cannot open video file:", video_path)
        return
    latents = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print("Encoding video into latent space. Total frames:", frame_count)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_resized = cv2.resize(frame_rgb, frame_size)
        frame_tensor = torch.from_numpy(frame_resized).permute(2, 0, 1).float() / 255.0
        frame_tensor = frame_tensor.unsqueeze(0).to(device)
        with torch.no_grad():
            mu, _ = vae.encode(frame_tensor)
        latents.append(mu.cpu().numpy())
    cap.release()
    latents = np.concatenate(latents, axis=0)  # Shape: (num_frames, latent_dim)
    np.save(latent_save_path, latents)
    print("Latent representations saved to", latent_save_path)
    return latents

# Reconstruct a video from the saved latent representations.
def latent_to_video(vae, latent_file, output_video_path, frame_size=(64, 64), fps=25, device='cpu'):
    latents = np.load(latent_file)
    num_frames = latents.shape[0]
    print("Reconstructing video from latent representations. Total frames:", num_frames)
    
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)
    
    for i in range(num_frames):
        latent = torch.tensor(latents[i:i+1], dtype=torch.float).to(device)
        with torch.no_grad():
            recon = vae.decode(latent)
        # Convert tensor to image.
        recon_img = recon.squeeze(0).permute(1, 2, 0).cpu().numpy() * 255
        recon_img = np.clip(recon_img, 0, 255).astype(np.uint8)
        # Convert RGB back to BGR.
        recon_img = cv2.cvtColor(recon_img, cv2.COLOR_RGB2BGR)
        out.write(recon_img)
    out.release()
    print("Reconstructed video saved to", output_video_path)

# Main execution: training and inference.
if __name__ == "__main__":
    # Choose device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    vae = VAE(latent_dim=32).to(device)
    
    # Create dataset and dataloader from the video frames.
    dataset = VideoFrameDataset("input.mp4", frame_size=(64, 64))
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    
    # Train the VAE.
    print("Starting training...")
    train_vae(vae, dataloader, num_epochs=20, learning_rate=1e-3, device=device)
    
    # Optionally, save the trained model.
    torch.save(vae.state_dict(), "vae_model.pth")
    print("Model saved to vae_model.pth")
    
    # Convert the video into latent representations and save to disk.
    latent_file = "latents.npy"
    video_to_latent(vae, "input.mp4", latent_file, frame_size=(64, 64), device=device)
    latent_size = os.path.getsize(latent_file)
    print("Latent representation file size:", latent_size, "bytes")
    
    # Reconstruct the video from the latent representations.
    latent_to_video(vae, latent_file, "reconstructed_output.mp4", frame_size=(64, 64), fps=25, device=device)


Starting training...
Epoch 1/20, Loss: 707.8670
Epoch 2/20, Loss: 551.4452
Epoch 3/20, Loss: 492.9435
Epoch 4/20, Loss: 446.2162
Epoch 5/20, Loss: 427.1008
Epoch 6/20, Loss: 421.1564
Epoch 7/20, Loss: 415.6964
Epoch 8/20, Loss: 405.6358
Epoch 9/20, Loss: 396.7384
Epoch 10/20, Loss: 383.3513
Epoch 11/20, Loss: 348.4491
Epoch 12/20, Loss: 329.1100
Epoch 13/20, Loss: 309.2149
Epoch 14/20, Loss: 291.4519
Epoch 15/20, Loss: 267.1868
Epoch 16/20, Loss: 241.2050
Epoch 17/20, Loss: 215.9315
Epoch 18/20, Loss: 193.8725
Epoch 19/20, Loss: 179.9552
Epoch 20/20, Loss: 167.3749
Model saved to vae_model.pth
Encoding video into latent space. Total frames: 374
Latent representations saved to latents.npy
Latent representation file size: 48000 bytes
Reconstructing video from latent representations. Total frames: 374
Reconstructed video saved to reconstructed_output.mp4


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader
import os

# Enhanced VAE with deeper architecture
class VAE(nn.Module):
    def __init__(self, latent_dim=128):
        super(VAE, self).__init__()
        self.latent_dim = latent_dim
        
        # Encoder
        self.enc_conv1 = nn.Conv2d(3, 32, 4, stride=2, padding=1)  # 64x64 -> 32x32
        self.enc_conv2 = nn.Conv2d(32, 32, 3, stride=1, padding=1)
        self.enc_bn1 = nn.BatchNorm2d(32)
        
        self.enc_conv3 = nn.Conv2d(32, 64, 4, stride=2, padding=1)  # 32x32 -> 16x16
        self.enc_conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.enc_bn2 = nn.BatchNorm2d(64)
        
        self.enc_conv5 = nn.Conv2d(64, 128, 4, stride=2, padding=1)  # 16x16 -> 8x8
        self.enc_conv6 = nn.Conv2d(128, 128, 3, stride=1, padding=1)
        self.enc_bn3 = nn.BatchNorm2d(128)
        
        self.enc_conv7 = nn.Conv2d(128, 256, 4, stride=2, padding=1)  # 8x8 -> 4x4
        self.enc_conv8 = nn.Conv2d(256, 256, 3, stride=1, padding=1)
        self.enc_bn4 = nn.BatchNorm2d(256)
        
        # Latent space
        self.fc_mu = nn.Linear(256*4*4, latent_dim)
        self.fc_logvar = nn.Linear(256*4*4, latent_dim)
        
        # Decoder
        self.fc_dec = nn.Linear(latent_dim, 256*4*4)
        
        self.dec_conv1 = nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1)  # 4x4 -> 8x8
        self.dec_conv2 = nn.Conv2d(128, 128, 3, stride=1, padding=1)
        self.dec_bn1 = nn.BatchNorm2d(128)
        
        self.dec_conv3 = nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1)  # 8x8 -> 16x16
        self.dec_conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.dec_bn2 = nn.BatchNorm2d(64)
        
        self.dec_conv5 = nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1)  # 16x16 -> 32x32
        self.dec_conv6 = nn.Conv2d(32, 32, 3, stride=1, padding=1)
        self.dec_bn3 = nn.BatchNorm2d(32)
        
        self.dec_conv7 = nn.ConvTranspose2d(32, 16, 4, stride=2, padding=1)  # 32x32 -> 64x64
        self.dec_conv8 = nn.Conv2d(16, 3, 3, stride=1, padding=1)

        self.init_weights()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)

    def encode(self, x):
        # Encoder forward pass
        h = F.leaky_relu(self.enc_conv1(x), 0.2)
        h = F.leaky_relu(self.enc_bn1(self.enc_conv2(h)), 0.2)
        h = F.leaky_relu(self.enc_conv3(h), 0.2)
        h = F.leaky_relu(self.enc_bn2(self.enc_conv4(h)), 0.2)
        h = F.leaky_relu(self.enc_conv5(h), 0.2)
        h = F.leaky_relu(self.enc_bn3(self.enc_conv6(h)), 0.2)
        h = F.leaky_relu(self.enc_conv7(h), 0.2)
        h = F.leaky_relu(self.enc_bn4(self.enc_conv8(h)), 0.2)
        h = h.view(h.size(0), -1)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        # Decoder forward pass
        h = F.leaky_relu(self.fc_dec(z), 0.2)
        h = h.view(-1, 256, 4, 4)
        h = F.leaky_relu(self.dec_conv1(h), 0.2)
        h = F.leaky_relu(self.dec_bn1(self.dec_conv2(h)), 0.2)
        h = F.leaky_relu(self.dec_conv3(h), 0.2)
        h = F.leaky_relu(self.dec_bn2(self.dec_conv4(h)), 0.2)
        h = F.leaky_relu(self.dec_conv5(h), 0.2)
        h = F.leaky_relu(self.dec_bn3(self.dec_conv6(h)), 0.2)
        h = F.leaky_relu(self.dec_conv7(h), 0.2)
        return torch.sigmoid(self.dec_conv8(h))

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

# Dataset and DataLoader
class VideoFrameDataset(Dataset):
    def __init__(self, video_path, frame_size=(64, 64)):
        self.frames = []
        self.frame_size = frame_size
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError("Cannot open video file: " + video_path)
        
        print(f"Loading video: {video_path}")
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_resized = cv2.resize(frame_rgb, self.frame_size)
            frame_normalized = frame_resized.astype(np.float32) / 255.0
            self.frames.append(frame_normalized)
            frame_count += 1
            if frame_count % 100 == 0:
                print(f"Loaded {frame_count} frames...")
        
        cap.release()
        print(f"Finished loading. Total frames: {frame_count}")
    
    def __len__(self):
        return len(self.frames)
    
    def __getitem__(self, idx):
        frame = self.frames[idx]
        return torch.from_numpy(frame).permute(2, 0, 1)

# Loss function
def loss_function(recon_x, x, mu, logvar):
    BCE = F.binary_cross_entropy(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

# Training loop
def train_vae(vae, dataloader, num_epochs=50, learning_rate=1e-3, device='cuda'):
    optimizer = optim.Adam(vae.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)
    
    vae.train()
    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch+1}/{num_epochs}")
        total_loss = 0
        for batch_idx, batch in enumerate(dataloader):
            batch = batch.to(device)
            optimizer.zero_grad()
            recon_batch, mu, logvar = vae(batch)
            loss = loss_function(recon_batch, batch, mu, logvar)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}")
        
        avg_loss = total_loss / len(dataloader.dataset)
        scheduler.step(avg_loss)
        current_lr = optimizer.param_groups[0]['lr']
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, LR: {current_lr:.2e}')

# Main execution
if __name__ == "__main__":
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")
    
    # Initialize model
    vae = VAE(latent_dim=128).to(device)
    
    # Load dataset
    print("Loading dataset...")
    dataset = VideoFrameDataset("input.mp4", frame_size=(64, 64))
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)
    
    # Train the model
    print("Starting training...")
    train_vae(vae, dataloader, num_epochs=50, learning_rate=3e-4, device=device)
    
    # Save the model
    torch.save(vae.state_dict(), "improved_vae_model.pth")
    print("Model saved to improved_vae_model.pth")

Using device: cpu
Loading dataset...
Loading video: input.mp4
Loaded 100 frames...
Loaded 200 frames...
Loaded 300 frames...
Finished loading. Total frames: 374
Starting training...
Starting epoch 1/50
Epoch 1, Batch 0, Loss: 300004.8438
Epoch 1, Batch 10, Loss: 267183.8438
Epoch 1/50, Loss: 8743.9752, LR: 3.00e-04
Starting epoch 2/50
Epoch 2, Batch 0, Loss: 264572.5625
Epoch 2, Batch 10, Loss: 250934.7031
Epoch 2/50, Loss: 8022.7605, LR: 3.00e-04
Starting epoch 3/50
Epoch 3, Batch 0, Loss: 249717.4688
Epoch 3, Batch 10, Loss: 243038.8594
Epoch 3/50, Loss: 7683.5009, LR: 3.00e-04
Starting epoch 4/50
Epoch 4, Batch 0, Loss: 242437.8125
Epoch 4, Batch 10, Loss: 240148.0156
Epoch 4/50, Loss: 7509.6561, LR: 3.00e-04
Starting epoch 5/50
Epoch 5, Batch 0, Loss: 238242.6875
Epoch 5, Batch 10, Loss: 235813.5625
Epoch 5/50, Loss: 7415.5092, LR: 3.00e-04
Starting epoch 6/50
Epoch 6, Batch 0, Loss: 235611.7500
Epoch 6, Batch 10, Loss: 235423.3281
Epoch 6/50, Loss: 7359.9569, LR: 3.00e-04
Starting

In [None]:
import numpy as np
import os
import cv2
from tensorflow.keras.layers import Input, Conv2D, Conv2DTranspose, ReLU
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import subprocess
import os
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# 1. Data Preparation
def load_and_preprocess_images(folder_path):
    image_files = sorted(os.listdir(folder_path))
    images = []
    for file in image_files:
        img = load_img(os.path.join(folder_path, file), target_size=(1080, 1920))
        img_array = img_to_array(img) / 255.0
        images.append(img_array)
    return np.array(images)

# Load all frames (be careful with memory)
try:
    frames = load_and_preprocess_images('./frames')
except Exception as e:
    print(f"Error loading images: {e}")
    exit()

# 2. Build Autoencoder
def build_autoencoder():
    input_img = Input(shape=(1080, 1920, 3))
    
    # Encoder
    x = Conv2D(32, (3, 3), strides=2, padding='same')(input_img)
    x = ReLU()(x)
    x = Conv2D(64, (3, 3), strides=2, padding='same')(x)
    x = ReLU()(x)
    x = Conv2D(128, (3, 3), strides=2, padding='same')(x)
    encoded = ReLU()(x)
    
    # Decoder
    x = Conv2DTranspose(64, (3, 3), strides=2, padding='same')(encoded)
    x = ReLU()(x)
    x = Conv2DTranspose(32, (3, 3), strides=2, padding='same')(x)
    x = ReLU()(x)
    decoded = Conv2DTranspose(3, (3, 3), strides=2, padding='same', activation='sigmoid')(x)
    
    autoencoder = Model(input_img, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder

autoencoder = build_autoencoder()
autoencoder.summary()

# 3. Train Autoencoder
autoencoder.fit(frames, frames,
                epochs=50,
                batch_size=2,  # Reduce batch size if memory constrained
                shuffle=True,
                validation_split=0.1)

# 4. Save Latent Representations
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.layers[-4].output)
os.makedirs('./latent_vectors', exist_ok=True)

for i, img in enumerate(frames):
    latent = encoder.predict(img[np.newaxis, ...])
    np.save(f'./latent_vectors/frame_{i:04d}.npy', latent)

# 5. Reconstruct Frames
decoder_input = Input(shape=(135, 240, 128))
decoder_layers = autoencoder.layers[-3](decoder_input)
decoder_layers = autoencoder.layers[-2](decoder_layers)
decoder_layers = autoencoder.layers[-1](decoder_layers)
decoder = Model(decoder_input, decoder_layers)

os.makedirs('./reconstructed_frames', exist_ok=True)

for i in range(len(frames)):
    latent = np.load(f'./latent_vectors/frame_{i:04d}.npy')
    reconstructed = decoder.predict(latent)
    reconstructed_img = (reconstructed[0] * 255).astype('uint8')
    cv2.imwrite(f'./reconstructed_frames/frame_{i:04d}.png', cv2.cvtColor(reconstructed_img, cv2.COLOR_RGB2BGR))

# 6. Create Video
subprocess.run([
    'ffmpeg', '-framerate', '30', '-i', './reconstructed_frames/frame_%04d.png',
    '-c:v', 'libx264', '-pix_fmt', 'yuv420p', '-vf', 'scale=1920:1080',
    'output.mp4'
])

print("Processing complete! Output video saved as output.mp4")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1080, 1920, 3)]   0         
                                                                 
 conv2d (Conv2D)             (None, 540, 960, 32)      896       
                                                                 
 re_lu (ReLU)                (None, 540, 960, 32)      0         
                                                                 
 conv2d_1 (Conv2D)           (None, 270, 480, 64)      18496     
                                                                 
 re_lu_1 (ReLU)              (None, 270, 480, 64)      0         
                                                                 
 conv2d_2 (Conv2D)           (None, 135, 240, 128)     73856     
                                                                 
 re_lu_2 (ReLU)              (None, 135, 240, 128)     0     

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.