In [10]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import gc
import os

# Configuration
# Tunable settings for this cell; they are read by the processing code below.
num_frames = 50  # maximum number of frames read from the input video
target_height = 140  # height, in pixels, of the downscaled (low-resolution) frames
# NOTE(review): batch_size is not referenced by the code visible in this cell;
# FrameGenerator always yields a single frame per item.
batch_size = 1  # Minimum batch size for maximum memory efficiency

def extract_frames(video_path, num_frames):
    """Read up to ``num_frames`` frames from the start of a video as RGB images.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to read.

    Returns:
        A list of frames converted from BGR to RGB. The list is shorter than
        ``num_frames`` (possibly empty) when the capture is not open or a read
        fails before enough frames were collected.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while len(frames) < num_frames and cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even if reading or colour conversion raises.
        cap.release()
    return frames

def create_video(frames, output_path, fps, resolution):
    """Encode RGB frames into a video file using the 'mp4v' codec.

    Args:
        frames: Iterable of RGB frames; each is converted to BGR before writing.
        output_path: Destination path passed to ``cv2.VideoWriter``.
        fps: Frame rate of the output video.
        resolution: ``(height, width)`` of the output video.

    Raises:
        IOError: If the video writer could not be opened for ``output_path``.
    """
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    height, width = resolution
    # The writer is given the frame size as (width, height).
    writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    try:
        # Fail loudly instead of silently producing no output file.
        if not writer.isOpened():
            raise IOError(f"Could not open video writer for {output_path!r}")
        for frame in frames:
            writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise/close the writer, even if a frame fails to convert.
        writer.release()

def build_minimal_sr_model(lr_shape, hr_shape):
    """Assemble a very small super-resolution network.

    The graph is: a 3x3 convolution (8 filters, ReLU) on the low-resolution
    input, a bicubic resize to ``hr_shape[:2]``, and a 3x3 convolution with
    3 sigmoid-activated output channels.

    Args:
        lr_shape: Shape passed to ``Input`` for the low-resolution frame.
        hr_shape: Target shape; only its first two entries are used, as the
            size given to ``tf.image.resize``.

    Returns:
        A ``Model`` mapping the low-resolution input to the upscaled output.
    """
    lr_input = Input(shape=lr_shape)

    # Stage 1: shallow feature extraction at low resolution.
    features = Conv2D(8, 3, padding='same', activation='relu')(lr_input)

    # Stage 2: interpolate the feature maps up to the requested output size.
    upscaler = Lambda(
        lambda tensor: tf.image.resize(tensor, hr_shape[:2], method='bicubic')
    )
    upscaled = upscaler(features)

    # Stage 3: project to three channels squashed by a sigmoid.
    sr_output = Conv2D(3, 3, padding='same', activation='sigmoid')(upscaled)

    return Model(lr_input, sr_output)

# ----------------- GPU Configuration -----------------
# Cap the memory TensorFlow may use on the first visible GPU. The RuntimeError
# handler below only reports the problem and lets the cell continue; the
# recorded output of this cell shows that path being taken ("Virtual devices
# cannot be modified after being initialized").
# NOTE(review): presumably the GPU had already been initialised by an earlier
# execution in the same kernel — confirm by running this first after a restart.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Configure GPU with memory limits
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=2048)]  # 2GB limit
        )
    except RuntimeError as e:
        print(f"GPU configuration error: {e}")

# ----------------- Data Pipeline -----------------
class FrameGenerator:
    """Index-addressable source of (low-res, high-res) frame pairs.

    Item ``idx`` is the pair of frames stored at that position, each divided
    by 255.0 and given a leading axis of length one.
    """

    def __init__(self, lr_frames, hr_frames):
        self.lr_frames, self.hr_frames = lr_frames, hr_frames

    def __len__(self):
        # The low-resolution sequence defines the number of samples.
        return len(self.lr_frames)

    def __getitem__(self, idx):
        pair = (self.lr_frames[idx], self.hr_frames[idx])
        lr_batch, hr_batch = [np.expand_dims(frame / 255.0, axis=0) for frame in pair]
        return lr_batch, hr_batch

# ----------------- Processing -----------------
# Extract frames
hr_frames = extract_frames('input.mp4', num_frames)
if not hr_frames:
    raise ValueError("No frames extracted from input video")

# Get video properties
original_height, original_width = hr_frames[0].shape[:2]
cap = cv2.VideoCapture('input.mp4')
try:
    fps = cap.get(cv2.CAP_PROP_FPS)
finally:
    cap.release()
if not fps or fps <= 0:
    # A frame rate of 0 would be passed straight to the video writer below,
    # so fall back to a conventional value and say so.
    print("Warning: could not read FPS from input.mp4, falling back to 30.0")
    fps = 30.0

# Create low-res frames. Every frame is resized to the same target size, so
# the size is computed once instead of on every loop iteration.
aspect_ratio = original_width / original_height
new_width = max(1, int(target_height * aspect_ratio))
lr_frames = []
for frame in hr_frames:
    lr_frame = cv2.resize(frame, (new_width, target_height), interpolation=cv2.INTER_AREA)
    lr_frames.append(lr_frame)

# Create compressed video
create_video(lr_frames, 'compressed.mp4', fps, (target_height, new_width))

# Prepare data generator
train_gen = FrameGenerator(lr_frames, hr_frames)

# Drop this cell's own name for the high-res frames. train_gen keeps a
# reference to them (they are the training targets), so the frame memory
# itself is not released here.
del hr_frames
gc.collect()

# ----------------- Training Setup -----------------
num_epochs = 20  # Reduced epochs

try:
    # Build model with correct output dimensions
    model = build_minimal_sr_model(lr_frames[0].shape, (original_height, original_width, 3))

    # Compile with reduced precision
    # NOTE(review): the global policy is set after the model has been built;
    # presumably the layers created above keep the policy that was active when
    # they were constructed — confirm whether mixed precision is really in use.
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss='mae'
    )

    # Training loop with progress monitoring
    num_batches = len(train_gen)
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        epoch_loss = 0

        for i in range(num_batches):
            x, y = train_gen[i]
            loss = model.train_on_batch(x, y)
            epoch_loss += loss

            # No clear_session()/gc.collect() here: resetting Keras' global
            # state on every batch while the model is still being trained does
            # not free the live model and only adds overhead to each step.

            # Print progress
            if (i + 1) % 10 == 0:
                print(f"Processed {i+1}/{num_batches} frames - Loss: {loss:.4f}")

        # Reclaim Python-level garbage once per epoch instead of once per batch.
        gc.collect()

        # Save model checkpoint
        model.save(f'sr_model_epoch_{epoch+1}.h5')
        print(f"Epoch {epoch+1} completed - Average Loss: {epoch_loss/num_batches:.4f}")

except Exception as e:
    print(f"Training failed: {str(e)}")
    # Re-raise so that later steps do not run against a missing or
    # half-trained model.
    raise

# ----------------- Inference -----------------
predicted = []
for i in range(len(lr_frames)):
    x = np.expand_dims(lr_frames[i] / 255.0, axis=0)
    pred = model.predict(x, verbose=0)
    # Round to the nearest integer and clip to the valid pixel range before the
    # uint8 cast; a bare astype() truncates and would wrap out-of-range values.
    predicted.append(np.clip(np.rint(pred[0] * 255.0), 0, 255).astype(np.uint8))

    # No clear_session() here: the model is still needed for the next frame,
    # so resetting Keras' global state per frame only slows the loop down.

    # Print progress
    if (i + 1) % 10 == 0:
        print(f"Processed {i+1}/{len(lr_frames)} frames for reconstruction")

# Clear memory once, after all frames have been reconstructed
gc.collect()

# Create output video
create_video(predicted, 'output.mp4', fps, (original_height, original_width))

print("Processing complete. Results saved in output.mp4")

GPU configuration error: Virtual devices cannot be modified after being initialized

Epoch 1/20


KeyboardInterrupt: 

In [4]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, Add, Conv2DTranspose, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import VGG19
from tensorflow.keras.callbacks import ReduceLROnPlateau
import gc
import os

# Configuration
# Tunable settings for this cell; they are read by the processing code below.
num_frames = 50  # maximum number of frames read from the input video
target_height = 140  # height, in pixels, of the downscaled (low-resolution) frames
# NOTE(review): batch_size is not referenced by the code visible in this cell;
# FrameGenerator always yields a single frame per item.
batch_size = 1  # Minimum batch size for maximum memory efficiency

# ----------------- Helper Functions -----------------
def extract_frames(video_path, num_frames):
    """Read up to ``num_frames`` frames from the start of a video as RGB images.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Maximum number of frames to read.

    Returns:
        A list of frames converted from BGR to RGB. The list is shorter than
        ``num_frames`` (possibly empty) when the capture is not open or a read
        fails before enough frames were collected.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while len(frames) < num_frames and cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even if reading or colour conversion raises.
        cap.release()
    return frames

def create_video(frames, output_path, fps, resolution):
    """Encode RGB frames into a video file using the 'mp4v' codec.

    Args:
        frames: Iterable of RGB frames; each is converted to BGR before writing.
        output_path: Destination path passed to ``cv2.VideoWriter``.
        fps: Frame rate of the output video.
        resolution: ``(height, width)`` of the output video.

    Raises:
        IOError: If the video writer could not be opened for ``output_path``.
    """
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    height, width = resolution
    # The writer is given the frame size as (width, height).
    writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    try:
        # Fail loudly instead of silently producing no output file.
        if not writer.isOpened():
            raise IOError(f"Could not open video writer for {output_path!r}")
        for frame in frames:
            writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    finally:
        # Always finalise/close the writer, even if a frame fails to convert.
        writer.release()

# ----------------- Improved Model Architecture -----------------
def build_improved_sr_model(lr_shape, hr_shape):
    """Build a residual super-resolution network whose output size follows ``hr_shape``.

    Architecture: a 64-filter 3x3 convolution, four residual blocks (two 3x3
    convolutions plus a skip connection each), two stride-2 transposed
    convolutions, a bicubic resize to ``hr_shape[:2]`` and a final 3-channel
    sigmoid convolution.

    The two transposed convolutions always upsample by exactly 4x; the resize
    step afterwards makes the output match the requested high-resolution size
    for any input/target ratio, so predictions line up with the targets.

    Args:
        lr_shape: Shape passed to ``Input`` for the low-resolution frame.
        hr_shape: Target shape; its first two entries are used as the output
            height and width.

    Returns:
        A ``Model`` mapping the low-resolution input to the upscaled output.
    """
    target_size = tuple(hr_shape[:2])
    inputs = Input(shape=lr_shape)

    # Initial feature extraction
    x = Conv2D(64, 3, padding='same', activation='relu')(inputs)

    # Residual blocks
    for _ in range(4):
        residual = x
        x = Conv2D(64, 3, padding='same', activation='relu')(x)
        x = Conv2D(64, 3, padding='same')(x)
        x = Add()([x, residual])

    # Learned 4x upsampling
    x = Conv2DTranspose(64, 3, strides=2, padding='same', activation='relu')(x)
    x = Conv2DTranspose(64, 3, strides=2, padding='same', activation='relu')(x)

    # Bring the feature maps to the exact target size. This is done before the
    # final sigmoid convolution so the output stays within the sigmoid's range.
    x = Lambda(lambda t: tf.image.resize(t, target_size, method='bicubic'))(x)

    # Final convolution
    outputs = Conv2D(3, 3, padding='same', activation='sigmoid')(x)

    return Model(inputs, outputs)

# ----------------- Perceptual Loss -----------------
# Lazily created VGG19 feature extractor shared by every perceptual_loss() call.
_perceptual_feature_extractor = None


def _get_perceptual_feature_extractor():
    """Return the frozen VGG19 feature extractor, building it on first use."""
    global _perceptual_feature_extractor
    if _perceptual_feature_extractor is None:
        vgg = VGG19(include_top=False, weights='imagenet', input_shape=(None, None, 3))
        extractor = Model(vgg.input, vgg.layers[20].output)
        extractor.trainable = False
        _perceptual_feature_extractor = extractor
    return _perceptual_feature_extractor


def perceptual_loss(y_true, y_pred):
    """Mean squared difference between VGG19 features of target and prediction.

    The feature extractor (VGG19 up to ``layers[20]``, ImageNet weights, frozen)
    is built once and reused, instead of being constructed and having its
    weights loaded again on every call.

    Args:
        y_true: Ground-truth image batch.
        y_pred: Predicted image batch.

    Returns:
        The mean of the squared feature differences.
    """
    # NOTE(review): the images are fed to VGG19 exactly as given; the training
    # data in this notebook is scaled by 1/255 — confirm whether VGG-specific
    # input preprocessing is wanted here.
    vgg = _get_perceptual_feature_extractor()

    true_features = vgg(y_true)
    pred_features = vgg(y_pred)

    return tf.reduce_mean(tf.square(true_features - pred_features))

# ----------------- Data Pipeline -----------------
class FrameGenerator:
    """Index-addressable source of (low-res, high-res) frame pairs.

    Item ``idx`` is the pair of frames stored at that position, each divided
    by 255.0 and given a leading axis of length one.
    """

    def __init__(self, lr_frames, hr_frames):
        self.lr_frames, self.hr_frames = lr_frames, hr_frames

    def __len__(self):
        # The low-resolution sequence defines the number of samples.
        return len(self.lr_frames)

    def __getitem__(self, idx):
        pair = (self.lr_frames[idx], self.hr_frames[idx])
        lr_batch, hr_batch = [np.expand_dims(frame / 255.0, axis=0) for frame in pair]
        return lr_batch, hr_batch

# ----------------- PSNR and SSIM Calculation -----------------
def calculate_psnr(img1, img2):
    """Peak signal-to-noise ratio (dB) between two images on a 0-255 scale.

    Both inputs are converted to float64 before subtraction; subtracting
    unsigned 8-bit arrays directly wraps around and yields a wrong error.

    Args:
        img1: First image (array-like).
        img2: Second image, with the same shape as ``img1``.

    Returns:
        The PSNR in decibels, or ``float('inf')`` when the images are identical.

    Raises:
        ValueError: If the two images do not have the same shape.
    """
    first = np.asarray(img1, dtype=np.float64)
    second = np.asarray(img2, dtype=np.float64)
    if first.shape != second.shape:
        raise ValueError(f"Image shapes differ: {first.shape} vs {second.shape}")
    mse = np.mean((first - second) ** 2)
    if mse == 0:
        return float('inf')
    max_pixel = 255.0
    return 20 * np.log10(max_pixel / np.sqrt(mse))

def calculate_ssim(img1, img2):
    """Mean structural similarity (SSIM) between two images on a 0-255 scale.

    OpenCV's Python bindings do not provide an ``SSIM`` function, so the index
    is computed here from Gaussian-weighted local statistics (11x11 window,
    sigma 1.5) with the usual constants ``(0.01 * 255) ** 2`` and
    ``(0.03 * 255) ** 2``, then averaged over all pixels and channels.

    Args:
        img1: First image (array-like).
        img2: Second image, with the same shape as ``img1``.

    Returns:
        The mean SSIM as a Python float.

    Raises:
        ValueError: If the two images do not have the same shape.
    """
    first = np.asarray(img1, dtype=np.float64)
    second = np.asarray(img2, dtype=np.float64)
    if first.shape != second.shape:
        raise ValueError(f"Image shapes differ: {first.shape} vs {second.shape}")

    c1 = (0.01 * 255.0) ** 2
    c2 = (0.03 * 255.0) ** 2
    window = (11, 11)
    sigma = 1.5

    # Local means, variances and covariance via Gaussian filtering.
    mu1 = cv2.GaussianBlur(first, window, sigma)
    mu2 = cv2.GaussianBlur(second, window, sigma)
    var1 = cv2.GaussianBlur(first * first, window, sigma) - mu1 * mu1
    var2 = cv2.GaussianBlur(second * second, window, sigma) - mu2 * mu2
    cov12 = cv2.GaussianBlur(first * second, window, sigma) - mu1 * mu2

    numerator = (2 * mu1 * mu2 + c1) * (2 * cov12 + c2)
    denominator = (mu1 ** 2 + mu2 ** 2 + c1) * (var1 + var2 + c2)
    return float(np.mean(numerator / denominator))

# ----------------- Processing -----------------
# Extract frames
hr_frames = extract_frames('input.mp4', num_frames)
if not hr_frames:
    raise ValueError("No frames extracted from input video")

# Get video properties
original_height, original_width = hr_frames[0].shape[:2]
cap = cv2.VideoCapture('input.mp4')
try:
    fps = cap.get(cv2.CAP_PROP_FPS)
finally:
    cap.release()
if not fps or fps <= 0:
    # A frame rate of 0 would be passed straight to the video writer below,
    # so fall back to a conventional value and say so.
    print("Warning: could not read FPS from input.mp4, falling back to 30.0")
    fps = 30.0

# Create low-res frames. Every frame is resized to the same target size, so
# the size is computed once instead of on every loop iteration.
aspect_ratio = original_width / original_height
new_width = max(1, int(target_height * aspect_ratio))
lr_frames = []
for frame in hr_frames:
    lr_frame = cv2.resize(frame, (new_width, target_height), interpolation=cv2.INTER_AREA)
    lr_frames.append(lr_frame)

# Create compressed video
create_video(lr_frames, 'compressed.mp4', fps, (target_height, new_width))

# Prepare data generator
train_gen = FrameGenerator(lr_frames, hr_frames)

# Drop this cell's own name for the high-res frames. train_gen keeps a
# reference to them (they are the training targets), so the frame memory
# itself is not released here.
del hr_frames
gc.collect()

# ----------------- Training Setup -----------------
num_epochs = 20  # Reduced epochs

try:
    # Build improved model
    model = build_improved_sr_model(lr_frames[0].shape, (original_height, original_width, 3))

    # Compile with perceptual loss and Adam optimizer
    model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss=perceptual_loss
    )

    # Learning rate scheduler
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, min_lr=1e-6)
    # train_on_batch() does not run callbacks, so the scheduler is attached to
    # the model and driven by hand at the end of every epoch.
    reduce_lr.set_model(model)
    reduce_lr.on_train_begin()

    # Training loop with progress monitoring
    num_batches = len(train_gen)
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        epoch_loss = 0

        for i in range(num_batches):
            x, y = train_gen[i]
            loss = model.train_on_batch(x, y)
            epoch_loss += loss

            # No clear_session()/gc.collect() here: resetting Keras' global
            # state on every batch while the model is still being trained does
            # not free the live model and only adds overhead to each step.

            # Print progress
            if (i + 1) % 10 == 0:
                print(f"Processed {i+1}/{num_batches} frames - Loss: {loss:.4f}")

        average_loss = epoch_loss / num_batches

        # Let the scheduler see this epoch's loss so it can lower the learning rate.
        reduce_lr.on_epoch_end(epoch, logs={'loss': average_loss})

        # Reclaim Python-level garbage once per epoch instead of once per batch.
        gc.collect()

        # Save model checkpoint
        model.save(f'sr_model_epoch_{epoch+1}.h5')
        print(f"Epoch {epoch+1} completed - Average Loss: {average_loss:.4f}")

except Exception as e:
    print(f"Training failed: {str(e)}")
    # Re-raise so that later steps do not run against a missing or
    # half-trained model.
    raise

# ----------------- Inference -----------------
predicted = []
for i in range(len(lr_frames)):
    x = np.expand_dims(lr_frames[i] / 255.0, axis=0)
    pred = model.predict(x, verbose=0)
    # Round to the nearest integer and clip to the valid pixel range before the
    # uint8 cast; a bare astype() truncates and would wrap out-of-range values.
    sr_frame = np.clip(np.rint(pred[0] * 255.0), 0, 255).astype(np.uint8)

    # The output video is written at the original resolution; make sure each
    # frame really has that size before it is handed to the writer.
    if sr_frame.shape[:2] != (original_height, original_width):
        sr_frame = cv2.resize(sr_frame, (original_width, original_height),
                              interpolation=cv2.INTER_CUBIC)
    predicted.append(sr_frame)

    # No clear_session() here: the model is still needed for the next frame,
    # so resetting Keras' global state per frame only slows the loop down.

    # Print progress
    if (i + 1) % 10 == 0:
        print(f"Processed {i+1}/{len(lr_frames)} frames for reconstruction")

# Clear memory once, after all frames have been reconstructed
gc.collect()

# Create output video
create_video(predicted, 'output.mp4', fps, (original_height, original_width))

# Evaluate PSNR and SSIM
# Quality is measured against the original high-resolution frames, not the
# low-resolution inputs (which have a different size). The module-level
# hr_frames name was deleted earlier, but train_gen still holds the frames.
reference_frames = train_gen.hr_frames
psnr_values = []
ssim_values = []
for i in range(len(predicted)):
    reference = reference_frames[i]
    candidate = predicted[i]
    # Both metrics need equally sized images.
    if candidate.shape[:2] != reference.shape[:2]:
        candidate = cv2.resize(candidate, (reference.shape[1], reference.shape[0]),
                               interpolation=cv2.INTER_CUBIC)
    psnr_values.append(calculate_psnr(reference, candidate))
    ssim_values.append(calculate_ssim(reference, candidate))

print(f"Average PSNR: {np.mean(psnr_values):.2f}")
print(f"Average SSIM: {np.mean(ssim_values):.4f}")

print("Processing complete. Results saved in output.mp4")

OpenCV bindings requires "numpy" package.
Install it via command:
    pip install numpy


ModuleNotFoundError: No module named 'numpy'

In [5]:
import tensorflow as tf
import tensorflow_compression as tfc

# -------------------------------
# Helper: GDN and Inverse GDN layers from tfc
# -------------------------------
GDN = tfc.layers.GDN


def IGDN(**kwargs):
    """Create a new inverse-GDN layer (``tfc.layers.GDN`` with ``inverse=True``).

    This is a factory rather than a single layer object, so every ``IGDN()``
    call yields its own layer with its own parameters.

    Args:
        **kwargs: Extra keyword arguments forwarded to ``tfc.layers.GDN``.

    Returns:
        A freshly constructed inverse GDN layer.
    """
    kwargs.setdefault('inverse', True)
    return tfc.layers.GDN(**kwargs)

# -------------------------------
# Module 1: Motion Estimation Network (Pyramid Network)
# -------------------------------
class PyramidMotionEstimator(tf.keras.Model):
    """Single-scale stand-in for the pyramid motion-estimation network.

    Five 7x7 'same'-padded convolutions (32, 64, 32, 16 and 2 filters) are
    applied to the channel-wise concatenation of the two frames. The first four
    use ReLU; the last one is linear and produces the 2-channel flow estimate.
    """

    def __init__(self):
        super().__init__()
        conv = tf.keras.layers.Conv2D
        common = dict(kernel_size=7, padding='same')
        # Only one pyramid level is modelled here instead of a full 5-level pyramid.
        self.conv1 = conv(32, activation='relu', **common)
        self.conv2 = conv(64, activation='relu', **common)
        self.conv3 = conv(32, activation='relu', **common)
        self.conv4 = conv(16, activation='relu', **common)
        self.conv5 = conv(2, activation=None, **common)

    def call(self, current_frame, ref_frame):
        # Stack both frames along the channel axis and run the conv stack.
        features = tf.concat([current_frame, ref_frame], axis=-1)
        for hidden_layer in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = hidden_layer(features)
        # Linear output layer: the estimated motion (flow).
        return self.conv5(features)

# -------------------------------
# Module 2: Motion Compression (Autoencoder)
# -------------------------------
class MotionCompressionAutoencoder(tf.keras.Model):
    """Convolutional autoencoder applied to the estimated motion field.

    The encoder is four stride-2 3x3 convolutions (128 filters) with GDN in
    between; the decoder mirrors it with four stride-2 transposed convolutions
    and inverse GDN, ending in 2 output channels. No quantization is applied.
    """

    def __init__(self):
        super(MotionCompressionAutoencoder, self).__init__()
        # Encoder: 4 conv layers with downsampling (×2 each time)
        self.encoder_layers = [
            tf.keras.layers.Conv2D(128, kernel_size=3, strides=2, padding='same'),
            GDN(),
            tf.keras.layers.Conv2D(128, kernel_size=3, strides=2, padding='same'),
            GDN(),
            tf.keras.layers.Conv2D(128, kernel_size=3, strides=2, padding='same'),
            GDN(),
            tf.keras.layers.Conv2D(128, kernel_size=3, strides=2, padding='same')
        ]
        # Decoder: 4 conv layers with upsampling (×2 each time).
        # Inverse GDN layers are constructed directly with GDN(inverse=True) so
        # that every decoder stage owns its own layer instance.
        self.decoder_layers = [
            tf.keras.layers.Conv2DTranspose(128, kernel_size=3, strides=2, padding='same'),
            GDN(inverse=True),
            tf.keras.layers.Conv2DTranspose(128, kernel_size=3, strides=2, padding='same'),
            GDN(inverse=True),
            tf.keras.layers.Conv2DTranspose(128, kernel_size=3, strides=2, padding='same'),
            GDN(inverse=True),
            tf.keras.layers.Conv2DTranspose(2,   kernel_size=3, strides=2, padding='same')
        ]

    def call(self, motion):
        """Encode and decode ``motion``.

        Returns:
            A tuple ``(decoded_motion, motion_latent)``: the decoder output and
            the encoder output it was decoded from.
        """
        # Encode motion
        x = motion
        for layer in self.encoder_layers:
            x = layer(x)
        # Quantization could be added here (e.g., via rounding or learned quantization)
        motion_latent = x
        # Decode motion latent representation
        x = motion_latent
        for layer in self.decoder_layers:
            x = layer(x)
        # x approximates the compressed motion (flow)
        return x, motion_latent  # return both for potential rate estimation

# -------------------------------
# Module 3: Motion Compensation Network
# -------------------------------
class MotionCompensationNetwork(tf.keras.Model):
    """Predicts the motion-compensated frame from reference, warped frame and motion.

    Four 3x3 'same'-padded convolutions: three with 64 filters and ReLU,
    followed by a linear 3-channel output layer.
    """

    def __init__(self):
        super().__init__()
        conv = tf.keras.layers.Conv2D
        common = dict(kernel_size=3, padding='same')
        self.conv1 = conv(64, activation='relu', **common)
        self.conv2 = conv(64, activation='relu', **common)
        self.conv3 = conv(64, activation='relu', **common)
        self.conv4 = conv(3, activation=None, **common)

    def call(self, ref_frame, warped_frame, motion_comp):
        # All three inputs are joined along the channel axis before the conv stack.
        features = tf.concat([ref_frame, warped_frame, motion_comp], axis=-1)
        for hidden_layer in (self.conv1, self.conv2, self.conv3):
            features = hidden_layer(features)
        # Linear projection to the 3-channel compensated frame.
        return self.conv4(features)

# -------------------------------
# Module 4: Residual Compression (Autoencoder)
# -------------------------------
class ResidualCompressionAutoencoder(tf.keras.Model):
    """Convolutional autoencoder applied to the prediction residual.

    Same layout as the motion autoencoder but with 5x5 kernels: four stride-2
    convolutions (128 filters) with GDN in between, mirrored by four stride-2
    transposed convolutions with inverse GDN, ending in 3 output channels.
    No quantization is applied.
    """

    def __init__(self):
        super(ResidualCompressionAutoencoder, self).__init__()
        # Similar structure as motion compression, but using 5x5 filters.
        # Encoder: 4 conv layers with downsampling
        self.encoder_layers = [
            tf.keras.layers.Conv2D(128, kernel_size=5, strides=2, padding='same'),
            GDN(),
            tf.keras.layers.Conv2D(128, kernel_size=5, strides=2, padding='same'),
            GDN(),
            tf.keras.layers.Conv2D(128, kernel_size=5, strides=2, padding='same'),
            GDN(),
            tf.keras.layers.Conv2D(128, kernel_size=5, strides=2, padding='same')
        ]
        # Decoder: 4 conv layers with upsampling.
        # Inverse GDN layers are constructed directly with GDN(inverse=True) so
        # that every decoder stage owns its own layer instance.
        self.decoder_layers = [
            tf.keras.layers.Conv2DTranspose(128, kernel_size=5, strides=2, padding='same'),
            GDN(inverse=True),
            tf.keras.layers.Conv2DTranspose(128, kernel_size=5, strides=2, padding='same'),
            GDN(inverse=True),
            tf.keras.layers.Conv2DTranspose(128, kernel_size=5, strides=2, padding='same'),
            GDN(inverse=True),
            tf.keras.layers.Conv2DTranspose(3,   kernel_size=5, strides=2, padding='same')
        ]

    def call(self, residual):
        """Encode and decode ``residual``.

        Returns:
            A tuple ``(reconstructed_residual, residual_latent)``: the decoder
            output and the encoder output it was decoded from.
        """
        # Encode residual
        x = residual
        for layer in self.encoder_layers:
            x = layer(x)
        residual_latent = x
        # Quantization could be added here too
        # Decode latent residual representation
        x = residual_latent
        for layer in self.decoder_layers:
            x = layer(x)
        reconstructed_residual = x
        return reconstructed_residual, residual_latent

# -------------------------------
# Full OpenDVC Model
# -------------------------------
class OpenDVCModel(tf.keras.Model):
    """End-to-end pipeline: motion estimation, motion coding, compensation, residual coding.

    ``call`` returns a dict with the reconstructed frame, the estimated and
    decoded flow, and both latents (placeholders for a future rate estimate).
    """

    def __init__(self):
        super().__init__()
        self.motion_estimator = PyramidMotionEstimator()
        self.motion_compressor = MotionCompressionAutoencoder()
        self.motion_compensator = MotionCompensationNetwork()
        self.residual_compressor = ResidualCompressionAutoencoder()

    def call(self, current_frame, ref_frame):
        # Step 1: optical flow between the current frame and the reference frame.
        flow = self.motion_estimator(current_frame, ref_frame)

        # Step 2: push the flow through the motion autoencoder (no quantization).
        decoded_flow, motion_latent = self.motion_compressor(flow)

        # Step 3: placeholder "warp". This only resizes the reference frame to
        # its own spatial size and is NOT real motion warping; a dense image
        # warp driven by the decoded flow belongs here.
        frame_size = tf.shape(ref_frame)[1:3]
        warped_ref = tf.image.resize(ref_frame, frame_size)

        # Step 4: refine the prediction from reference, warped frame and flow.
        prediction = self.motion_compensator(ref_frame, warped_ref, decoded_flow)

        # Steps 5-6: code what the prediction missed.
        decoded_residual, residual_latent = self.residual_compressor(
            current_frame - prediction
        )

        # Step 7: final reconstruction plus everything a training loss may need.
        return {
            'reconstructed_frame': prediction + decoded_residual,
            'estimated_flow': flow,
            'compressed_flow': decoded_flow,
            'motion_latent': motion_latent,
            'residual_latent': residual_latent
        }

# -------------------------------
# Loss and Training Step (Simplified)
# -------------------------------
def compute_loss(current_frame, reconstructed_frame, motion_latent, residual_latent, lambda_rd=256):
    """Simplified rate-distortion objective.

    The loss is ``lambda_rd * distortion + rate``, where the distortion is the
    mean squared error between the current and the reconstructed frame, and the
    rate is a placeholder: the sum of the mean absolute values of the two latents.

    Args:
        current_frame: Frame being coded.
        reconstructed_frame: Reconstruction of that frame.
        motion_latent: Latent of the motion autoencoder.
        residual_latent: Latent of the residual autoencoder.
        lambda_rd: Weight applied to the distortion term.

    Returns:
        The scalar loss value.
    """
    def mean_abs(latent):
        # L1 magnitude used as a stand-in for a bit-rate estimate.
        return tf.reduce_mean(tf.abs(latent))

    squared_error = tf.square(current_frame - reconstructed_frame)
    distortion = tf.reduce_mean(squared_error)

    rate_loss = mean_abs(motion_latent) + mean_abs(residual_latent)

    return lambda_rd * distortion + rate_loss

@tf.function
def train_step(model, optimizer, current_frame, ref_frame, lambda_rd=256):
    """Run one optimisation step and return the loss it was computed from.

    Args:
        model: Model called as ``model(current_frame, ref_frame)``; its output
            must provide the keys 'reconstructed_frame', 'motion_latent' and
            'residual_latent'.
        optimizer: Optimizer whose ``apply_gradients`` updates the model.
        current_frame: Frame to be coded.
        ref_frame: Reference frame.
        lambda_rd: Distortion weight forwarded to ``compute_loss``.

    Returns:
        The loss value for this step.
    """
    with tf.GradientTape() as grad_tape:
        prediction = model(current_frame, ref_frame)
        step_loss = compute_loss(
            current_frame,
            prediction['reconstructed_frame'],
            prediction['motion_latent'],
            prediction['residual_latent'],
            lambda_rd,
        )
    # Differentiate w.r.t. every trainable variable and apply the update.
    variables = model.trainable_variables
    grads = grad_tape.gradient(step_loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return step_loss

# -------------------------------
# Example Usage
# -------------------------------
# Smoke test: builds the full model, runs a single training step on random
# tensors and prints the resulting loss. No real video data is involved.
# The recorded output for this cell is a ModuleNotFoundError for numpy, i.e.
# the cell did not run to completion in that session.
if __name__ == "__main__":
    # Create the model and optimizer
    model = OpenDVCModel()
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    
    # Dummy input: batch of 4 frames (height=256, width=256, channels=3)
    # NOTE(review): both autoencoders downsample with four stride-2 layers;
    # 256 is divisible by 16 — presumably other sizes need padding, confirm.
    current_frame = tf.random.normal([4, 256, 256, 3])
    ref_frame = tf.random.normal([4, 256, 256, 3])
    
    # One training step (for demonstration; training loops need more bells and whistles)
    loss = train_step(model, optimizer, current_frame, ref_frame)
    print("Training loss:", loss.numpy())


ModuleNotFoundError: No module named 'numpy'