## Transnet

**installations**

In [9]:
!pip install ffmpeg
!pip install ffmpeg-python pillow



**packages**

In [3]:
import sys
import argparse
import os

import h5py
import numpy as np
import pandas as pd

import tensorflow as tf
import ffmpeg


#### )) Paths

In [1]:
datasets_path='../../data'
public_dataset_path=datasets_path+'/Public datasets'
custom_data = datasets_path+'/Custom data'

tvsum_data = public_dataset_path+'/ydata-tvsum50-v1_1'
summe_data = public_dataset_path+'/SUMMe'

result_path= 'transnet_segments/'

#### )) Transnet

In [18]:
class TransNetV2:
    

    def __init__(self):
        model_dir = "transnetv2-weights/"
        self._input_size = (27, 48, 3)
        try:
            self._model = tf.saved_model.load(model_dir)
        except OSError as exc:
            raise IOError(f"[TransNetV2] It seems that files in {model_dir} are corrupted or missing. "
                          f"Re-download them manually and retry. For more info, see: "
                          f"https://github.com/soCzech/TransNetV2/issues/1#issuecomment-647357796") from exc

    def predict_raw(self, frames: np.ndarray):
        assert len(frames.shape) == 5 and frames.shape[2:] == self._input_size, \
            "[TransNetV2] Input shape must be [batch, frames, height, width, 3]."
        frames = tf.cast(frames, tf.float32)
        logits, dict_ = self._model(frames)
        single_frame_pred = tf.sigmoid(logits)
        # all_frames_pred = tf.sigmoid(dict_["many_hot"])
        
        return single_frame_pred
    # , all_frames_pred

    def predict_frames(self, frames: np.ndarray):
        assert len(frames.shape) == 4 and frames.shape[1:] == self._input_size, "[TransNetV2] Input shape must be [frames, height, width, 3]."

        def input_iterator():
            # return windows of size 100 where the first/last 25 frames are from the previous/next batch
            # the first and last window must be padded by copies of the first and last frame of the video
            no_padded_frames_start = 25
            no_padded_frames_end = 25 + 50 - (len(frames) % 50 if len(frames) % 50 != 0 else 50)  # 25 - 74

            start_frame = np.expand_dims(frames[0], 0)
            end_frame = np.expand_dims(frames[-1], 0)
            padded_inputs = np.concatenate(
                [start_frame] * no_padded_frames_start + [frames] + [end_frame] * no_padded_frames_end, 0
            )

            ptr = 0
            while ptr + 100 <= len(padded_inputs):
                out = padded_inputs[ptr:ptr + 100]
                ptr += 50
                yield out[np.newaxis]

        predictions = []

        for inp in input_iterator():
            # single_frame_pred, all_frames_pred = self.predict_raw(inp)
            single_frame_pred = self.predict_raw(inp)
            predictions.append((single_frame_pred.numpy()[0, 25:75, 0]))

            print("\r[TransNetV2] Processing video frames {}/{}".format(
                min(len(predictions) * 50, len(frames)), len(frames)
            ), end="")
        print("")

        single_frame_pred = np.concatenate([single_ for single_ in predictions])
        # all_frames_pred = np.concatenate([all_ for single_, all_ in predictions])

        # return single_frame_pred[:len(frames)], all_frames_pred[:len(frames)]  # remove extra padded frames
        return single_frame_pred[:len(frames)]

    def predict_video(self, video_fn: str):

        print("[TransNetV2] Extracting frames from {}".format(video_fn))
        video_stream, err = ffmpeg.input(video_fn).output(
            "pipe:", format="rawvideo", pix_fmt="rgb24", s="48x27"
        ).run(capture_stdout=True, capture_stderr=True)

        video = np.frombuffer(video_stream, np.uint8).reshape([-1, 27, 48, 3])
        return self.predict_frames(video)

    @staticmethod
    def predictions_to_scenes(predictions: np.ndarray, threshold: float = 0.5):
        predictions = (predictions > threshold).astype(np.uint8)

        scenes = []
        t, t_prev, start = -1, 0, 0
        for i, t in enumerate(predictions):
            if t_prev == 1 and t == 0:
                start = i
            if t_prev == 0 and t == 1 and i != 0:
                scenes.append([start, i])
            t_prev = t
        if t == 0:
            scenes.append([start, i])

        # just fix if all predictions are 1
        if len(scenes) == 0:
            return np.array([[0, len(predictions) - 1]], dtype=np.int32)

        return np.array(scenes, dtype=np.int32)


def main(args, mappings=None):
    model = TransNetV2()
    video_folder = args['videoFolder_path']+'/video/'
    files = os.listdir(video_folder)
    c=0
    
    
    with h5py.File(args['cpsH5_path'],'a') as d:
        for file in files:
            if mappings!=None:
                 name = 'video_'+str(mappings.index(file.split('.')[0])+1)
            else:
                name = 'video_'+str(c)
                
            if name in d:
                c+=1
                continue
                
            # video_frames, 
            single_frame_predictions= model.predict_video(video_folder+file)
            scenes = model.predictions_to_scenes(single_frame_predictions)

            c=c+1
            print(str(c)+'. '+file+' as '+name+'; no.of shots='+str(len(scenes)))
            d.create_dataset(name+'/change_points', data=scenes )
            d.create_dataset(name+'/video_name', data=file )
        





#### )) TVSum segmentations

In [13]:
## Arguments
args={
    'videoFolder_path': tvsum_data,
    'cpsH5_path': result_path+'/tvsumSegs.h5',    
}

In [14]:
## video name mappings
tvsum_info=pd.read_csv(tvsum_data+'/data/ydata-tvsum50-info.tsv',sep='\t')
vnames = tvsum_info['video_id'].tolist()

In [19]:
## generate segments
main(args, vnames)

[TransNetV2] Extracting frames from ../../data/Public datasets/ydata-tvsum50-v1_1/video/Bhxk-O1Y7Ho.mp4
[TransNetV2] Processing video frames 13511/13511
11. Bhxk-O1Y7Ho.mp4 as video_12; no.of shots=43
[TransNetV2] Extracting frames from ../../data/Public datasets/ydata-tvsum50-v1_1/video/byxOvuiIJV0.mp4
[TransNetV2] Processing video frames 3705/3705
12. byxOvuiIJV0.mp4 as video_34; no.of shots=53
[TransNetV2] Extracting frames from ../../data/Public datasets/ydata-tvsum50-v1_1/video/cjibtmSLxQ4.mp4
[TransNetV2] Processing video frames 19406/19406
13. cjibtmSLxQ4.mp4 as video_21; no.of shots=157
[TransNetV2] Extracting frames from ../../data/Public datasets/ydata-tvsum50-v1_1/video/E11zDS9XGzg.mp4
[TransNetV2] Processing video frames 15307/15307
14. E11zDS9XGzg.mp4 as video_46; no.of shots=13
[TransNetV2] Extracting frames from ../../data/Public datasets/ydata-tvsum50-v1_1/video/EE-bNr36nyA.mp4
[TransNetV2] Processing video frames 2941/2941
15. EE-bNr36nyA.mp4 as video_38; no.of shots=1