## HMT Training

**Packages**

In [1]:
import json
import math
import os
import os.path as osp
import random

import h5py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from ortools.algorithms import pywrapknapsack_solver
from torch.autograd import Variable

import import_ipynb
from Model import HMT

importing Jupyter notebook from Model.ipynb


### Util modules

In [2]:
def mkdir_if_missing(directory):
    """Create ``directory`` (including missing parents) unless it already exists.

    Args:
        directory: path of the directory to create. An empty path (e.g. the
            ``dirname`` of a bare file name) is ignored because there is
            nothing to create.
    """
    if not directory or osp.exists(directory):
        return
    # exist_ok=True tolerates the directory being created by someone else
    # between the existence check above and this call.
    os.makedirs(directory, exist_ok=True)

def write_json(obj, fpath):
    """Dump ``obj`` as indented JSON to ``fpath``.

    The parent directory of ``fpath`` is created first via
    ``mkdir_if_missing``.
    """
    parent_dir = osp.dirname(fpath)
    mkdir_if_missing(parent_dir)
    with open(fpath, 'w') as out_file:
        json.dump(obj, out_file, indent=4, separators=(',', ': '))


In [3]:
def split_random(keys, num_videos, num_train):
    """Randomly partition ``keys`` into a train list and a test list.

    ``num_train`` distinct indices are drawn from ``range(num_videos)``;
    keys at those positions go to the train list, all others to the test
    list. The original order of ``keys`` is kept inside each list.

    Returns:
        Tuple ``(train_keys, test_keys)``.
    """
    rnd_idxs = np.random.choice(range(num_videos), size=num_train, replace=False)
    chosen = set(rnd_idxs.tolist())

    train_keys, test_keys = [], []
    for position, key in enumerate(keys):
        bucket = train_keys if position in chosen else test_keys
        bucket.append(key)

    assert not set(train_keys).intersection(test_keys), "Error: train_keys and test_keys overlap"

    return train_keys, test_keys

In [4]:
def parse_splits_filename(splits_filename):
    """Derive dataset name/type from a splits file name and load its splits.

    The base name (without extension) is split on ``'_'``: the first field is
    the dataset name (e.g. ``tvsum``) and the second the dataset type (e.g.
    ``aug``). The keyword ``splits`` terminates the fields for historical
    reasons, so ``tvsum_splits.json`` has an empty dataset type. A base name
    with no underscore also yields an empty dataset type.

    Args:
        splits_filename: path to the JSON file holding the splits.

    Returns:
        Tuple ``(dataset_name, dataset_type, splits)`` where ``splits`` is the
        decoded JSON content of the file.
    """
    # Use the already-imported ``osp`` alias; the module ``os`` itself is not
    # guaranteed to be in scope here.
    _, sfname = osp.split(splits_filename)
    sfname, _ = osp.splitext(sfname)
    fields = sfname.split('_')
    dataset_name = fields[0]
    dataset_type = fields[1] if len(fields) > 1 else ''

    if dataset_type == 'splits':
        # Split type is not present in the file name.
        dataset_type = ''

    with open(splits_filename, 'r') as sf:
        splits = json.load(sf)

    return dataset_name, dataset_type, splits


In [5]:
def get_frame_probs(shot_probs, cps, n_frames):
    """Expand per-shot probabilities into a per-frame probability vector.

    Args:
        shot_probs: one probability per shot; indexed by shot number.
        cps: change points; row ``i`` holds the inclusive ``(first, last)``
            frame indices of shot ``i``.
        n_frames: length of the returned vector.

    Returns:
        A float32 tensor of length ``n_frames``. Every frame inside a shot
        carries that shot's probability; frames not covered by any shot stay
        at 0. When ``shot_probs`` is a tensor the result lives on the same
        device.

    Raises:
        ValueError: if the number of shot probabilities differs from the
            number of change-point rows.
    """
    if len(shot_probs) != len(cps):
        # Fail loudly: silently returning None only postpones the crash to
        # the loss computation, far from the actual cause.
        raise ValueError(
            'no. of shots does not match: {} shot probabilities vs {} change points'.format(
                len(shot_probs), len(cps)))

    device = shot_probs.device if isinstance(shot_probs, torch.Tensor) else None
    frame_probs = torch.zeros(int(n_frames), dtype=torch.float32, device=device)
    for seg_idx, (first, last) in enumerate(cps):
        frame_probs[int(first):int(last) + 1] = shot_probs[seg_idx]

    return frame_probs

In [6]:
def weights_init(m):
    """Initialise a ``Linear`` module in place.

    Weights get Xavier-uniform values with gain ``sqrt(2)``; a bias, when
    present, is set to the constant 0.1. Modules whose class name is not
    exactly ``'Linear'`` are left untouched.
    """
    if type(m).__name__ != 'Linear':
        return
    init.xavier_uniform_(m.weight, gain=np.sqrt(2.0))
    bias = m.bias
    if bias is not None:
        init.constant_(bias, 0.1)

In [7]:
def knapsack_ortools(values, weights, items, capacity ):
    """Select items with the OR-Tools dynamic-programming knapsack solver.

    Args:
        values: per-item values; multiplied by 1000 and truncated to int32
            before being handed to the solver.
        weights: per-item weights; cast to int32.
        items: unused by this implementation.
        capacity: knapsack capacity, passed to the solver unchanged.

    Returns:
        List of indices of the items contained in the solver's best solution.
    """
    value_scale = 1000
    int_values = (np.array(values) * value_scale).astype(np.int32)
    int_weights = np.array(weights).astype(np.int32)

    solver = pywrapknapsack_solver.KnapsackSolver(
        pywrapknapsack_solver.KnapsackSolver.KNAPSACK_DYNAMIC_PROGRAMMING_SOLVER, 'test')
    solver.Init(int_values.tolist(), [int_weights.tolist()], [capacity])
    solver.Solve()  # the optimal value itself is not needed, only the selection

    return [idx for idx in range(0, len(int_weights)) if solver.BestSolutionContains(idx)]

In [8]:
def generate_summary(ypred, cps, n_frames, nfps, positions, proportion=0.15, method='knapsack'):
    """Build a keyshot-based video summary as a binary float32 vector.

    Args:
        ypred: predicted importance scores, one per subsampled position.
        cps: change points, 2D matrix; each row holds the first and last
            (inclusive) frame of a segment.
        n_frames: number of frames in the original video.
        nfps: number of frames per segment.
        positions: positions of the subsampled frames in the original video.
        proportion: summary length budget as a fraction of ``n_frames``.
        method: how segments are selected, ``'knapsack'`` or ``'rank'``.

    Returns:
        1-D float32 array with one entry per frame of every segment: 1 for
        frames of selected segments, 0 otherwise.

    Raises:
        KeyError: if ``method`` is neither ``'knapsack'`` nor ``'rank'``.
    """
    n_segs = cps.shape[0]

    # Spread the subsampled scores over the original frame axis.
    if positions.dtype != int:
        positions = positions.astype(np.int32)
    if positions[-1] != n_frames:
        positions = np.concatenate([positions, [n_frames]])
    frame_scores = np.zeros((n_frames), dtype=np.float32)
    n_pred = len(ypred)
    for i, (pos_left, pos_right) in enumerate(zip(positions[:-1], positions[1:])):
        # The interval just past the last prediction has no score.
        frame_scores[pos_left:pos_right] = 0 if i == n_pred else ypred[i]

    # Score of a segment = mean frame score over its inclusive frame range.
    seg_score = [
        float(frame_scores[int(cps[seg_idx, 0]):int(cps[seg_idx, 1] + 1)].mean())
        for seg_idx in range(n_segs)
    ]

    limits = int(math.floor(n_frames * proportion))

    if method == 'rank':
        # Greedy: best-scored segments first, while strictly under the budget.
        picks, total_len = [], 0
        for i in np.argsort(seg_score)[::-1].tolist():
            if total_len + nfps[i] < limits:
                picks.append(i)
                total_len += nfps[i]
    elif method == 'knapsack':
        picks = knapsack_ortools(seg_score, nfps, n_segs, limits)
    else:
        raise KeyError("Unknown method {}".format(method))

    # One run of ones (picked) or zeros (not picked) per segment; the empty
    # leading chunk keeps the result float32 even when there are no segments.
    chunks = [np.zeros((0,), dtype=np.float32)]
    for seg_idx in range(n_segs):
        fill = np.ones if seg_idx in picks else np.zeros
        chunks.append(fill((nfps[seg_idx]), dtype=np.float32))
    return np.concatenate(chunks)


def evaluate_summary(machine_summary, user_summary, eval_metric='avg'):
    """Compare a machine summary with user summaries (keyshot-based).

    Args:
        machine_summary: 1-D ndarray; any value > 0 marks a selected frame.
        user_summary: 2-D ndarray of shape ``(n_users, n_frames)``; any value
            > 0 marks a selected frame.
        eval_metric: ``'avg'`` averages the per-user results, ``'max'`` takes
            the result of the user with the best F-score.

    Returns:
        Tuple ``(f_score, precision, recall)``.

    Raises:
        KeyError: if ``eval_metric`` is neither ``'avg'`` nor ``'max'``.
    """
    # Validate up front; otherwise an unknown metric only surfaces as an
    # UnboundLocalError at the return statement.
    if eval_metric not in ('avg', 'max'):
        raise KeyError("Unknown eval_metric {}".format(eval_metric))

    # astype() copies, so the binarization below never touches caller data.
    machine_summary = machine_summary.astype(np.float32)
    user_summary = user_summary.astype(np.float32)
    n_users, n_frames = user_summary.shape

    # binarization
    machine_summary[machine_summary > 0] = 1
    user_summary[user_summary > 0] = 1

    # Align the machine summary with the user summaries: truncate or zero-pad.
    if len(machine_summary) > n_frames:
        machine_summary = machine_summary[:n_frames]
    elif len(machine_summary) < n_frames:
        zero_padding = np.zeros((n_frames - len(machine_summary)))
        machine_summary = np.concatenate([machine_summary, zero_padding])

    f_scores = []
    prec_arr = []
    rec_arr = []

    for user_idx in range(n_users):
        gt_summary = user_summary[user_idx, :]
        overlap_duration = (machine_summary * gt_summary).sum()
        # The small epsilon guards against division by zero for empty summaries.
        precision = overlap_duration / (machine_summary.sum() + 1e-8)
        recall = overlap_duration / (gt_summary.sum() + 1e-8)
        if precision == 0 and recall == 0:
            f_score = 0.
        else:
            f_score = (2 * precision * recall) / (precision + recall)
        f_scores.append(f_score)
        prec_arr.append(precision)
        rec_arr.append(recall)

    if eval_metric == 'avg':
        final_f_score = np.mean(f_scores)
        final_prec = np.mean(prec_arr)
        final_rec = np.mean(rec_arr)
    else:  # 'max'
        final_f_score = np.max(f_scores)
        max_idx = np.argmax(f_scores)
        final_prec = prec_arr[max_idx]
        final_rec = rec_arr[max_idx]

    return final_f_score, final_prec, final_rec


### Training Hyper Parameters

In [9]:
class HParameters:
    """Training configuration: caller-supplied options plus fixed defaults."""

    def __init__(self, args):
        """Populate the configuration from the mapping ``args``.

        Every key read below except ``'model_path'`` is required;
        ``model_path`` falls back to ``None`` when absent.
        """
        # Run-time options supplied by the caller.
        for option in ('verbose', 'use_cuda', 'cuda_device', 'max_summary_length'):
            setattr(self, option, args[option])

        # Fixed optimisation settings.
        self.l2_req = 0.00001
        self.lr_epochs = [0]
        self.lr = [0.00005]
        self.epochs_max = 300
        self.train_batch_size = 1

        # Dataset and split options supplied by the caller.
        for option in ('dataset', 'results_path', 'num_splits', 'split_file', 'train_percent'):
            setattr(self, option, args[option])

        self.model_path = args['model_path'] if 'model_path' in args else None

    def create_split(self):
        """Draw ``num_splits`` random train/test splits and save them as JSON.

        The video keys are read from the HDF5 file at ``self.dataset``; the
        resulting list of splits is written to ``self.split_file``.
        """
        print("Loading dataset from {}".format(self.dataset))

        with h5py.File(self.dataset, 'r') as dataset:
            keys = dataset.keys()
            num_videos = len(keys)
            num_train = int(math.ceil(num_videos * self.train_percent))
            num_test = num_videos - num_train

            print("Split breakdown: # total videos {}. # train videos {}. # test videos {}".format(num_videos, num_train, num_test))

            splits = []
            for _ in range(self.num_splits):
                train_keys, test_keys = split_random(keys, num_videos, num_train)
                splits.append(dict(train_keys=train_keys, test_keys=test_keys))

            write_json(splits, self.split_file)
            print("Splits saved to {}".format(self.split_file))

    def __str__(self):
        """List every public, non-callable attribute as ``[i] name: value``."""
        public_names = [
            name for name in dir(self)
            if not name.startswith("_") and not callable(getattr(self, name))
        ]

        lines = []
        for i, name in enumerate(public_names):
            val = getattr(self, name)
            if isinstance(val, Variable):
                val = val.data.cpu().numpy().tolist()[0]
            lines.append(f"[{i}] {name}: {val!s}\n")

        return ''.join(lines)
    
    
#     def load_from_args(self, args):
#         for key in args:
#             val = args[key]
#             if val is not None:
#                 if hasattr(self, key) and isinstance(getattr(self, key), list):
#                     val = val.split()

#                 setattr(self, key, val)

#     def get_dataset_by_name(self, dataset_name):
#         for d in self.datasets:
#             if dataset_name in d:
#                 return [d]
#         return None

    


### Trainer

In [10]:
class Trainer:
    """Trains an HMT model on the splits referenced by the hyper-parameters.

    Typical use: ``Trainer(hps).run()`` followed by ``save_model(...)``.
    """

    def __init__(self, hps: HParameters):
        self.hps = hps
        self.model = HMT()
        self.verbose = True
        self.criterion = nn.MSELoss()
        self.show_every = 1  # print the epoch loss every `show_every` epochs
        self.optimizer = None  # created in run()

    def _device(self):
        """Return the device tensors are moved to, based on ``hps.use_cuda``."""
        return torch.device('cuda' if self.hps.use_cuda else 'cpu')

    def init_model(self):
        """Load weights from ``hps.model_path`` if set, else apply ``weights_init``."""
        if self.hps.model_path:
            # map_location keeps the loaded tensors on CPU storage.
            self.model.load_state_dict(torch.load(self.hps.model_path, map_location=lambda storage, loc: storage))
        else:
            self.model.eval()
            self.model.apply(weights_init)

    def train(self, train_keys):
        """Run one optimisation pass over ``train_keys``, one video per step.

        Args:
            train_keys: keys of the training videos inside the dataset file.

        Returns:
            Mean of the per-video losses.
        """
        device = self._device()
        losses = []
        for key in train_keys:
            # Read from the configured dataset (not a hard-coded path), read-only.
            with h5py.File(self.hps.dataset, 'r') as d:
                vid_feats = d[key]['features'][...]
                aud_feats = d[key]['aud_feats'][...]
                boundaries = d[key]['change_points'][...]
                n_frames = d[key]['n_frames'][()]
                target = d[key]['gt_probs'][...]

            vid_feats = torch.from_numpy(vid_feats).unsqueeze(0)
            aud_feats = torch.from_numpy(aud_feats).unsqueeze(0)
            target = torch.Tensor(target)

            if self.hps.use_cuda:
                vid_feats = vid_feats.float().to(device)
                aud_feats = aud_feats.float().to(device)
                target = target.float().to(device)

            print('Video key:', key, 'video and audio feat shape:', vid_feats.shape, aud_feats.shape)
            P = self.model(vid_feats, aud_feats, boundaries)
            P = P.reshape(-1)
            P_frames = get_frame_probs(P, boundaries, n_frames)
            # Prediction and target must share a device for the loss.
            P_frames = P_frames.to(target.device)

            loss = self.criterion(P_frames, target)
            print('Loss:', loss)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            losses.append(float(loss))

        return np.mean(np.array(losses))

    def video_fscore(self, machine_summary_activations, test_keys, metric='tvsum', att_vecs=None):
        """Compute the mean F-score of the predicted summaries on ``test_keys``.

        Args:
            machine_summary_activations: mapping from video key to predicted scores.
            test_keys: keys of the videos to evaluate.
            metric: ``'tvsum'`` averages over user summaries; any other value
                takes the maximum.
            att_vecs: unused.

        Returns:
            Tuple ``(mean_fm, video_scores)`` where ``video_scores`` holds one
            ``[index, key, formatted F-score]`` row per video.
        """
        eval_metric = 'avg' if metric == 'tvsum' else 'max'

        fms = []
        video_scores = []
        for key_idx, key in enumerate(test_keys):
            probs = machine_summary_activations[key]

            with h5py.File(self.hps.dataset, 'r') as d:
                cps = d[key]['change_points'][...]
                num_frames = d[key]['n_frames'][()]
                nfps = d[key]['n_frame_per_seg'][...].tolist()
                positions = d[key]['picks'][...]
                user_summary = d[key]['user_summary'][...]

            machine_summary = generate_summary(probs, cps, num_frames, nfps, positions)
            fm, _, _ = evaluate_summary(machine_summary, user_summary, eval_metric)
            fms.append(fm)

            # Reporting & logging
            video_scores.append([key_idx + 1, key, "{:.1%}".format(fm)])

        mean_fm = np.mean(fms)

        return mean_fm, video_scores

    def validate(self, test_keys):
        """Evaluate the model on ``test_keys`` and return ``(f_score, video_scores)``.

        NOTE(review): the model is called here as ``model(seq, seq_len)`` and
        is expected to return ``(y, att_vec)``, whereas ``train`` calls it as
        ``model(vid_feats, aud_feats, boundaries)`` and uses a single return
        value. This method looks stale and is not called from ``run``;
        confirm against the HMT definition before enabling validation.
        """
        self.model.eval()
        summary = {}
        att_vecs = {}
        with torch.no_grad():
            for key in test_keys:
                with h5py.File(self.hps.dataset, 'r') as d:
                    seq = d[key]['features'][...]

                seq = torch.from_numpy(seq).unsqueeze(0)

                if self.hps.use_cuda:
                    seq = seq.float().cuda()

                y, att_vec = self.model(seq, seq.shape[1])
                summary[key] = y[0].detach().cpu().numpy()
                att_vecs[key] = att_vec.detach().cpu().numpy()

        f_score, video_scores = self.video_fscore(summary, test_keys, att_vecs=att_vecs)
        return f_score, video_scores

    def run(self):
        """Initialise model and optimizer, then train on every split.

        Splits are read from ``self.hps.split_file``; each split is trained
        for ``self.hps.epochs_max`` epochs with the training keys reshuffled
        at the start of every epoch.
        """
        print("Initializing HMT model and optimizer...")
        self.init_model()

        if self.hps.use_cuda:
            # Move the model before building the optimizer so the optimizer
            # holds references to the parameters actually being trained.
            self.model = self.model.cuda()
            self.criterion = self.criterion.cuda()
        self.model.train()

        parameters = filter(lambda p: p.requires_grad, self.model.parameters())
        self.optimizer = torch.optim.Adam(parameters, lr=self.hps.lr[0], weight_decay=self.hps.l2_req)

        # Read the splits from this trainer's own configuration (not a
        # notebook-level global) and close the file once loaded.
        with open(self.hps.split_file) as f:
            splits = json.load(f)

        print("Starting training...")
        # NOTE(review): model and optimizer are shared across splits, so each
        # split continues from the weights of the previous one — confirm this
        # is intended before reporting per-split results.
        for split in splits:
            train_keys = split['train_keys']

            epoch_losses = []
            for epoch in range(self.hps.epochs_max):
                print("Epoch: {0:6}".format(str(epoch)+"/"+str(self.hps.epochs_max)), end='')
                self.model.train()

                random.shuffle(train_keys)
                loss = self.train(train_keys)
                epoch_losses.append(loss)

                # TODO: validation on split['test_keys'] is disabled; see validate().

                if epoch % self.show_every == 0:
                    print(f'Epoch:{epoch}, Loss:{loss}')

            print("   Train loss: {0:.05f}".format(np.mean(np.array(epoch_losses))))

        return

    def save_model(self, name='HMT', epoch=None, splitn=None):
        """Save the model weights under the ``models`` directory.

        Args:
            name: file name prefix.
            epoch: optional epoch number appended to the file name.
            splitn: optional split identifier appended to the file name.

        The file is written to ``models/<name>[_<epoch>][_<splitn>].pth.tar``;
        the ``models`` directory is created when missing.
        """
        parts = [str(name)]
        parts.extend(str(part) for part in (epoch, splitn) if part is not None)
        filename = '_'.join(parts) + '.pth.tar'
        os.makedirs('models', exist_ok=True)
        torch.save(self.model.state_dict(), osp.join('models', filename))
        

**Train**

In [11]:
# Options consumed by HParameters: output/split locations, dataset file,
# train fraction and run-time flags.
args = dict(
    results_path='training_results.txt',
    num_splits=5,
    split_file='splits/test_split1.json',
    dataset='../../Preprocessing/extracted_features/normal/TVSum.h5',
    train_percent=0.8,
    verbose=True,
    use_cuda=False,
    cuda_device=None,
    max_summary_length=0.15,
)

In [12]:
# Seed every RNG used below (split sampling uses numpy, key shuffling uses
# `random`, weight initialisation uses torch) so a fresh
# Restart-and-Run-All reproduces the same splits and training run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

hps = HParameters(args)
hps.create_split()

Loading dataset from ../../Preprocessing/extracted_features/normal/TVSum.h5
Split breakdown: # total videos 50. # train videos 40. # test videos 10
Splits saved to splits/test_split1.json


In [13]:
# Build the trainer from the hyper-parameters and train on every split
# stored in the split file.
trainer = Trainer(hps)
trainer.run()

Initializing HMT model and optimizer...
Starting training...
Epoch: 0/300 Video key: video_35 video and audio feat shape: torch.Size([1, 297, 1024]) torch.Size([1, 297, 128])
Loss: tensor(0.0001, grad_fn=<MseLossBackward0>)
Video key: video_6 video and audio feat shape: torch.Size([1, 644, 1024]) torch.Size([1, 644, 128])
Loss: tensor(1.6509e-05, grad_fn=<MseLossBackward0>)
Video key: video_3 video and audio feat shape: torch.Size([1, 934, 1024]) torch.Size([1, 934, 128])
Loss: tensor(1.0195e-05, grad_fn=<MseLossBackward0>)
Video key: video_31 video and audio feat shape: torch.Size([1, 360, 1024]) torch.Size([1, 360, 128])
Loss: tensor(0.0001, grad_fn=<MseLossBackward0>)
Video key: video_36 video and audio feat shape: torch.Size([1, 530, 1024]) torch.Size([1, 530, 128])
Loss: tensor(3.6565e-05, grad_fn=<MseLossBackward0>)
Video key: video_22 video and audio feat shape: torch.Size([1, 377, 1024]) torch.Size([1, 377, 128])
Loss: tensor(5.2293e-05, grad_fn=<MseLossBackward0>)
Video key: v

KeyboardInterrupt: 

In [None]:
# save_model needs a file name prefix; calling it without one raises TypeError.
trainer.save_model('HMT')

---------------------------