In [1]:
import os
import cv2
import torch
import torchvision
from torchvision import transforms
from PIL import Image
import pickle

from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
import time
import csv
from torch import nn



  warn(f"Failed to load image Python extension: {e}")


In [36]:
class ResNet101(nn.Module):
    """ResNet-101 backbone with a 10-unit bottleneck and a 2-way training head.

    The backbone's ``fc`` layer is replaced by a 2048 -> 10 linear layer, and a
    separate 10 -> 2 layer (``self.linear``) sits on top of it. Every backbone
    parameter tensor except the last 5 (in ``parameters()`` order) is frozen.

    Args:
        pretrained: forwarded to ``torchvision.models.resnet101``.
    """

    def __init__(self, pretrained=False):
        super().__init__()

        backbone = torchvision.models.resnet101(pretrained=pretrained)
        backbone.fc = nn.Linear(2048, 10)
        self.model = backbone
        self.linear = nn.Linear(10, 2)

        # Freeze everything but the trailing 5 parameter tensors of the backbone.
        backbone_params = list(self.model.parameters())
        for frozen in backbone_params[:-5]:
            frozen.requires_grad = False

    def forward(self, X):
        """Return 2 outputs in training mode, the 10 bottleneck outputs otherwise."""
        features = self.model(X)
        # The 10 -> 2 head is applied only while training; in eval mode the
        # caller receives the raw output of the replaced ``fc`` layer.
        return self.linear(features) if self.training else features



In [37]:
class ResNet152(nn.Module):
    """ResNet-152 backbone with a 10-unit bottleneck and a 2-way training head.

    The backbone's ``fc`` layer is replaced by a 2048 -> 10 linear layer, and a
    separate 10 -> 2 layer (``self.linear``) sits on top of it. Every backbone
    parameter tensor except the last 10 (in ``parameters()`` order) is frozen.

    Args:
        pretrained: forwarded to ``torchvision.models.resnet152``.
    """

    def __init__(self, pretrained=False):
        super().__init__()

        backbone = torchvision.models.resnet152(pretrained=pretrained)
        backbone.fc = nn.Linear(2048, 10)
        self.model = backbone
        self.linear = nn.Linear(10, 2)

        # Freeze everything but the trailing 10 parameter tensors of the backbone.
        backbone_params = list(self.model.parameters())
        for frozen in backbone_params[:-10]:
            frozen.requires_grad = False

    def forward(self, X):
        """Return 2 outputs in training mode, the 10 bottleneck outputs otherwise."""
        features = self.model(X)
        # The 10 -> 2 head is applied only while training; in eval mode the
        # caller receives the raw output of the replaced ``fc`` layer.
        return self.linear(features) if self.training else features

In [38]:
class DenseNet(nn.Module):
    """DenseNet-201 backbone with a 10-unit bottleneck and a 2-way training head.

    The backbone's ``classifier`` layer is replaced by a 1920 -> 10 linear
    layer, and a separate 10 -> 2 layer (``self.linear``) sits on top of it.
    Every backbone parameter tensor except the last 5 (in ``parameters()``
    order) is frozen.

    Args:
        pretrained: forwarded to ``torchvision.models.densenet201``.
    """

    def __init__(self, pretrained=False):
        super().__init__()

        backbone = torchvision.models.densenet201(pretrained=pretrained)
        backbone.classifier = nn.Linear(1920, 10)
        self.model = backbone
        self.linear = nn.Linear(10, 2)

        # Freeze everything but the trailing 5 parameter tensors of the backbone.
        backbone_params = list(self.model.parameters())
        for frozen in backbone_params[:-5]:
            frozen.requires_grad = False

    def forward(self, X):
        """Return 2 outputs in training mode, the 10 bottleneck outputs otherwise."""
        features = self.model(X)
        # The 10 -> 2 head is applied only while training; in eval mode the
        # caller receives the raw output of the replaced ``classifier`` layer.
        return self.linear(features) if self.training else features

In [39]:
# Per-frame preprocessing applied before a frame is fed to the CNNs:
# array -> PIL image -> resize(350) -> tensor -> per-channel normalisation.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize(350),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

In [40]:
# Filesystem layout used by the rest of the notebook.
MODEL_FOLDER = './models/stacking/'   # trained CNN weights and stacked classifiers
DATA_FOLDER = './data/6/'             # folder holding the input video and the output file

# Both the input video and the predictions file live in DATA_FOLDER.
OUTPUT_PATH = os.path.join(DATA_FOLDER, 'predictions.txt')
VIDEO_PATH = os.path.join(DATA_FOLDER, 'video.mp4')

In [41]:
def _load_cnn(model_class, weights_file):
    """Instantiate ``model_class``, load its weights from MODEL_FOLDER, return it in eval mode."""
    model = model_class()
    # get_prediction converts model outputs with .numpy(), i.e. inference runs
    # on CPU. map_location='cpu' lets checkpoints that were saved from a GPU
    # load on a machine without CUDA.
    state_dict = torch.load(os.path.join(MODEL_FOLDER, weights_file), map_location='cpu')
    model.load_state_dict(state_dict)
    return model.eval()


def _load_pickle(file_name):
    """Unpickle one object from MODEL_FOLDER, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code contained in the file.
    # Only use this with model files that come from a trusted source.
    with open(os.path.join(MODEL_FOLDER, file_name), 'rb') as handle:
        return pickle.load(handle)


# CNN feature extractors. resnet152 is loaded here, but its use inside
# get_prediction is currently commented out.
resnet152 = _load_cnn(ResNet152, 'ResNet152.pt')
resnet101 = _load_cnn(ResNet101, 'ResNet101.pt')
densenet = _load_cnn(DenseNet, 'DenseNet.pt')

# Second-stage classifiers and the feature scaler applied to the CNN outputs.
boosting = CatBoostClassifier().load_model(os.path.join(MODEL_FOLDER, 'boosting.model'))
random_forest = _load_pickle('random_forest.pkl')
svm = _load_pickle('svm.pkl')
scaler = _load_pickle('scaler.pkl')
# extra_tree = pickle.load(open(os.path.join(MODEL_FOLDER, 'extra_tree.pkl'), 'rb'))

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [42]:
def get_prediction(frame):
    """Classify a single video frame with the stacked ensemble.

    The frame is preprocessed with ``transform``, passed through ``resnet101``
    and ``densenet`` (both return their 10-value bottleneck output in eval
    mode), and each feature vector is scaled with ``scaler``. Every
    second-stage classifier (``boosting``, ``random_forest``, ``svm``) then
    scores both feature vectors; the six probability arrays are averaged,
    multiplied by per-class weights and renormalised to sum to 1.

    Args:
        frame: image array as returned by ``cv2.VideoCapture.read``.

    Returns:
        Array of weighted, renormalised class probabilities, one row for the
        single input frame and one column per class.
    """
    # Class 1 is up-weighted by 20% relative to class 0.
    WEIGHTS = np.array([1, 1.2])

    # NOTE(review): OpenCV frames are BGR while ToPILImage/Normalize in
    # `transform` follow the RGB convention. Confirm the training pipeline fed
    # frames the same way before changing anything here.
    batch = transform(frame)[None,]

    # Pure inference: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        feature_sets = (
            scaler.transform(resnet101(batch).numpy()),
            # resnet152 is intentionally left out of the ensemble.
            scaler.transform(densenet(batch).numpy()),
        )

    probabilities = [
        classifier.predict_proba(features)
        for classifier in (boosting, random_forest, svm)
        for features in feature_sets
    ]

    # Average over however many (classifier, feature set) pairs were scored.
    # The divisor has no effect on the result because of the renormalisation
    # below, but it keeps the intermediate value a true mean.
    predictions = sum(probabilities) / len(probabilities)
    predictions = predictions * WEIGHTS
    predictions /= predictions.sum()
    return predictions

In [43]:
# Sample the video every SECONDS_PER_FRAME seconds, classify each sampled
# frame, and merge runs of identical predictions into
# [start_second, end_second, class] segments.
video_capture = cv2.VideoCapture(VIDEO_PATH)
assert video_capture.isOpened()

SECONDS_PER_FRAME = 1

previous_class = None
current_start = None
segments = []

try:
    FPS = int(video_capture.get(cv2.CAP_PROP_FPS))
    # The backend reports 0 for properties it cannot provide; fail with a
    # clear message instead of a bare ZeroDivisionError on the next line.
    assert FPS > 0, "Could not determine the video frame rate"
    DURATION = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT) / FPS)

    print("FPS: ", FPS)
    print("Duration: ", DURATION)

    for current_video_position in range(0, DURATION + SECONDS_PER_FRAME, SECONDS_PER_FRAME):
        # Seek by timestamp (milliseconds) and decode the frame at that point.
        video_capture.set(cv2.CAP_PROP_POS_MSEC, current_video_position * 1000)
        ret, frame = video_capture.read()
        if not ret:
            # No frame at this position (e.g. past the end of the stream).
            break

        probas = get_prediction(frame)
        predicted_class = np.argmax(probas)

        if previous_class is None:
            # First decoded frame opens the first segment.
            current_start = current_video_position
            previous_class = predicted_class
        if predicted_class != previous_class:
            # Close the running segment one second before the change and open
            # the next one at that same second (segments share the boundary).
            # NOTE(review): the offset is hard-coded to 1 and only equals the
            # previous sampled position while SECONDS_PER_FRAME == 1.
            segments.append([current_start, current_video_position - 1, previous_class])
            current_start = current_video_position - 1
            previous_class = predicted_class

        print('Predicted class at second', current_video_position, 'is', predicted_class, 'with probas', probas)
finally:
    # Release the capture even if prediction fails part-way through the video.
    video_capture.release()
    cv2.destroyAllWindows()

if previous_class is None:
    # Nothing was decoded: do not emit a [None, 0, None] segment that would
    # end up in the predictions file.
    raise RuntimeError('No frames could be read from ' + VIDEO_PATH)

# Close the last open segment at the final loop position.
segments.append([current_start, current_video_position, previous_class])

FPS:  30
Duration:  120
Predicted class at second 0 is 0 with probas [[0.70781026 0.29218974]]
Predicted class at second 1 is 0 with probas [[0.699358 0.300642]]
Predicted class at second 2 is 0 with probas [[0.71843811 0.28156189]]
Predicted class at second 3 is 0 with probas [[0.73157397 0.26842603]]
Predicted class at second 4 is 0 with probas [[0.68514336 0.31485664]]
Predicted class at second 5 is 0 with probas [[0.71148229 0.28851771]]
Predicted class at second 6 is 0 with probas [[0.72873458 0.27126542]]
Predicted class at second 7 is 0 with probas [[0.72266496 0.27733504]]
Predicted class at second 8 is 0 with probas [[0.74454016 0.25545984]]
Predicted class at second 9 is 0 with probas [[0.71847136 0.28152864]]
Predicted class at second 10 is 1 with probas [[0.19587264 0.80412736]]
Predicted class at second 11 is 1 with probas [[0.1872219 0.8127781]]
Predicted class at second 12 is 1 with probas [[0.1785638 0.8214362]]
Predicted class at second 13 is 1 with probas [[0.19101834

In [44]:
# One line per segment: "<start> <end> <class>", space-separated, no header.
# The start column is used as the index so no extra row-number column is written.
segments_table = pd.DataFrame(segments).set_index(0)
segments_table.to_csv(OUTPUT_PATH, sep=' ', header=False)

In [45]:
# Display the path the predictions file was written to.
OUTPUT_PATH

'./data/6/predictions.txt'