In [2]:
import os
from tqdm import tqdm
import torch
import cv2
import numpy as np
from PIL import Image
from torchvision import transforms
import csv
from time import time

### Get frames

In [None]:
# Root folder of the locally stored dataset; process_dir() reads <data_dir>/<split>
# and writes <data_dir>/<split>_frames next to it.
# NOTE(review): hard-coded absolute Windows path — adjust for your machine before running.
data_dir = r"D:\VGAF_dataset"
print(data_dir)

In [None]:
def process_dir(dirname):
    """Dump the frames of every video in ``<data_dir>/<dirname>`` as PNG files.

    Each entry of the split folder is either a video file or a sub-folder
    holding the video; for a sub-folder the last file returned by
    ``os.listdir`` is used. Frames are written by ffmpeg to
    ``<data_dir>/<dirname>_frames/<video stem>/%05d.png``.

    Args:
        dirname: name of the split folder inside the global ``data_dir``
            (the notebook calls this with 'Train' and 'Val').

    Raises:
        FileNotFoundError: if the ``ffmpeg`` executable is not on PATH.
    """
    import subprocess  # local import so this cell does not depend on the import cell

    src_dir = os.path.normpath(os.path.join(data_dir, dirname))
    frames_root = os.path.normpath(os.path.join(data_dir, dirname + '_frames'))

    for entry in tqdm(os.listdir(src_dir)):
        video_path = os.path.join(src_dir, entry)
        if os.path.isdir(video_path):
            nested_files = os.listdir(video_path)
            if not nested_files:
                # Empty sub-folder: nothing to extract.
                continue
            video_path = os.path.join(video_path, nested_files[-1])

        stem, _ = os.path.splitext(os.path.basename(video_path))
        outdir = os.path.join(frames_root, stem)
        os.makedirs(outdir, exist_ok=True)

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact; the options are the same as before.
        command = [
            'ffmpeg',
            '-r', '1',
            '-i', video_path,
            '-r', '1',
            os.path.join(outdir, '%05d.png'),
        ]
        result = subprocess.run(command, check=False)
        if result.returncode != 0:
            # Keep going like the original did, but make the failure visible.
            print('ffmpeg failed with code', result.returncode, 'for', video_path)

In [None]:
# Extract frames for the training split: reads <data_dir>/Train, writes <data_dir>/Train_frames.
process_dir('Train')

In [None]:
# Extract frames for the validation split: reads <data_dir>/Val, writes <data_dir>/Val_frames.
process_dir('Val')

### Detect faces in frames

In [None]:
print(f"Torch: {torch.__version__}")
# device = 'cuda:0'
# The device for this stage is fixed to the CPU; `use_cuda` below is only
# reported and does not influence `device`.
device = 'cpu'
use_cuda = torch.cuda.is_available()
print(use_cuda)

In [None]:
from facenet_pytorch import MTCNN # pretrained MTCNN face detector
# keep_all=True returns every detected face instead of a single one.
# NOTE(review): `mtcnn` is not referenced by any other cell visible in this
# notebook — save_faces() detects faces through `imgProcessing` instead.
mtcnn = MTCNN(keep_all=True, min_face_size=40, device=device)

In [None]:
# Detector used by save_faces(), which calls imgProcessing.detect_faces(frame).
# `facial_analysis` is a project-local module; the meaning of the positional
# False and of minsize=64 is defined there — presumably minsize is the
# smallest face size in pixels, TODO confirm.
from facial_analysis import FacialImageProcessing
imgProcessing=FacialImageProcessing(False,minsize=64)

In [None]:
# Multiplier applied to detected box coordinates before cropping. It stays at 1
# because frames are passed to the detector at full resolution.
scale=1
def save_faces(source_path,save_path):
    """Crop every detected face out of every frame and store it as an image.

    Layout read:    ``<source_path>/<video>/<frame>.<ext>``
    Layout written: ``<save_path>/<video>/<frame>/<face index>.<ext>``

    A ``<save_path>/<video>`` folder is created for every video folder, even
    when no face is found. Crops that already exist on disk are left untouched,
    so an interrupted run can be resumed.

    Args:
        source_path: folder holding one sub-folder of frames per video.
        save_path: output root; created (with parents) when missing.
    """
    os.makedirs(save_path, exist_ok=True)
    for folder in tqdm(os.listdir(source_path)):
        frames_dir = os.path.join(source_path, folder)
        if not os.path.isdir(frames_dir):
            # Stray files next to the video folders would crash os.listdir below.
            continue
        os.makedirs(os.path.join(save_path, folder), exist_ok=True)

        for image in os.listdir(frames_dir):
            filename = os.path.join(frames_dir, image)
            frame_bgr = cv2.imread(filename)
            if frame_bgr is None:
                # cv2.imread signals an unreadable/non-image file with None.
                print('Cannot read image ', filename)
                continue
            # The detector is given RGB input; crops are taken from the BGR
            # frame because cv2.imwrite expects BGR.
            frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            bounding_boxes, _ = imgProcessing.detect_faces(frame)
            if len(bounding_boxes)==0:
                continue

            root,ext=os.path.splitext(image)
            faces_folder=os.path.join(save_path, folder, root)
            os.makedirs(faces_folder, exist_ok=True)
            for i,bounding_box in enumerate(bounding_boxes):
                outfile=os.path.join(faces_folder, str(i)+ext)
                if os.path.exists(outfile):
                    continue
                # Scale into a new list rather than mutating the detector
                # output in place; negative coordinates are clamped to 0.
                b=[max(0,int(bi*scale)) for bi in bounding_box]
                x1,y1,x2,y2=b[0:4]
                face_img=frame_bgr[y1:y2,x1:x2,:]

                if face_img.size==0:
                    print('Empty face ',b,' found for ',filename)
                    continue
                cv2.imwrite(outfile, face_img)
        
        
# Crop faces for both splits: reads <data_dir>/<split>_frames (produced by
# process_dir) and writes <data_dir>/<split>_faces.
save_faces(os.path.join(data_dir,'Val_frames'),os.path.join(data_dir,'Val_faces'))
save_faces(os.path.join(data_dir,'Train_frames'),os.path.join(data_dir,'Train_faces'))

### Extract features from frames

In [3]:
# Rebinds data_dir: from here on the notebook reads the already extracted faces
# and the label files from this relative input folder (presumably a Kaggle
# dataset mount — TODO confirm) instead of the local path used earlier.
data_dir = r"../input/vgaf-dataset"

In [4]:
# Class names keyed by the 1-based label ids of the label files
# (create_dataset subtracts 1 to obtain 0-based targets).
# NOTE(review): not referenced by any other cell visible in this notebook.
idx_to_class = {1: 'Positive', 2: 'Neutral', 3: 'Negative'}

In [5]:
!pip install timm==0.4.5

Collecting timm==0.4.5
  Downloading timm-0.4.5-py3-none-any.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.4/287.4 kB[0m [31m874.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: timm
Successfully installed timm-0.4.5
[0m

In [6]:
# Side length (pixels) of the square input fed to the network.
IMG_SIZE=224
# Checkpoint holding the whole pickled model object (not just a state dict).
PATH='../input/enet-b2/enet_b2_8.pt'

# Steps shared by both pipelines: resize, convert to a CHW float tensor and
# normalise with the standard ImageNet channel mean/std.
_tensor_steps = [
    transforms.Resize((IMG_SIZE,IMG_SIZE)),
    #transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]

# Pipeline for PIL images.
test_transforms = transforms.Compose(_tensor_steps)
# Same pipeline for numpy arrays / tensors: converts to a PIL image first.
np_transforms = transforms.Compose([transforms.ToPILImage(None)] + _tensor_steps)

# SECURITY: torch.load unpickles arbitrary Python objects — only load
# checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only
# host; the model is moved to the target device in a later cell.
feature_extractor_model = torch.load(PATH, map_location='cpu')

In [None]:
# Snapshot of the final classification layer (weight matrix and bias) as numpy arrays.
# NOTE: must run BEFORE the cell that replaces `.classifier` with
# torch.nn.Identity(); after that the layer no longer has weight/bias.
classifier_weights=feature_extractor_model.classifier.weight.cpu().data.numpy()
classifier_bias=feature_extractor_model.classifier.bias.cpu().data.numpy()
print(classifier_weights.shape,classifier_weights)
print(classifier_bias.shape,classifier_bias)

In [7]:
# Use the first GPU when one is available; otherwise fall back to the CPU
# instead of failing in .to(device).
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Replace the classification head with Identity so that a forward pass returns
# whatever was fed into the head, i.e. the feature embedding.
feature_extractor_model.classifier=torch.nn.Identity()
feature_extractor_model.to(device)
# Inference mode for dropout/batch-norm; as the last expression this also
# displays the model structure.
feature_extractor_model.eval()

EfficientNet(
  (conv_stem): Conv2dSame(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  (act1): SiLU(inplace=True)
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act1): SiLU(inplace=True)
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
        )
        (conv_pw): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act2): Identity()
      )
      (1): DepthwiseSeparableConv(


In [9]:
def get_features_scores(data_dir):
    """Extract per-frame face embeddings for every video below ``data_dir``.

    Expected layout: ``<data_dir>/<video>/<frame>/<face image>``.
    When ``data_dir`` contains a sub-folder with its own name (for example
    ``.../Val_faces/Val_faces``), that nested folder is used as the root.
    This replaces the former hard-coded remapping to the
    ``../input/vgaf-dataset`` paths and resolves to the same folders there.

    Uses the globals ``test_transforms``, ``feature_extractor_model`` and
    ``device``.

    Args:
        data_dir: root folder of the cropped faces of one split.

    Returns:
        dict mapping video name -> list with one numpy array per frame that
        contains at least one face; each array stacks the model outputs for
        the faces of that frame (presumably (num_faces, feature_dim) —
        TODO confirm against the model). Videos without faces map to [].
    """
    nested_dir = os.path.join(data_dir, os.path.basename(os.path.normpath(data_dir)))
    if os.path.isdir(nested_dir):
        data_dir = nested_dir

    videoname2features={}
    for videoname in tqdm(os.listdir(data_dir)): # e.g. 2_1
        frames_dir=os.path.join(data_dir,videoname)
        if not os.path.isdir(frames_dir):
            continue
        X_global_features = [] # features of each frame
        for filename in sorted(os.listdir(frames_dir)): # e.g. 00001 - directory with the faces of one frame
            faces_dir=os.path.join(frames_dir,filename)
            if not os.path.isdir(faces_dir):
                continue

            imgs=[] # face tensors of a single frame
            for img_name in sorted(os.listdir(faces_dir)): # one face
                # The context manager closes the file handle; convert('RGB')
                # guarantees the 3 channels Normalize expects.
                with Image.open(os.path.join(faces_dir,img_name)) as img:
                    imgs.append(test_transforms(img.convert('RGB')))

            if imgs: # the frame has faces
                # Stack the per-face (3, H, W) tensors into one batch.
                stacked_images = torch.stack(imgs, dim=0).to(device)
                # Inference only: skip autograd bookkeeping to save memory and time.
                with torch.no_grad():
                    features = feature_extractor_model(stacked_images)
                X_global_features.append(features.cpu().numpy())

        videoname2features[videoname] = X_global_features
    return videoname2features

# Extract embeddings for the validation faces (about an hour per the recorded run).
video2Allfeatures_val=get_features_scores(os.path.join(data_dir,'Val_faces'))
# NOTE(review): the training extraction is commented out, yet later cells read
# `video2Allfeatures_train` — re-enable it (or load cached features) before
# running those cells on a fresh kernel.
#video2Allfeatures_train=get_features_scores(os.path.join(data_dir,'Train_faces'))

100%|██████████| 741/741 [1:00:28<00:00,  4.90s/it]


In [None]:
# Sanity check: shape of the feature array of the first frame of video '2_1'.
# Requires `video2Allfeatures_train` to have been computed.
video2Allfeatures_train['2_1'][0].shape

In [14]:
import pickle
# Output file name for the cached validation features: <model_name>_vgaf_val.pickle
model_name = 'enet_b2_8'
MODEL2EMOTIW_FEATURES=model_name+'_vgaf_val.pickle' 

print(MODEL2EMOTIW_FEATURES)


enet_b2_8_vgaf_val.pickle


In [15]:
# Cache the validation features (dict: video name -> list of per-frame arrays)
# in the working directory so the slow extraction need not be repeated.
with open(MODEL2EMOTIW_FEATURES, 'wb') as handle:
    pickle.dump(video2Allfeatures_val, handle, protocol=pickle.HIGHEST_PROTOCOL)
#print(len(video2Allfeatures_train),len(video2Allfeatures_val))

In [None]:
def create_dataset(videoname2features,labelsfile):
    """Turn per-frame face features into one fixed-length descriptor per video.

    For every frame the face features are reduced to ``[mean, std]`` over the
    faces; the resulting frame descriptors are reduced to ``[mean, std]`` over
    the frames. With D-dimensional face features the video descriptor
    therefore has 4*D values.

    Args:
        videoname2features: dict mapping video name -> list of per-frame
            arrays, each stacking the features of the faces in that frame
            (as returned by get_features_scores).
        labelsfile: space-delimited text file; the first line is a header and
            every other line is ``<videoname> <label>`` with 1-based labels.

    Returns:
        (x, y, has_faces): ``x`` holds one descriptor per labelled video
        (all zeros for videos without any face), ``y`` the 0-based labels and
        ``has_faces`` a 0/1 flag per video.

    Raises:
        KeyError: if a labelled video is missing from ``videoname2features``.
    """
    def _mean_std(block):
        # Concatenate the mean and std taken over the first axis.
        block = np.asarray(block)
        return np.concatenate((np.mean(block, axis=0), np.std(block, axis=0)), axis=None)

    descriptors = []  # one 1-D array per video, or None when it has no faces
    labels = []
    with open(labelsfile, mode='r') as csvfile:
        labels_reader = csv.reader(csvfile, delimiter=' ')
        next(labels_reader, None)  # skip the header line
        for row in labels_reader:
            if not row:
                # Blank lines (e.g. a trailing newline) carry no label.
                continue
            videoname,label=row[0],int(row[1]) # e.g. '2_1', 2
            # cur_features: features of all faces of one frame
            frame_descriptors = [_mean_std(cur_features)
                                 for cur_features in videoname2features[videoname]]
            descriptors.append(_mean_std(frame_descriptors) if frame_descriptors else None)
            labels.append(label-1)

    # The zero placeholder needs the descriptor size, which is only known once
    # some video with faces has been seen — so it is filled in after the loop.
    # (Previously a face-less FIRST video raised on the still-undefined `feature`.)
    template = next((d for d in descriptors if d is not None), None)
    if template is None:
        placeholder = np.zeros(0, dtype=np.float32)
    else:
        placeholder = np.zeros_like(template)

    x=np.array([placeholder if d is None else d for d in descriptors])
    y=np.array(labels)
    has_faces=np.array([0 if d is None else 1 for d in descriptors])
    print(x.shape,y.shape)
    return x,y,has_faces

# Build the video-level train/test matrices. Both label files live in data_dir
# (the test call used an undefined `DATA_DIR`, which raised NameError).
# Requires video2Allfeatures_train and video2Allfeatures_val to be available.
x_train, y_train, has_faces_train = create_dataset(video2Allfeatures_train,os.path.join(data_dir,'Train_labels.txt'))
x_test, y_test, has_faces_test = create_dataset(video2Allfeatures_val,os.path.join(data_dir,'Val_labels.txt'))

In [None]:
# Sanity check: length of a single video descriptor.
x_train[0].shape

In [None]:
from sklearn import svm,metrics,preprocessing

# L2-normalise every video descriptor (row-wise) before the linear classifier.
# Uses x_train / x_test as produced by create_dataset; the previous
# x_train_enet / x_test_enet names are not defined anywhere in this notebook.
x_train_norm=preprocessing.normalize(x_train,norm='l2')
x_test_norm=preprocessing.normalize(x_test,norm='l2')

In [None]:
svc_clf = svm.LinearSVC(C=1.1) #0.5 1.1 0.6
# Alternatives tried earlier (kept for reference):
#clf = svm.SVC(C=10.0, gamma=1.0, kernel='rbf')
#clf=RandomForestClassifier(n_estimators=1000,max_depth=7, n_jobs=-1)
#clf=KNeighborsClassifier(n_neighbors=3,p=2)
#clf = xgb.XGBClassifier(n_estimators=1000,use_label_encoder=False)

# Train only on videos in which at least one face was found; the all-zero
# placeholder rows of face-less videos carry no information.
# Uses y_train / y_test as produced by create_dataset; the previous
# y_train_enet / y_test_enet names are not defined anywhere in this notebook.
train_mask = has_faces_train==1
svc_clf.fit(x_train_norm[train_mask], y_train[train_mask])
y_pred = svc_clf.predict(x_test_norm)

# "Accuracy" covers test videos with faces only; "Complete accuracy" covers
# every test video, including those predicted from a zero descriptor.
test_mask = has_faces_test==1
print("Accuracy:",metrics.accuracy_score(y_test[test_mask], y_pred[test_mask]))
print("Complete accuracy:",metrics.accuracy_score(y_test, y_pred))