# Milestone 3 - training the classifier

## Features for Training Set

**Objective**

Using the feature extraction from the previous chapter, compute features for all videos in the training set and save them in HDF5 files. Make sure the features for genuine and Deepfake videos are saved in separate folders, so you can know if the feature comes from original genuine video or Deepfake.



**Workflow**

Now you can compute a feature vector for a single image, which is the cropped face from a video frame.
The goal now is, for each video and for each frame of the video, to detect the face in the video, compute the features for that face, and save the resulted feature on disk in HDF5 file. You should have **one HDF5 file for each video**. The **file will contain a matrix with the number of rows equal to the number of frames in that video and the number of columns equal to the number of features you compute for a single face**.
The **HDF5 files should be saved in the same directory structure that the video database has, but instead of videos you will have HDF5 files with features**.
To loop through the videos inside a directory, you can use standard python routines for recursively traversing the directory.



To fulfill above task first recursive function of this project can be updated with feature calculation and HDF5 file saving.

In [1]:
import glob
import cv2
import os
import numpy as np
from matplotlib import pyplot as plt
from skimage.metrics import structural_similarity as ssim
import imutils
import cv2
import dlib
from imutils.face_utils import FaceAligner
import math
import h5py
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
def hell(hist_1, hist_2):
    return cv2.compareHist(hist_1, hist_2, cv2.HISTCMP_BHATTACHARYYA)

In [3]:
def cv_hist(frame, bins_no = 100):
    return cv2.calcHist([frame], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])

In [4]:
def np_hist(image, num_bins = 100):
    hist, bins = np.histogram(image.ravel(), num_bins, [0,256], density=True)
    return hist

In [5]:
def chi_sqr(hist_1, hist_2):
    return cv2.compareHist(hist_1, hist_2, cv2.HISTCMP_CHISQR)

In [6]:
def template(image_1, image_2):
    image_2 = cv2.cvtColor(image_2, cv2.COLOR_BGR2GRAY) 
    res = cv2.matchTemplate(image_2, cv2.cvtColor(image_1, cv2.COLOR_BGR2GRAY),
                                cv2.TM_CCOEFF_NORMED)   # match operation
    return res

In [7]:
def mse(image_1, image_2):
    err = np.sum((image_1.astype("float") - image_2.astype("float"))**2)
    err /= float(image_1.shape[0]*image_2.shape[1])
    return err

In [8]:
def blurred_frame(image, kernel_size = 3, sigma_x = 5, sigma_y = 0.5):
    return cv2.GaussianBlur(image, (kernel_size, kernel_size), sigma_x, sigma_y)

In [9]:
def psnr(image_1, image_2, mse): 
    if(mse == 0): 
        return 100
    max_pixel = 255.0
    psnr = 20 * math.log10(max_pixel / math.sqrt(mse)) 
    return psnr 

In [10]:
def feat_calc(image):
    blurred_image = blurred_frame(image);
    hist_0 = np_hist(image)
    hist_1 = cv_hist(image);
    hist_2 = cv_hist(blurred_image);
    chi = chi_sqr(hist_1, hist_2);
    hellinger = hell(hist_1, hist_2);
    err = mse(image, blurred_image);
    peak = psnr(image, blurred_image, err);
    template_found = template(image, blurred_image);
    (score, diff) = ssim(image, blurred_image, full = True, multichannel=True);
    return np.concatenate([[score], [peak], [err], [hellinger], [chi], hist_0])

In [11]:
store_feat = []
f = 1
classifier = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')   # load the pre-trained model
pat = os.getcwd()
path = pat    # making global copy because glob was changing the path in jupyther (no such problems stationary)
# normalisation of faces dimensions - declaration of normalisators
predictor = dlib.shape_predictor(path + '/shape_predictor_68_face_landmarks.dat')
fa = FaceAligner(predictor, desiredFaceWidth = 200)

def getting_features(pat):
    ''' function iterates recurively through direcory files and
    saves the frames of encountered videos to three folders
    1 - oryginal videos, 2 - lower quality deep fakes, 
    3 - higher quality deep fakes '''

    for filename in glob.iglob(pat+'/*',
                        recursive = True): 		# 1 (from Workflow list)
        if 'avi' in filename and 'mgwt0' in filename:
            cap = cv2.VideoCapture(filename)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            print(filename, total_frames)
            for fno in range(0, total_frames):
                cap.set(cv2.CAP_PROP_POS_FRAMES, fno)
                image, frame = cap.read()
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                global classifier
                global predictor
                global fa
                bboxes = classifier.detectMultiScale(frame)
                for box in bboxes:
                    # extract
                    x, y, w, h = box
                    rect = dlib.rectangle(x, y, w + x, h + y)   # normalisation of each frame
                    cropped_img = fa.align(frame, gray, rect)
                    feat = feat_calc(cropped_img)
                    global store_feat
                    store_feat.append(feat)
            # converse to numpy array with n rows
            features_save = np.stack(store_feat)
            store_feat = []
            with h5py.File(filename.replace('avi', 'h5'), 'w') as hf: 
                Xset = hf.create_dataset(name = 'features', data = features_save)
            getting_features(filename)			# recursion
        else:
            getting_features(filename)			# recursion if folder is not the desired one

In [12]:
getting_features(pat)

/home/ricz/Desktop/Manning_books/Deep_fake/VidTIMIT/VidTIMIT/mgwt0/sx369.avi 83


KeyboardInterrupt: 

   Few Numpy methods of creating matrix from list of arrays with time changes connected with its usage:
1. concatenate: 21 s ± 416 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2. array: 20.8 s ± 652 ms per loop (mean ± std. dev. of 7 runs, 1 loop each
3. vstack: 20.1 s ± 627 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
4. stack: 19.7 s ± 677 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Last method, stack, was choosen because of its extraordinary good time of computation. Saved files are also experimental. They consist the template matching data wchich is not as straightforward and takes some computation time. Their efficiency will be compared to classical set of data later.

## Train SVM Classifier

**Objective**

Using scikit-learn train SVM classifier on the features. When reading the features from the saved HDF5 files, you need to also construct a vector with labels that has **0 label for each Deepfake features and 1 label for each genuine feature**. You can use linear SVM and play with different parameters of this classifier and study their impact on the results.



Workflow

1. **Split the set of videos into two sets**: training and testing. There are different ways to do it but the split of **80% of data for training and 20% for testing** is the common one. You can use train_test_split() function from sklearn.model_selection.
2. Be careful how you split the list of videos into 80% for training and 20% for testing. You need to make sure that 80% of Deepfake videos are inside the training set and 80% of original videos are also inside the training set. Also, you need to split the videos, not their features (you have many features vectors for each video); when you evaluate later, you will need to compute one prediction score per test video, which means all features from that videos must be inside the test set. **You must always evaluate your trained classification model on the features that you did not use for training**.
3. In a loop through all original and deepfake videos (use Python’s Glob to loop through folders), for each video compute features for all frames (loop through frames with OpenCV) in the video and save the features in HDF5 files. One HDF5 files should correspond to one video and should contain the feature matrix of N x M, where N is the number of frames in that video and M is the number of features you computed for on frame, so each row is a feature vector for one frame of the video.
4. Once all features are computed, **focus on the training set of videos**. Loop through the stored HDF5 files (use the same Glob library) of the training set, read HDF5 files and **combine all the features vectors in one numpy array, where rows are feature vectors from all videos. In the same time, create a separate array of integer labels, which would have 0 label for the feature vector corresponding to Deepfake frame and label 1 corresponding to original frame.** In the end, you should have two arrays: 1) array of features extracted from all frames of all videos and 2) array of labels of the same length, where you store which feature is from fake video and which is from the original video.
5. Train SVM classifier of scikit-learn on the features and labels from the training set. This trained classifier will be used in the next milestone.



In [2]:
f = 1
pat = os.getcwd()    # getting the full path to actual folder
path = pat           # copy of path because in upyther recursion changes the oryginal
ct = 0
origin_folders = []
fake_folders = []
def getting_data(pat, sample_rate, count):
    ''' function iterates recurively through direcory files and
    saves the frames of encountered videos to three folders
    1 - oryginal videos, 2 - lower quality deep fakes, 
    3 - higher quality deep fakes'''
    for filename in glob.iglob(pat+'/*',
                        recursive = True): 		# 1 (from Workflow list)
        if 'avi' in filename:					# 2 iteration over every frame in repository as desired in 2. of work flow
            global ct
            global common_folders
            global fake_folders
            if not pat[-5:] in origin_folders:
                origin_folders.append(pat[-5:])
            if 'higher' in filename:
                if not pat[-5:] in fake_folders:
                    fake_folders.append(pat[-5:])
            ct += 1
            getting_data(filename, 1, count + 1)
        else:
            getting_data(filename, 1, 0)			# recursion if folder is not the desired one
getting_data(path, 1, 0)  

print('\nnumber of videos in data directory: ', ct)
print('\noryginal folders and their number: \n', origin_folders, len(origin_folders))
print('\nHQ fake folders and their number: \n', fake_folders, len(fake_folders))
list_1 = set(origin_folders)
intersect = list_1.intersection(fake_folders)
print('\ncommon part of both sets: \n', list(intersect), len(intersect))


number of videos in data directory:  1070

oryginal folders and their number: 
 ['fadg0', 'faks0', 'fcmh0', 'mdld0', 'mtas1', 'mrgg0', 'mjar0', 'fcmr0', 'mdbb0', 'mtmr0', 'mdab0', 'mccs0', 'mstk0', 'msjs1', 'fedw0', 'mrcz0', 'fram1', 'fdrd1', 'fdms0', 'mpgl0', 'mgwt0', 'fcft0', 'fdac1', 'mmdb1', 'mrjo0', 'fjwb0', 'fjas0', 'fkms0', 'mpdf0', 'mabw0', 'fjem0', 'mwbt0', 'mbdg0', 'mjsw0', 'fcrh0', 'mbjk0', 'mreb0', 'fpkt0', 'fgjd0', 'felc0', 'fjre0', 'mcem0', 'mmdm2'] 43

HQ fake folders and their number: 
 ['fadg0', 'faks0', 'fcmh0', 'mdld0', 'mrgg0', 'mjar0', 'mdbb0', 'mdab0', 'mccs0', 'mstk0', 'msjs1', 'fedw0', 'mrcz0', 'fram1', 'fdrd1', 'mpgl0', 'mgwt0', 'fcft0', 'fdac1', 'mmdb1', 'mrjo0', 'fjwb0', 'fjas0', 'fkms0', 'mpdf0', 'fjem0', 'mwbt0', 'mjsw0', 'felc0', 'fjre0', 'mcem0', 'mmdm2'] 32

common part of both sets: 
 ['fjre0', 'mmdb1', 'mccs0', 'fjas0', 'fedw0', 'mwbt0', 'mmdm2', 'felc0', 'mrcz0', 'mcem0', 'mdbb0', 'fcft0', 'faks0', 'mpdf0', 'fdac1', 'mdld0', 'fjwb0', 'mpgl0', 'fdrd1'

Above script is a helper that once more checks the form of data directory. In oryginal videos are 110 videos without deepfake child. Intersect list provides the common folders of deep fake and oryginal videos.

In [3]:
pat = os.getcwd()    # getting the full path to actual folder
path = pat           # copy of path because in upyther recursion changes the oryginal
ct = 0
origin_folders = []
fake_folders = []
def getting_data(pat, sample_rate, count):
    ''' function iterates recurively through direcory files and
    saves the frames of encountered videos to three folders
    1 - oryginal videos, 2 - lower quality deep fakes, 
    3 - higher quality deep fakes'''
    global ct
    global common_folders
    global fake_folders
    for filename in glob.iglob(pat+'/*',
                        recursive = True): 		# 1 (from Workflow list)
        if 'avi' in filename:					# 2 iteration over every frame in repository as desired in 2. of work flow
            if not pat[-5:] in origin_folders:
                origin_folders.append(pat[-5:])
            if 'higher' in filename:
                if not pat[-5:] in fake_folders:
                    fake_folders.append(pat[-5:])
            ct += 1
            getting_data(filename, 1, count + 1)
        else:
            getting_data(filename, 1, 0)			# recursion if folder is not the desired one
getting_data(path, 1, 0)  

print('\nnumber of videos in data directory: ', ct)
print('\noryginal folders and their number: \n', origin_folders, len(origin_folders))
print('\nHQ fake folders and their number: \n', fake_folders, len(fake_folders))
list_1 = set(origin_folders)
intersect = list_1.intersection(fake_folders)
print('\ncommon part of both sets: \n', list(intersect), len(intersect))


number of videos in data directory:  1070

oryginal folders and their number: 
 ['fadg0', 'faks0', 'fcmh0', 'mdld0', 'mtas1', 'mrgg0', 'mjar0', 'fcmr0', 'mdbb0', 'mtmr0', 'mdab0', 'mccs0', 'mstk0', 'msjs1', 'fedw0', 'mrcz0', 'fram1', 'fdrd1', 'fdms0', 'mpgl0', 'mgwt0', 'fcft0', 'fdac1', 'mmdb1', 'mrjo0', 'fjwb0', 'fjas0', 'fkms0', 'mpdf0', 'mabw0', 'fjem0', 'mwbt0', 'mbdg0', 'mjsw0', 'fcrh0', 'mbjk0', 'mreb0', 'fpkt0', 'fgjd0', 'felc0', 'fjre0', 'mcem0', 'mmdm2'] 43

HQ fake folders and their number: 
 ['fadg0', 'faks0', 'fcmh0', 'mdld0', 'mrgg0', 'mjar0', 'mdbb0', 'mdab0', 'mccs0', 'mstk0', 'msjs1', 'fedw0', 'mrcz0', 'fram1', 'fdrd1', 'mpgl0', 'mgwt0', 'fcft0', 'fdac1', 'mmdb1', 'mrjo0', 'fjwb0', 'fjas0', 'fkms0', 'mpdf0', 'fjem0', 'mwbt0', 'mjsw0', 'felc0', 'fjre0', 'mcem0', 'mmdm2'] 32

common part of both sets: 
 ['fjre0', 'mmdb1', 'mccs0', 'fjas0', 'fedw0', 'mwbt0', 'mmdm2', 'felc0', 'mrcz0', 'mcem0', 'mdbb0', 'fcft0', 'faks0', 'mpdf0', 'fdac1', 'mdld0', 'fjwb0', 'mpgl0', 'fdrd1'

In [4]:
pat = os.getcwd()    # getting the full path to actual folder
path = pat           # copy of path because in upyther recursion changes the oryginal
ct = 1
training_videos = []
eval_videos = []


def getting_data(pat):
    global ct
    global training_videos
    global eval_videos
    for filename in glob.iglob(pat+'/*',
                        recursive = True): 		# 1 (from Workflow list)
        if 'h5' in filename:					# 2 iteration over every frame in repository as desired in 2. of work flow
            if any(name in filename for name in intersect):
                if ct <= 8:
                    ct += 1
                    training_videos.append(filename)
                elif ct > 8 and ct < 10:
                    ct += 1
                    eval_videos.append(filename)
                elif ct == 10:
                    ct = 1
                    eval_videos.append(filename)
            getting_data(filename)
        else:
            getting_data(filename)			# recursion if folder is not the desired one
getting_data(path) 

In [5]:
all_features = []
labels = []
ct = 0
ct_1 = 0
for i in training_videos:
    hf = h5py.File(i, 'r')
    data = hf.get('features')[()]
    all_features.append(data)
    (x, y) = data.shape
    if 'Deepfake' in i:
        ct += 1
        for j in range(0, x):
            labels.append(0)
    else:
        ct_1 += 1
        for j in range(0, x):
            labels.append(1)
all_features = np.concatenate(all_features)
print('number of frames and length of label vector: ', len(labels))
print('shape of combined h5 files: ', all_features.shape)
print('number of fake/oryginal videos: ', ct, ct_1)

number of frames and length of label vector:  86078
shape of combined h5 files:  (86078, 105)
number of fake/oryginal videos:  512 256


In [6]:
test_features = []
test_labels = []
ct = 0
ct_1 = 0
for i in eval_videos:
    hf = h5py.File(i, 'r')
    data = hf.get('features')[()]
    test_features.append(data)
    (x, y) = data.shape
    if 'Deepfake' in i:
        ct += 1
        for j in range(0, x):
            test_labels.append(0)
    else:
        ct_1 += 1
        for j in range(0, x):
            test_labels.append(1)
test_features = np.concatenate(test_features)
print('number of frames and length of label vector: ', len(test_labels))
print('shape of combined h5 files: ', test_features.shape)
print('number of fake/oryginal videos: ', ct, ct_1)

number of frames and length of label vector:  20411
shape of combined h5 files:  (20411, 105)
number of fake/oryginal videos:  128 64


In [None]:
# 1st method
X = all_features
y = np.array(labels)
clf = svm.SVC()
clf.fit(X, y)

In [19]:
data = test_features
pred = clf.predict(data)
ct = 0
for i in range(0, len(pred)):
    if pred[i] != test_labels[i]:
        ct += 1
print('No. of errors: ', ct)

No. of errors:  4629


In [None]:
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

In [7]:
# 2nd method 
X_test = test_features
y_test = test_labels
X = all_features
y = np.array(labels)
svclassifier = SVC(kernel = 'linear', max_iter = 100000)
svclassifier.fit(X, y)
y_pred = svclassifier.predict(X_test)



In [8]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[5812 7706]
 [5036 1857]]
              precision    recall  f1-score   support

           0       0.54      0.43      0.48     13518
           1       0.19      0.27      0.23      6893

    accuracy                           0.38     20411
   macro avg       0.36      0.35      0.35     20411
weighted avg       0.42      0.38      0.39     20411



In [None]:
# 3rd method
svclassifier_1 = SVC(kernel = 'poly', degree = 8, max_iter = 100000)
svclassifier_1.fit(X, y)
y_pred_1 = svclassifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test, y_pred_1))
print(classification_report(y_test, y_pred_1))

In [17]:
# Gaussian kernel
svclassifier_2 = SVC(kernel = 'rbf')
svclassifier_2.fit(X, y)
y_pred_2 = svclassifier.predict(X_test)

NameError: name 'X' is not defined

In [None]:
print(confusion_matrix(y_test, y_pred_2))
print(classification_report(y_test, y_pred_2))

In [None]:
# sigmoid kernel
svclassifier_3 = SVC(kernel = 'sigmoid')
svclassifier_3.fit(X, y)
y_pred_3 = svclassifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test, y_pred_3))
print(classification_report(y_test, y_pred_3))