In [1]:
import sys
import os

import pandas as pd
import numpy as np
import natsort
import random as rn
import skvideo.io
import tqdm

import matplotlib.pyplot as plt

#Sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPRegressor
from sklearn.svm import (SVC, SVR)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (AdaBoostRegressor, RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, BaggingRegressor)
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

from tqdm import tqdm_notebook as tqdm

In [2]:
def get_videos_from_folder(data_folder):
    '''
    get a list of video x wehre each video is a numpy array in the format [n_frames,width,height] 
    with uint8 elements.
    argument: relative path to the data_folder from the source folder.
    '''
    data_folder = os.path.join(dir_path,data_folder)
    x = []
    file_names = []
    
    if os.path.isdir(data_folder):
        for dirpath, dirnames, filenames in os.walk(data_folder):
            filenames = natsort.natsorted(filenames,reverse=False)
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                statinfo = os.stat(file_path)
                if statinfo.st_size != 0:
                    video = skvideo.io.vread(file_path, outputdict={"-pix_fmt": "gray"})[:, :, :, 0]
                    x.append(video)
                    file_names.append(int(filename.split(".")[0]))

    indices = sorted(range(len(file_names)), key=file_names.__getitem__)
    x = np.take(x,indices)
    return x

def get_target_from_csv(csv_file):
    '''
    get a numpy array y of labels. the order follows the id of video. 
    argument: relative path to the csv_file from the source folder.
    '''
    csv_file = os.path.join(dir_path,csv_file)
    with open(csv_file, 'r') as csvfile:
        label_reader = pd.read_csv(csvfile)
        #print("Labels: ", label_reader['id'])
        y = label_reader['y']
        
    y = np.array(y)
    return y



def extract_features(videos):

    #Extracting features
    height_im = videos[0][0].shape[0]
    
    X_video_features = []
    
    for video in tqdm(videos):
        all_features = []
        n_parts = len(video)
        for part in video:
            feature_vec = []
            part_grad = np.diff(part,axis = 0) #frame-wise gradient
            height_im_grad = part_grad.shape[1]
            width_im_grad = part_grad.shape[2]

            # compute the sum of pixels per frame
            heartBeatApprox = np.sum(np.sum(part, axis=1), axis=1)
            feature_vec.append(np.min(heartBeatApprox))
            feature_vec.append(np.max(heartBeatApprox))
            feature_vec.append(np.mean(heartBeatApprox))
            feature_vec.append(np.std(heartBeatApprox))
            feature_vec.append(np.std(heartBeatApprox))
            feature_vec += heartBeatApprox.tolist()


            for idx in range(height_im):
                feature_vec.append(np.mean(part[:,:,idx])) #entire video video mean_column_pixels            1
                feature_vec.append(np.mean(part[:,idx,:])) #entire video mean_row_pixels             2
                feature_vec.append(np.std(part[:,:,idx])) #entire video std_column_pixels            3
                feature_vec.append(np.std(part[:,idx,:])) #entire video std_row_pixels               4
                feature_vec.append(np.count_nonzero(part[:,:,idx])) #entire nonzero_column_pixels    5
                feature_vec.append(np.count_nonzero(part[:,idx,:])) #entire nonzero_row_pixels       6

            # ? TODO ? Taking some of the same previous features but just frame-wise?
            feature_vec.append(np.mean(part_grad)) #mean_video_grad    1
            feature_vec.append(np.std(part_grad)) #std_video_grad      2
            feature_vec.append(np.mean(part_grad)) #mean_frame_grad    3


            for frame_grad in part_grad:
                feature_vec.append(np.mean(frame_grad))
                feature_vec.append(np.std(frame_grad)) #Std of the gradient of the single frame std_frame_grad      1

                for idx in range(height_im_grad):            
                    feature_vec.append(np.mean(frame_grad[:,idx])) #mean_grad_column_pixels   1
                    feature_vec.append(np.mean(frame_grad[idx,:])) #mean_grad_row_pixels           2
                    feature_vec.append(np.std(frame_grad[:,idx])) #std_grad_column_pixels          3
                    feature_vec.append(np.std(frame_grad[idx,:])) #std_grad_row_pixels               4
                    feature_vec.append(np.count_nonzero(frame_grad[:,idx])) #nonzero_grad_column_pixels  5
                    feature_vec.append(np.count_nonzero(frame_grad[idx,:])) #nonzero_grad_row_pixels     6

            total_features = len(feature_vec)
            all_features.append(feature_vec)

        X_features = np.zeros((n_parts,total_features))

        for i in range(n_parts):
            X_features[i,:] = all_features[i]
        
        X_video_features.append(X_features)
    return np.concatenate([X_video_features])

def make_submission(filename, predictions):
    ids = extract_ids(test_folder)
    df = pd.DataFrame({'id':ids, 'y':predictions})
    df[["id", "y"]].to_csv("submissions/"+filename, index= False)

In [3]:
seed=42
np.random.seed(seed)
rn.seed(seed)
dir_path = os.getcwd()

train_folder = os.path.join(dir_path,"data/train/")
test_folder = os.path.join(dir_path,"data/test/")

train_target = os.path.join(dir_path,'data/train_target.csv')

print("Current dir -> ", dir_path)
print("Train folder -> ",train_folder)
print("Train target -> ",train_target)
print("Test folder -> ",test_folder)

#Load data from csv file
print("Train Data\n")
x_train = get_videos_from_folder(train_folder) #List of numpy arrays
y_train = get_target_from_csv(train_target) #Numpy array of labels
print("Test Data\n")
x_test = get_videos_from_folder(test_folder) #List of numpy arrays

Current dir ->  C:\Development\AML-18\task4
Train folder ->  C:\Development\AML-18\task4\data/train/
Train target ->  C:\Development\AML-18\task4\data/train_target.csv
Test folder ->  C:\Development\AML-18\task4\data/test/
Train Data

Test Data



In [4]:
   
def split_into_parts(x_data, n_frames, y_data=None):

    height_im = x_data[0].shape[1]
    width_im = x_data[0].shape[2]
    
    videos = []
    video_ids = []
    video_labels = []
    
    n_videos = x_data.shape[0]

    for v_id in range(n_videos):
        video = x_data[v_id]
        if y_data is not None:
            label = y_data[v_id]
            
        n_subsamples = int(video.shape[0]/n_frames)
        parts = []
        parts_v_ids = []
        parts_labels = []
        
        for i in range(n_subsamples):
            from_frame = i*n_frames
            to_frame = from_frame + n_frames
            parts.append(video[from_frame:to_frame,:,:])
            parts_v_ids.append(v_id)
            
            if y_data is not None:
                parts_labels.append(label)
        
        
        
        
        videos.append(np.concatenate([parts]))
        video_ids.append(np.concatenate([parts_v_ids]))
        
        if y_data is not None:
            video_labels.append(np.concatenate([parts_labels]))
            
    X = np.concatenate([videos])
    v_idx = np.concatenate([video_ids])
    
    if y_data is not None:
        y = np.concatenate([video_labels])
        
        return X, v_idx, y
        
    else:
        return X, v_idx
    

def combine_parts_pred(y_pred, v_idx):
    d = {}
    for v_id, pred in zip(v_idx, y_pred):
        if v_id not in d:
            d[v_id] = []
        
        d[v_id].append(pred)
        
    
    results = []
    for v_id, preds in d.items():
        results.append({"id":v_id,"y":sum(preds) / float(len(preds))})
    
    df = pd.DataFrame(results)
    return df
    


In [None]:
# Create a feature selctor base on a random forest
sfm = SelectFromModel(RandomForestClassifier(n_estimators=10000, random_state=seed, n_jobs=-1), threshold=0.0001)

# Train the classifier
sfm.fit(X_train_scaled, Y)
print(f"Using: {np.sum(sfm.get_support())} features")

In [5]:
rf = RandomForestRegressor(n_estimators=2000,
                                       random_state=seed,
                                       n_jobs=-1,
                                       verbose=False)

rfc = RandomForestClassifier(n_estimators=2000, 
                             criterion='gini', 
                             max_depth=None, 
                             min_samples_split=2, 
                             min_samples_leaf=1,
                             min_weight_fraction_leaf=0.0, 
                             max_features='auto', 
                             max_leaf_nodes=None, 
                             min_impurity_decrease=0.0, 
                             min_impurity_split=None, 
                             bootstrap=True, 
                             oob_score=False, n_jobs=-1, 
                             random_state=seed, 
                             verbose=0, 
                             warm_start=False, 
                             class_weight=None)

classifiers = [rf]
classifiers_names = ["RandomForestRegressor"]

In [6]:
n_frames = 22


print(f"x_train: {x_train.shape}")
print(f"y_train: {y_train.shape}")
print(f"x_test: {x_test.shape}")

scaler = StandardScaler()

print("Splitting videos into parts...")
x_train_parts, train_idx_parts, y_train_parts = split_into_parts(x_data=x_train, n_frames=n_frames, y_data=y_train)



print("Extracting features from parts...")
x_train_parts_features = extract_features(x_train_parts)

print(f"xtrain_parts_features: {x_train_parts_features.shape}")
print(f"xtrain_parts_features[0]: {x_train_parts_features[0].shape}")

x_train_parts_features_unrolled = np.concatenate(x_train_parts_features)
print(f"Unrolled: {x_train_parts_features_unrolled.shape}")

print("Fitting Standard Scalar...")
scaler.fit(x_train_parts_features_unrolled)

x_train: (158,)
y_train: (158,)
x_test: (69,)
Splitting videos into parts...
Extracting features from parts...


HBox(children=(IntProgress(value=0, max=158), HTML(value='')))


xtrain_parts_features: (158,)
xtrain_parts_features[0]: (3, 12804)
Unrolled: (402, 12804)
Fitting Standard Scalar...


StandardScaler(copy=True, with_mean=True, with_std=True)

In [7]:
seed = 42
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

clf_scores_parts_avg = []
clf_scores_parts_std = []

clf_scores_video_avg = []
clf_scores_video_std = []

verbose = False


print("Start")
for clf in tqdm(classifiers, desc="Classifier: "):
    roc_auc_parts_scores = []
    roc_auc_video_scores = []
    
    for train, valid in tqdm(kfold.split(x_train_parts_features , y_train)):
        
        # split training set into parts
        x_train_fold = x_train_parts_features[train]
        y_train_fold = y_train_parts[train] 
        
        # split validation set into parts
        x_valid_fold = x_train_parts_features[valid]
        y_valid_fold = y_train_parts[valid]
        y_valid_fold_videos = y_train[valid]
        idx_valid_fold = train_idx_parts[valid]
        
        # unrolling from [video id, part idx, feature] to [part idx, feature]
        x_train_fold_unrolled = np.concatenate(x_train_fold)
        y_train_fold_unrolled = np.concatenate(y_train_fold)
        
        x_valid_fold_unrolled = np.concatenate(x_valid_fold)
        y_valid_fold_unrolled = np.concatenate(y_valid_fold)
        idx_valid_fold_unrolled = np.concatenate(idx_valid_fold)
        
        if verbose:
            print(f"x_train_fold: {x_train_fold.shape}")
            print(f"y_train_fold: {y_train_fold.shape}")
            print(f"x_train_fold_unrolled: {x_train_fold_unrolled.shape}")
            print(f"y_train_fold_unrolled: {y_train_fold_unrolled.shape}")
            
            print(f"x_valid_fold: {x_valid_fold.shape}")
            print(f"y_valid_fold: {y_valid_fold.shape}")
            print(f"idx_valid_fold: {idx_valid_fold.shape}")
            print(f"x_valid_fold_unrolled: {x_valid_fold_unrolled.shape}")
            print(f"y_valid_fold_unrolled: {y_valid_fold_unrolled.shape}")
            print(f"idx_valid_fold_unrolled: {idx_valid_fold_unrolled.shape}")
        
        # scale the extracted features to zero mean and unit variance 
        x_train_fold_scaled = scaler.transform(x_train_fold_unrolled)
        x_valid_fold_scaled = scaler.transform(x_valid_fold_unrolled)
        
        
        # Shuffle the training data
        indices = np.arange(x_train_fold_scaled.shape[0])
        np.random.shuffle(indices)
        x_train_fold_scaled = x_train_fold_scaled[indices]
        y_train_fold_unrolled = y_train_fold_unrolled[indices]
    
        # use only important features
        #X_important_train = sfm.transform(X_fold)
        #X_important_test = sfm.transform(X_fold_test)
        
        # fit classifier
        clf.fit(x_train_fold_scaled, y_train_fold_unrolled)
        
        y_valid_fold_pred = clf.predict(x_valid_fold_scaled)

        # calculate the roc auc based on per part predictions
        roc_auc_parts = roc_auc_score(y_true=y_valid_fold_unrolled, y_score=y_valid_fold_pred)
        roc_auc_parts_scores.append(roc_auc_parts)
        print(f"roc auc parts: {roc_auc_parts}")
        
        # calculate the roc auc based on per video predictions
        df = combine_parts_pred(y_pred=y_valid_fold_pred, v_idx=idx_valid_fold_unrolled)
        if verbose:
            display(df)
        roc_auc_video = roc_auc_score(y_true=y_valid_fold_videos, y_score=df['y'].values)
        roc_auc_video_scores.append(roc_auc_video)
        print(f"roc auc video: {roc_auc_video}")
        
    
    clf_scores_parts_avg.append(np.mean(roc_auc_parts_scores))
    clf_scores_parts_std.append(np.std(roc_auc_parts_scores))

    clf_scores_video_avg.append(np.mean(roc_auc_video_scores))
    clf_scores_video_std.append(np.std(roc_auc_video_scores))
        
    print("========================================")
for i in range(len(classifiers)):
    print(f"{classifiers_names[i]} roc_auc parts avg score {clf_scores_parts_avg[i]} +/- {clf_scores_parts_std[i]} roc_auc video avg score {clf_scores_video_avg[i]} +/- {clf_scores_video_std[i]}")

Start


HBox(children=(IntProgress(value=0, description='Classifier: ', max=1, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

roc auc parts: 0.6940104166666667
roc auc video: 0.640625
roc auc parts: 0.4624505928853755
roc auc video: 0.5625
roc auc parts: 0.8530020703933748
roc auc video: 0.921875
roc auc parts: 0.48639455782312924
roc auc video: 0.5625
roc auc parts: 0.7946428571428571
roc auc video: 0.765625
roc auc parts: 0.81203007518797
roc auc video: 0.8125
roc auc parts: 0.5051652892561984
roc auc video: 0.5
roc auc parts: 0.4935897435897436
roc auc video: 0.5390625
roc auc parts: 0.825925925925926
roc auc video: 0.859375
roc auc parts: 0.782051282051282
roc auc video: 0.7755102040816326

RandomForestRegressor roc_auc parts avg score 0.6709262810922524 +/- 0.15547129040864743 roc_auc video avg score 0.6939572704081632 +/- 0.14288709232029995


In [None]:
#Predict with one classifiers
X_train_importance = sfm.transform(X_train_scaled)
X_test_importance = sfm.transform(X_test_scaled)
print("start fitting")
rf.fit(X_train_importance, Y)

In [12]:
# split videos in parts of 22 frames 
x_test_parts, test_idx_parts = split_into_parts(x_data=x_test, n_frames=n_frames)

# extract features for each part
x_test_parts_features = extract_features(x_test_parts)

# unroll [video, part_id, features] to [part_id, features]
x_test_parts_features_unrolled = np.concatenate(x_test_parts_features)
test_idx_parts_unrolled = np.concatenate(test_idx_parts)

# scale data
x_test_scaled = scaler.transform(x_test_parts_features_unrolled)

# predict 
y_test_pred = clf.predict(x_test_scaled)

# combine preidctions of multiple parts per video into single prediction by averaging
df = combine_parts_pred(y_pred=y_test_pred, v_idx=test_idx_parts_unrolled)

display(df)

Unnamed: 0,id,y
0,0,0.465000
1,1,0.295000
2,2,0.468333
3,3,0.630000
4,4,0.567500
5,5,0.520000
6,6,0.697500
7,7,0.623333
8,8,0.520000
9,9,0.412500
