In [3]:
import sys
import os

import pandas as pd
import numpy as np
import natsort
import random as rn
import skvideo.io
import tqdm
import tensorflow as tf

#Keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten
from keras import optimizers

#Sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPRegressor
from sklearn.svm import (SVC, SVR)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

seed=42
np.random.seed(seed)
rn.seed(seed)
tf.set_random_seed(seed)
dir_path = os.getcwd()

train_folder = os.path.join(dir_path,"data/train/")
test_folder = os.path.join(dir_path,"data/test/")

train_target = os.path.join(dir_path,'data/train_target.csv')

print("Current dir -> ", dir_path)
print("Train folder -> ",train_folder)
print("Train target -> ",train_target)
print("Test folder -> ",test_folder)


def get_videos_from_folder(data_folder):
    '''
    get a list of video x wehre each video is a numpy array in the format [n_frames,width,height] 
    with uint8 elements.
    argument: relative path to the data_folder from the source folder.
    '''
    data_folder = os.path.join(dir_path,data_folder)
    x = []
    file_names = []
    
    if os.path.isdir(data_folder):
        for dirpath, dirnames, filenames in os.walk(data_folder):
            print("Default ordering of filenames loading: ",filenames, "\n")
            filenames = natsort.natsorted(filenames,reverse=False)
            print("Reordered ordering of filenames loading: ", filenames, "\n")
            for filename in filenames:
                file_path = os.path.join(dirpath, filename)
                statinfo = os.stat(file_path)
                if statinfo.st_size != 0:
                    video = skvideo.io.vread(file_path, outputdict={"-pix_fmt": "gray"})[:, :, :, 0]
                    x.append(video)
                    file_names.append(int(filename.split(".")[0]))

    indices = sorted(range(len(file_names)), key=file_names.__getitem__)
    x = np.take(x,indices)
    return x

def get_target_from_csv(csv_file):
    '''
    get a numpy array y of labels. the order follows the id of video. 
    argument: relative path to the csv_file from the source folder.
    '''
    csv_file = os.path.join(dir_path,csv_file)
    with open(csv_file, 'r') as csvfile:
        label_reader = pd.read_csv(csvfile)
        #print("Labels: ", label_reader['id'])
        y = label_reader['y']
        
    y = np.array(y)
    return y


def save_solution(csv_file,prob_positive_class):
    with open(csv_file, 'w') as csv:
        df = pd.DataFrame.from_dict({'id':range(len(prob_positive_class)),'y': prob_positive_class})
        df.to_csv(csv,index = False)


Current dir ->  /home/francesco/Scrivania/AML-18/task4
Train folder ->  /home/francesco/Scrivania/AML-18/task4/data/train/
Train target ->  /home/francesco/Scrivania/AML-18/task4/data/train_target.csv
Test folder ->  /home/francesco/Scrivania/AML-18/task4/data/test/


In [4]:
#Support function from the given repo for the project

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def save_tf_record(x,file_name,y = None):
    writer = tf.python_io.TFRecordWriter(file_name)
    if y is None:
        for video in x:
            sys.stdout.flush()
            feature = {'len': _int64_feature(video.shape[0]),
                       'height': _int64_feature(video.shape[1]),
                       'width': _int64_feature(video.shape[2]),
                       'video': _bytes_feature(tf.compat.as_bytes(video.tostring()))}
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
    else:
        for video,label in zip(x,y):
            sys.stdout.flush()
            feature = {'len': _int64_feature(video.shape[0]),
                       'height': _int64_feature(video.shape[1]),
                       'width': _int64_feature(video.shape[2]),
                       'video': _bytes_feature(tf.compat.as_bytes(video.tostring())),
                       'label': _int64_feature(label)}
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
    
    writer.close()
    sys.stdout.flush()

def prob_positive_class_from_prediction(pred):
    return np.array([p['probabilities'][1] for p in pred])

def decode(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            'len': tf.FixedLenFeature([], tf.int64),
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64,default_value = 0),
            'video': tf.FixedLenFeature([], tf.string),
        })
    video = tf.decode_raw(features['video'], tf.uint8)
    height = features['height']
    width = features['width']
    length = features['len']
    shape = tf.stack([length,height,width])
    video = tf.reshape(video,shape)
    label = features['label']
    features = {'video':video}
    return features,label

def input_fn_from_dataset(files,batch_size = 1,num_epochs = None,shuffle = True):
    data_set = tf.data.TFRecordDataset(files)
    if shuffle:
        data_set = data_set.shuffle(buffer_size=len(files)) 
    data_set = data_set.map(decode)
    data_set = data_set.padded_batch(batch_size,padded_shapes= ({'video':[212,100,100]},[]))
    data_set = data_set.repeat(num_epochs)
    data_set = data_set.prefetch(batch_size)
    
    return data_set

def decode_frame(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            'len': tf.FixedLenFeature([], tf.int64),
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64,default_value = 0),
            'video': tf.FixedLenFeature([], tf.string),
        })
    video = tf.decode_raw(features['video'], tf.uint8)
    height = features['height']
    width = features['width']
    length = features['len']
    shape = tf.stack([length,height,length])
    video = tf.reshape(video,shape)
    label = features['label']
    label = tf.expand_dims(label,axis=-1)
    label = tf.tile(label,tf.expand_dims(length,axis=-1))
    features = {'frame':video}
    return features,label

def input_fn_frame_from_dataset(files,batch_size = 1,num_epochs = None):
    data_set = tf.data.TFRecordDataset(files)
    data_set = data_set.shuffle(buffer_size=len(files)) 
    data_set = data_set.map(decode_frame)
    data_set = data_set.shuffle(buffer_size=batch_size)
    data_set = data_set.apply(tf.contrib.data.unbatch())
    data_set = data_set.batch(batch_size)
    data_set = data_set.repeat(num_epochs)
    data_set = data_set.prefetch(batch_size)
    
    return data_set

In [5]:
#Load data from csv file
print("Train Data\n")
x_train = get_videos_from_folder(train_folder) #List of numpy arrays
y_train = get_target_from_csv(train_target) #Numpy array of labels
print("Test Data\n")
x_test = get_videos_from_folder(test_folder) #List of numpy arrays

Train Data

Default ordering of filenames loading:  ['116.avi', '66.avi', '92.avi', '69.avi', '104.avi', '113.avi', '117.avi', '153.avi', '102.avi', '36.avi', '72.avi', '82.avi', '73.avi', '18.avi', '27.avi', '144.avi', '99.avi', '30.avi', '54.avi', '140.avi', '125.avi', '114.avi', '42.avi', '83.avi', '21.avi', '0.avi', '122.avi', '78.avi', '119.avi', '31.avi', '4.avi', '93.avi', '67.avi', '79.avi', '136.avi', '147.avi', '16.avi', '48.avi', '12.avi', '142.avi', '46.avi', '150.avi', '59.avi', '107.avi', '55.avi', '96.avi', '151.avi', '105.avi', '143.avi', '51.avi', '120.avi', '131.avi', '7.avi', '61.avi', '86.avi', '25.avi', '2.avi', '8.avi', '98.avi', '110.avi', '44.avi', '89.avi', '112.avi', '145.avi', '76.avi', '68.avi', '138.avi', '29.avi', '74.avi', '100.avi', '111.avi', '126.avi', '128.avi', '132.avi', '58.avi', '23.avi', '135.avi', '47.avi', '137.avi', '134.avi', '81.avi', '108.avi', '41.avi', '1.avi', '70.avi', '56.avi', '95.avi', '60.avi', '28.avi', '149.avi', '37.avi', '34.avi

In [6]:
#Custom support functions

#Count for the minimum number of frames video among the list_array of videos
def count_min_number_frames(list_array):
    min_frames = 999
    for sample in list_array:
        if sample.shape[0] < min_frames:
            min_frames = sample.shape[0]
            
    return min_frames

#Count number of subsamples given the min_frames
def count_new_subsamples(list_array, min_frames, train_set=True, labels=[]):
    """Params:
       - list array: list of videos (list of numpy array)
       - min_frames: number of frames per subsample (equal to the minimum number fo frames among training and test videos)
       - train_set: set to false if the dataset passed is the test set
       - labels: pass the labels for the train set
       Return:
       - n_new_subsamples 
         if train set == true -> int(video frames/min_frames)
         if test set == false -> len(list_array) just pick one subsample of min_frames per video and discard the other frames
    """
    classes = [0,0]
    i = 0
    n_new_samples = 0
    for sample in list_array:
        n_new_samples += int(sample.shape[0]/min_frames)
        if train_set:
            classes[labels[i]] += int(sample.shape[0]/min_frames)
        i+=1
    print("Train set" if train_set==True else "Test set")
    print(n_new_samples)
    print("Labels per class ->",classes if train_set==True else "")
    return n_new_samples



def extract_features(X_videos):
    total_videos = len(X_videos)
    total_features = 13205
    #Extracting features
    X_features = np.zeros((total_videos,total_features))
    video_n = 0
    for video in X_videos:
        print("Video shape -> ",video.shape)
        video_grad = np.diff(video,axis = 0) #frame-wise gradient
        print("Video gradient shape -> ",video_grad.shape)
        height_im_grad = video_grad.shape[1]
        width_im_grad = video_grad.shape[2]

        for idx in range(height_im):

            X_features[video_n][idx*6] = np.mean(video[:,:,idx]) #entire video video mean_column_pixels            1
            X_features[video_n][idx*6+1] = np.mean(video[:,idx,:]) #entire video mean_row_pixels             2
            X_features[video_n][idx*6+2] = np.std(video[:,:,idx]) #entire video std_column_pixels            3
            X_features[video_n][idx*6+3] = np.std(video[:,idx,:]) #entire video std_row_pixels               4
            X_features[video_n][idx*6+4] = np.count_nonzero(video[:,:,idx]) #entire nonzero_column_pixels    5
            X_features[video_n][idx*6+5] = np.count_nonzero(video[:,idx,:]) #entire nonzero_row_pixels       6
        
        # ? TODO ? Taking some of the same previous features but just frame-wise?
        idx_next = height_im*6
      
        X_features[video_n][idx_next+1] = np.mean(video_grad) #mean_video_grad    1
        X_features[video_n][idx_next+2] = np.std(video_grad) #std_video_grad      2
        X_features[video_n][idx_next+3] = np.mean(video_grad) #mean_frame_grad    3
    
        idx_next = idx_next+4

        idx_frame_grad = 0
        
        for frame_grad in video_grad:

            X_features[video_n][idx_next  + 1 +   idx_frame_grad*6*height_im_grad] = np.std(frame_grad) #Std of the gradient of the single frame std_frame_grad      1

            for idx in range(height_im_grad):
                
                #print("Inside inside: ",idx_next  + 1 +   idx*6 + idx_frame_grad*6*height_im_grad)
                X_features[video_n][idx_next  + 1 +   idx*6 + idx_frame_grad*6*height_im_grad] = np.mean(frame_grad[:,idx]) #mean_grad_column_pixels   1
                X_features[video_n][idx_next  + 2 +   idx*6 + idx_frame_grad*6*height_im_grad] = np.mean(frame_grad[idx,:]) #mean_grad_row_pixels           2
                X_features[video_n][idx_next  + 3 +   idx*6 + idx_frame_grad*6*height_im_grad] = np.std(frame_grad[:,idx]) #std_grad_column_pixels          3
                X_features[video_n][idx_next  + 4 +   idx*6 + idx_frame_grad*6*height_im_grad] = np.std(frame_grad[idx,:]) #std_grad_row_pixels               4
                X_features[video_n][idx_next  + 5 +   idx*6 + idx_frame_grad*6*height_im_grad] = np.count_nonzero(frame_grad[:,idx]) #nonzero_grad_column_pixels  5
                X_features[video_n][idx_next  + 6 +   idx*6 + idx_frame_grad*6*height_im_grad] = np.count_nonzero(frame_grad[idx,:]) #nonzero_grad_row_pixels     6
                #print("Finish Inside inside: ",idx_next  + 6 +   idx*6 + idx_frame_grad*6*height_im_grad)
        
            idx_frame_grad+=1
    
        print(video_n)
        video_n+=1
    return X_features


def extract_ids(data_folder):
    
    print("Extracting ids from test set videos")
    data_folder = os.path.join(dir_path,data_folder)
    x = []
    file_names = []
    
    if os.path.isdir(data_folder):
        for dirpath, dirnames, filenames in os.walk(data_folder):
            print("Default ordering of filenames loading: ",filenames, "\n")
            filenames = natsort.natsorted(filenames,reverse=False)
            print("Reordered ordering of filenames loading: ", filenames, "\n")
            ids = []
            for filename in filenames:
              ids.append(int(filename.split(".")[0]))
    return ids

def make_submission(filename, predictions):
    ids = extract_ids(test_folder)
    df = pd.DataFrame({'id':ids, 'y':predictions})
    df[["id", "y"]].to_csv("submissions/"+filename, index= False)

In [7]:
#Compute minimum number of frames per video in train and test set
min_frames_train = count_min_number_frames(x_train)
min_frames_test = count_min_number_frames(x_test)

#Pick minimum number of frames among all dataset
min_frames = min_frames_train if min_frames_train<min_frames_test else min_frames_test
print("Minimum number of frames among video dataset -> ",min_frames)

#Count the number of new subsamples per video of min_frames
n_train_subsamples = count_new_subsamples(x_train, min_frames, train_set = True, labels=y_train)
n_test_subsamples = count_new_subsamples(x_test, min_frames, train_set = False)

Minimum number of frames among video dataset ->  22
Train set
402
Labels per class -> [182, 220]
Test set
183
Labels per class -> 


In [10]:
#Construct new datasets from subsamples

height_im = x_train[0].shape[1]
width_im = x_train[0].shape[2]
X = np.zeros((n_train_subsamples, min_frames, height_im, width_im))
Y = np.zeros((n_train_subsamples))


#TRAIN

sample_idx = 0
subsample_idx = 0

#Train subsamples
for sample_video in x_train:
    
    subsamples = int(sample_video.shape[0]/min_frames)
    
    for i in range(subsamples):
        X[subsample_idx,:,:,:] = sample_video[i*min_frames : i*min_frames+min_frames, :, :]
        Y[subsample_idx] = y_train[sample_idx]
        subsample_idx+=1
        
    sample_idx+=1

    
#TEST
X_test = np.zeros((len(x_test), min_frames, height_im, width_im))


sample_idx = 0
subsample_idx = 0

#Test subsamples
# ? TODO ? create test set subsamples and go for a maximum consensus or other tecniques for prediction?
for sample_video in x_test:
    
    X_test[subsample_idx,:,:,:] = sample_video[0 : min_frames, :, :]
    """    subsamples = int(sample.shape[0]/min_frames)
    
    for i in range(subsamples):
        X[subsample_idx,:,:,:] = sample[i*min_frames : i*min_frames+min_frames, :, :]
        Y[subsample_idx] = y_train[sample_idx]
        subsample_idx+=1"""
    subsample_idx+=1
    #sample_idx+=1

    
#Reshaping for (n_samples, n_frames, height_frame, width_frame )
X = np.reshape(X,(402,22,100,100))
X_test = np.reshape(X_test, (len(x_test), min_frames, x_test[0].shape[1], x_test[0].shape[2]))
print("New training samples shape -> ", X.shape)
print("New test samples shape -> ", X_test.shape)

New training samples shape ->  (402, 22, 100, 100)
New test samples shape ->  (69, 22, 100, 100)


In [11]:
#Scale data -> NB: takes about 3 mins in the current implementation

#TODO use standard scaler and scaler, fit w.r.t all data

X_train_features = extract_features(X)
X_train_scaled = preprocessing.scale(X_train_features)
print(X_train_scaled)
print(X_train_scaled.shape)


X_test_features = extract_features(X_test)
X_test_scaled = preprocessing.scale(X_test_features)
print(X_test_scaled)
print(X_test_scaled.shape)

Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
0
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
1
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
2
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
3
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
4
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
5
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
6
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
7
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
8
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
9
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
10
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
11
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
12
Video shape ->  (22, 100, 100)
Video gradient sh

111
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
112
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
113
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
114
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
115
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
116
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
117
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
118
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
119
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
120
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
121
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
122
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
123
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
124
Video shape ->  (22, 

221
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
222
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
223
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
224
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
225
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
226
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
227
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
228
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
229
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
230
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
231
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
232
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
233
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
234
Video shape ->  (22, 

331
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
332
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
333
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
334
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
335
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
336
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
337
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
338
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
339
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
340
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
341
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
342
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
343
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
344
Video shape ->  (22, 

34
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
35
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
36
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
37
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
38
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
39
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
40
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
41
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
42
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
43
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
44
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
45
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
46
Video shape ->  (22, 100, 100)
Video gradient shape ->  (21, 100, 100)
47
Video shape ->  (22, 100, 100)
Vide

In [12]:
#Freeing memory
X = []
x_train = []
x_test = []

In [13]:
mlp = MLPRegressor(solver='adam',
                           activation='relu',
                           learning_rate='adaptive',
                           max_iter=2000,
                           random_state=seed,
                           hidden_layer_sizes=(300,300),
                           verbose=False)

gb = GradientBoostingRegressor(random_state=seed,
                                       n_estimators=1000,
                                       max_depth=5,
                                       learning_rate=0.1)

rf = RandomForestRegressor(n_estimators=1000,
                                       random_state=seed,
                                       n_jobs=-1,
                                       verbose=False)

svc = SVC()

svr = SVR(kernel='rbf', degree=3, tol=0.01, C=1.0, epsilon=0.5)

ab = AdaBoostRegressor(n_estimators=1000, 
                       learning_rate=1.0, 
                       loss='square',
                       base_estimator = DecisionTreeRegressor(max_depth=50, random_state=seed),
                       random_state=seed)
lr = LogisticRegression(solver="lbfgs",
                                   multi_class="multinomial",
                                   max_iter=5000,
                                   random_state=seed,
                                   class_weight='balanced')

classifiers = [mlp, gb, rf, svc, svr, ab, lr]
classifiers_names = ["MLPRegressor", "GradientBoostingRegressor", "RandomForestRegressor", "SVC", "SVR", "AdaBoostRegressor", "LogisticRegression"  ]

In [133]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
clf_scores_avg = []
clf_scores_std = []

X = X_train_scaled
y = Y

print("Start")
for clf in classifiers:
    roc_auc_scores = []
    for train, test in kfold.split(X, y):
        X_fold = X[train]
        y_fold = y[train]
        X_fold_test = X[test]
        y_valid = y[test]
        
        #scaler = StandardScaler() 
        #scaler.fit(X_fold)

        #x_train_scaled = scaler.transform(X_fold)
        #x_test_scaled = scaler.transform(X_fold_test)
        x_train_scaled = X_fold
        x_test_scaled = X_fold_test
        #class_weight = get_weights(y_fold, X_fold.shape[0])

        clf.fit(x_train_scaled, y_fold)

        y_pred = clf.predict(x_test_scaled)
        y_true = y[test]
        roc_auc = roc_auc_score(y_true, y_pred)

        roc_auc_scores.append(roc_auc)
    clf_scores_avg.append(np.mean(roc_auc_scores))
    clf_scores_std.append(np.std(roc_auc_scores))
for i in range(len(classifiers)):
    print(f"{classifiers_names[i]} roc_auc avg score {clf_scores_avg[i]} +/- {clf_scores_std[i]}" )

Start
MLPRegressor roc_auc avg score 0.8114965443912812 +/- 0.05729103938725251


IndexError: list index out of range

In [130]:
for i in range(len(classifiers)):
    print(f"{classifiers_names[i]} roc_auc avg score {clf_scores_avg[i]} +/- {clf_scores_std[i]}" )

MLPRegressor roc_auc avg score 0.6557283359914939 +/- 0.03928407204527997
GradientBoostingRegressor roc_auc avg score 0.8718766613503455 +/- 0.06237703803872251
RandomForestRegressor roc_auc avg score 0.8877126528442318 +/- 0.04600035975699656
SVC roc_auc avg score 0.7483386496544391 +/- 0.06607772920014919
SVR roc_auc avg score 0.5 +/- 0.0
AdaBoostRegressor roc_auc avg score 0.8059409888357256 +/- 0.08126187344261152
LogisticRegression roc_auc avg score 0.7186071238702818 +/- 0.050120700989840256


In [115]:
#Predict with one classifiers
#p_rf = classifiers[1].predict(X_test_scaled)
rf.fit(X_train_scaled, Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=4000, n_jobs=-1,
           oob_score=False, random_state=42, verbose=False,
           warm_start=False)

In [116]:
p_rf = rf.predict(X_test_scaled)

In [117]:
print(p_rf)

[0.30225 0.2175  0.58625 0.72475 0.389   0.45025 0.7305  0.52675 0.528
 0.39575 0.52525 0.368   0.68225 0.2785  0.47125 0.80825 0.56075 0.8065
 0.69075 0.55375 0.463   0.83475 0.81075 0.32825 0.62175 0.68975 0.4615
 0.6255  0.5075  0.61925 0.41325 0.49875 0.3155  0.4575  0.45425 0.86075
 0.278   0.67425 0.63325 0.463   0.74975 0.511   0.209   0.49    0.465
 0.61475 0.5675  0.66975 0.69625 0.54025 0.78725 0.65875 0.2685  0.28475
 0.346   0.3385  0.59    0.6815  0.67125 0.70875 0.36625 0.51975 0.778
 0.71525 0.61675 0.63725 0.659   0.78875 0.707  ]


In [118]:
make_submission("rf_extractedfeat_4000estimator.csv",p_rf)

['66.avi', '36.avi', '18.avi', '27.avi', '30.avi', '54.avi', '42.avi', '21.avi', '0.avi', '31.avi', '4.avi', '67.avi', '16.avi', '48.avi', '12.avi', '46.avi', '59.avi', '55.avi', '51.avi', '7.avi', '61.avi', '25.avi', '2.avi', '8.avi', '44.avi', '68.avi', '29.avi', '58.avi', '23.avi', '47.avi', '41.avi', '1.avi', '56.avi', '60.avi', '28.avi', '37.avi', '34.avi', '32.avi', '50.avi', '53.avi', '15.avi', '17.avi', '14.avi', '43.avi', '64.avi', '3.avi', '9.avi', '11.avi', '19.avi', '45.avi', '20.avi', '57.avi', '35.avi', '38.avi', '22.avi', '5.avi', '49.avi', '6.avi', '39.avi', '26.avi', '62.avi', '52.avi', '24.avi', '10.avi', '33.avi', '40.avi', '63.avi', '65.avi', '13.avi']
['0.avi', '1.avi', '2.avi', '3.avi', '4.avi', '5.avi', '6.avi', '7.avi', '8.avi', '9.avi', '10.avi', '11.avi', '12.avi', '13.avi', '14.avi', '15.avi', '16.avi', '17.avi', '18.avi', '19.avi', '20.avi', '21.avi', '22.avi', '23.avi', '24.avi', '25.avi', '26.avi', '27.avi', '28.avi', '29.avi', '30.avi', '31.avi', '32.avi'