In [1]:
import warnings
warnings.filterwarnings("ignore")

#### Video imports

In [41]:
import pandas as pd
import numpy as np
import pickle
from time import time
import os
from scipy.special import softmax
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC,LinearSVC
import csv
import pathlib

#### Audio imports

In [3]:
import torchaudio
import torch
import numpy as np
from transformers import AutoConfig, Wav2Vec2Processor, AutoModelForAudioClassification
import librosa
from tqdm.auto import tqdm
import pathlib 

In [4]:
model_name_or_path = "facebook/wav2vec2-base-960h"
pooling_mode = "mean"
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
target_sampling_rate = processor.feature_extractor.sampling_rate

Downloading:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [5]:
def preprocess_function_eval(speech_path):   
    speech_array, sampling_rate = librosa.load(speech_path, sr = 16000)
    result = processor(speech_array, sampling_rate=target_sampling_rate, max_length=60000, padding="max_length", truncation='longest_first', return_attention_mask=True)
    len_of_input_data = result['input_values'][0].shape[0]
    padded_array = np.pad(result['input_values'][0], ((0,60000-len_of_input_data)), constant_values=0)
    return padded_array

In [6]:
classes = ['Positive', 'Neutral', 'Negative']
class_to_idx = {'Positive': 1, 'Neutral': 2, 'Negative': 3}
idx_to_class = {1: 'Positive', 2: 'Neutral', 3: 'Negative'}

### 5-fold train split

In [36]:
train_test_dir = '../input/vgaf-train-test-processed-dataset/created_dataset_features/'
with open(train_test_dir + 'x_train.pickle', 'rb') as handle:
    x_train = pickle.load(handle)
    
with open(train_test_dir + 'y_train.pickle', 'rb') as handle:
    y_train = pickle.load(handle)
    
with open(train_test_dir + 'has_faces_train.pickle', 'rb') as handle:
    has_faces_train = pickle.load(handle)

In [69]:
with open(train_test_dir + 'x_test.pickle', 'rb') as handle:
    x_val = pickle.load(handle)
    
with open(train_test_dir + 'y_test.pickle', 'rb') as handle:
    y_val = pickle.load(handle)
    
with open(train_test_dir + 'has_faces_test.pickle', 'rb') as handle:
    has_faces_val = pickle.load(handle)

In [70]:
x_val.shape # сохранить pickle с 766 признаками

(741, 5632)

In [67]:
txt_data = pd.read_csv('../input/vgaf-features/Train_labels.txt', sep=" ", header=None).iloc[1:, :1].reset_index(drop=True)
txt_data.columns = ["videoname"]

has_faces_column = pd.DataFrame(has_faces_train, columns = ['has_face'])

y_train_all = pd.DataFrame(y_train, columns = ['true_class'])

x_train_all = pd.DataFrame(x_train)

x_val_all = pd.DataFrame(x_val)

In [68]:
x_val_all.shape

(741, 5632)

In [59]:
del txt_data

In [26]:
X_train_all = pd.concat([txt_data, x_train_all], axis="columns") # has_faces_column
X_train_all.head()

Unnamed: 0,videoname,0,1,2,3,4,5,6,7,8,...,5622,5623,5624,5625,5626,5627,5628,5629,5630,5631
0,2_1,-0.001202,-0.040623,-0.023654,-0.094873,-0.083472,-0.081198,-0.11892,-0.121311,-0.121255,...,0.010852,0.01657,0.00246,0.008309,0.025981,0.005881,0.024872,0.026392,0.034415,0.021816
1,2_2,-0.001566,-0.045602,-0.039252,-0.082221,-0.071286,-0.072859,-0.109454,-0.139013,-0.131101,...,0.018799,0.031263,0.004277,0.014619,0.023179,0.010445,0.037011,0.023053,0.029168,0.033827
2,2_3,-0.001421,-0.046034,-0.041626,-0.072248,-0.058282,-0.08254,-0.114226,-0.140922,-0.126592,...,0.018559,0.02895,0.004272,0.022276,0.025155,0.011036,0.02679,0.014028,0.023428,0.02213
3,2_4,-0.001969,-0.045378,-0.064284,-0.060539,-0.053997,-0.074704,-0.109434,-0.154998,-0.093597,...,0.048026,0.029284,0.005174,0.015966,0.050546,0.013887,0.027765,0.012306,0.031414,0.012717
4,3_1,0.00062,-0.030695,-0.067916,-0.053613,-0.039545,-0.066786,-0.055722,-0.197108,-0.148137,...,0.020053,0.025481,0.003672,0.018554,0.04516,0.011147,0.027073,0.020037,0.033719,0.00394


In [16]:
x_train_all.shape

(2661, 5632)

In [130]:
X_train_faces_only = X_train_all[X_train_all['has_face'] == 1]
y_train_faces_only = y_train_all[X_train_all['has_face'] == 1]
X_train_faces_only.drop(columns=['has_face'], inplace = True)
X_train_all.drop(columns=['has_face'], inplace = True)
X_train_faces_only.shape, X_train_all.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


((2619, 5633), (2661, 5633))

In [119]:
X_train_faces_only.reset_index(drop=True)
y_train_faces_only.reset_index(drop=True)
print()




In [39]:
x_train_all.shape, y_train_all.shape

((2661, 5632), (2661, 1))

In [43]:
audio_models_names = ['w2v2-l-1f-0632', 'w2v2-l-2f-065', 'w2v2-l-3f-053', 'w2v2-l-4f-048', 'w2v2-l-5f-053']

In [45]:
from sklearn.model_selection import StratifiedKFold
from sklearn import svm,metrics,preprocessing

skf = StratifiedKFold(n_splits = 5)

counter = 1

test_weights = []
all_test_accuracies = []
all_val_accuracies = []
 
weights = np.arange(0.05, 0.5, 0.01)
small_weights = np.arange(0, 0.1, 0.01)

counter = 1

input_column = "Vid_name"
output_column = "Label"

for train_index, test_index in skf.split(x_train_all, y_train_all): # len должна быть = 2661
    print('{}-fold has started'.format(str(counter)))
    X_train, X_test = x_train_all.iloc[train_index], x_train_all.iloc[test_index]
    y_train, y_test = y_train_all.iloc[train_index], y_train_all.iloc[test_index]
    
    x_train_norm=preprocessing.normalize(X_train,norm='l2')
    x_test_norm=preprocessing.normalize(X_test,norm='l2')
    x_val_norm=preprocessing.normalize(x_val_all,norm='l2')
    
    test_accuracies = {
        'lin_svc': 0,
        'rf': 0,
        'audio': 0,
    }
    
    val_accuracies = {
        'lin_svc': 0,
        'rf': 0,
        'audio': 0,
    }
    
    print('SVC is fitting')
    lin_svc = svm.SVC(kernel='rbf',C=1) 
    
    lin_svc.fit(x_train_norm, y_train) # [has_faces_train==1], [has_faces_train==1]
    y_pred = lin_svc.predict(x_test_norm) 
    
    test_accuracy = metrics.accuracy_score(y_test, y_pred) # [has_faces_test==1], [has_faces_test==1]
    test_complete_accuracy = metrics.accuracy_score(y_test, y_pred)
    test_accuracies['lin_svc'] = [test_accuracy, test_complete_accuracy]
    print('SVC test accuracy - {} and complete accuracy - {}'.format(str(test_accuracy), str(test_complete_accuracy)))
    
    linsvc_proba = [] 
    linsvc_dist = lin_svc.decision_function(x_test_norm)
    for i in range(len(linsvc_dist)):
        linsvc_proba.append(softmax(linsvc_dist[i]))
    linsvc_proba = np.array(linsvc_proba)
    
    y_pred_val = lin_svc.predict(x_val_norm) 
    
    val_accuracy = metrics.accuracy_score(y_val, y_pred_val) # [has_faces_test==1]
    val_complete_accuracy = metrics.accuracy_score(y_val, y_pred_val)
    val_accuracies['lin_svc'] = [val_accuracy, val_complete_accuracy]
    print('SVC val accuracy - {} and complete accuracy - {}'.format(str(val_accuracy), str(val_complete_accuracy)))
    
    linsvc_proba_val = [] 
    linsvc_dist = lin_svc.decision_function(x_val_norm)
    for i in range(len(linsvc_dist)):
        linsvc_proba_val.append(softmax(linsvc_dist[i]))
    linsvc_proba_val = np.array(linsvc_proba_val)
    
    
    print('Random Forest is fitting')
    np.random.seed(42)

    rf_clf=RandomForestClassifier(n_estimators=1000,max_depth=12, n_jobs=-1)
  
    rf_clf.fit(x_train_norm, y_train) # [has_faces_train==1], [has_faces_train==1]
    y_pred = rf_clf.predict(x_test_norm)
    
    test_accuracy = metrics.accuracy_score(y_test, y_pred) # [has_faces_train==1], [has_faces_train==1]
    test_complete_accuracy = metrics.accuracy_score(y_test, y_pred)
    test_accuracies['rf'] = [test_accuracy, test_complete_accuracy]
    print('Random Forest test accuracy - {} and complete accuracy - {}'.format(str(test_accuracy), str(test_complete_accuracy)))
    
    rf_proba = rf_clf.predict_proba(x_test_norm) 
    
    y_pred_val = rf_clf.predict(x_val_norm)
    
    val_accuracy = metrics.accuracy_score(y_val, y_pred_val) # [has_faces_train==1], [has_faces_train==1]
    val_complete_accuracy = metrics.accuracy_score(y_val, y_pred_val)
    val_accuracies['rf'] = [val_accuracy, val_complete_accuracy]
    print('Random Forest val accuracy - {} and complete accuracy - {}'.format(str(val_accuracy), str(val_complete_accuracy)))
    
    rf_proba_val = rf_clf.predict_proba(x_val_norm)
    
    print('Audio model is fitting')
    
    model = AutoModelForAudioClassification.from_pretrained(
        '../input/' + audio_models_names[counter-1], 
        num_labels=3,
        label2id=class_to_idx,
        id2label=idx_to_class,
    ) # acc 

    device = 'cuda:0'
    model.to(device)
    
    train_path = pathlib.Path('../input/vgaf-dataset/Train_VGAF_audio-20220823T161233Z-001/Train_VGAF_audio')
    
    audio_proba = [] 
    
    test_labels = pd.read_csv('../input/5-fold/{}_test_fold.txt'.format(str(counter)), delimiter = ' ')
    
    for vid_name in tqdm(test_labels['Vid_name']): 
        vid_name += '.wav'
        audio_path = train_path/vid_name
        with torch.no_grad():
            input = torch.from_numpy(np.expand_dims(preprocess_function_eval(audio_path), 0)).to(device)
            output = model(input)
            audio_proba.append(softmax(output[0][0].cpu().numpy()))
    
    audio_proba = np.array(audio_proba)
    
    val_path = pathlib.Path('../input/vgaf-dataset/Val_VGAF_audio-20220823T161237Z-001/Val_VGAF_audio')
    
    audio_proba_val = [] 
    
    val_labels = pd.read_csv('../input/vgaf-features/Val_labels.txt', delimiter = ' ')
    
    for vid_name in tqdm(val_labels['Vid_name']): 
        vid_name += '.wav'
        audio_path = val_path/vid_name
        with torch.no_grad():
            input = torch.from_numpy(np.expand_dims(preprocess_function_eval(audio_path), 0)).to(device)
            output = model(input)
            audio_proba_val.append(softmax(output[0][0].cpu().numpy()))
            
    audio_proba_val = np.array(audio_proba_val)

    
    print('Coefficients selection has started')
    best_weights = [0, 0, 0]
    best_acc = 0

    for r in small_weights:
        for a in weights:
            for l in weights:
                y_pred = []
                for i in range(0, len(linsvc_proba)):
                    max_prob = np.argmax(r*rf_proba[i]+a*audio_proba[i]+l*linsvc_proba[i])
                    y_pred.append(max_prob)
                if metrics.accuracy_score(y_test, np.array(y_pred)) > best_acc:
                    best_acc = metrics.accuracy_score(y_test, np.array(y_pred))
                    best_weights[0] = r
                    best_weights[1] = a
                    best_weights[2] = l
                    print(best_acc, best_weights)

    print('Best weights of {}-fold are found, best accuracy is {}'.format(str(counter), str(best_acc)))
    test_weights.append(best_weights)
    all_test_accuracies.append(best_acc)
    
     
    y_pred_val = []
    for i in range(0, len(linsvc_proba_val)):
        max_prob = np.argmax(best_weights[0]*rf_proba_val[i]+best_weights[1]*audio_proba_val[i]+best_weights[2]*linsvc_proba_val[i])
        y_pred_val.append(max_prob)

    final_val_accuracy = metrics.accuracy_score(y_val, np.array(y_pred_val))
    print('{}-fold val accuracy is {}'.format(str(counter), str(final_val_accuracy)))
    all_val_accuracies.append(final_val_accuracy)
    
    counter += 1


1-fold has started
SVC is fitting
SVC test accuracy - 0.6904315196998124 and complete accuracy - 0.6904315196998124
SVC val accuracy - 0.689608636977058 and complete accuracy - 0.689608636977058
Random Forest is fitting
Random Forest test accuracy - 0.7298311444652908 and complete accuracy - 0.7298311444652908
Random Forest val accuracy - 0.6545209176788124 and complete accuracy - 0.6545209176788124
Audio model is fitting


  0%|          | 0/533 [00:00<?, ?it/s]

  0%|          | 0/766 [00:00<?, ?it/s]

Coefficients selection has started
0.6754221388367729 [0.0, 0.05, 0.05]
0.6866791744840526 [0.0, 0.05, 0.060000000000000005]
0.7073170731707317 [0.0, 0.05, 0.07]
0.726078799249531 [0.0, 0.05, 0.08000000000000002]
0.7448405253283302 [0.0, 0.05, 0.09000000000000001]
0.7523452157598499 [0.0, 0.060000000000000005, 0.11000000000000001]
0.7542213883677298 [0.01, 0.08000000000000002, 0.14]
0.7636022514071295 [0.02, 0.05, 0.07]
0.7654784240150094 [0.03, 0.05, 0.07]
0.7673545966228893 [0.04, 0.05, 0.07]
0.7711069418386491 [0.05, 0.05, 0.060000000000000005]
Best weights of 1-fold are found, best accuracy is 0.7711069418386491
1-fold val accuracy is 0.6518218623481782
2-fold has started
SVC is fitting
SVC test accuracy - 0.75 and complete accuracy - 0.75
SVC val accuracy - 0.6383265856950068 and complete accuracy - 0.6383265856950068
Random Forest is fitting
Random Forest test accuracy - 0.7218045112781954 and complete accuracy - 0.7218045112781954
Random Forest val accuracy - 0.6369770580296896 

  0%|          | 0/532 [00:00<?, ?it/s]

  0%|          | 0/766 [00:00<?, ?it/s]

Coefficients selection has started
0.7819548872180451 [0.0, 0.05, 0.05]
0.7875939849624061 [0.0, 0.05, 0.060000000000000005]
0.7894736842105263 [0.0, 0.060000000000000005, 0.07]
0.7913533834586466 [0.0, 0.19, 0.22000000000000003]
0.793233082706767 [0.01, 0.05, 0.05]
0.7951127819548872 [0.02, 0.09000000000000001, 0.09000000000000001]
0.7969924812030075 [0.05, 0.09000000000000001, 0.060000000000000005]
0.7988721804511278 [0.05, 0.1, 0.07]
0.8007518796992481 [0.08, 0.11000000000000001, 0.060000000000000005]
0.8026315789473685 [0.09, 0.12000000000000001, 0.060000000000000005]
Best weights of 2-fold are found, best accuracy is 0.8026315789473685
2-fold val accuracy is 0.5883940620782726
3-fold has started
SVC is fitting
SVC test accuracy - 0.5977443609022557 and complete accuracy - 0.5977443609022557
SVC val accuracy - 0.6761133603238867 and complete accuracy - 0.6761133603238867
Random Forest is fitting
Random Forest test accuracy - 0.5695488721804511 and complete accuracy - 0.569548872180

  0%|          | 0/532 [00:00<?, ?it/s]

  0%|          | 0/766 [00:00<?, ?it/s]

Coefficients selection has started
0.5883458646616542 [0.0, 0.05, 0.05]
0.5977443609022557 [0.0, 0.05, 0.060000000000000005]
0.599624060150376 [0.0, 0.05, 0.07]
0.6109022556390977 [0.0, 0.05, 0.08000000000000002]
0.6127819548872181 [0.0, 0.060000000000000005, 0.1]
0.6146616541353384 [0.01, 0.060000000000000005, 0.09000000000000001]
0.6165413533834586 [0.02, 0.08000000000000002, 0.12000000000000001]
0.618421052631579 [0.04, 0.13, 0.19]
Best weights of 3-fold are found, best accuracy is 0.618421052631579
3-fold val accuracy is 0.6531713900134952
4-fold has started
SVC is fitting
SVC test accuracy - 0.5451127819548872 and complete accuracy - 0.5451127819548872
SVC val accuracy - 0.6491228070175439 and complete accuracy - 0.6491228070175439
Random Forest is fitting
Random Forest test accuracy - 0.5037593984962406 and complete accuracy - 0.5037593984962406
Random Forest val accuracy - 0.6126855600539811 and complete accuracy - 0.6126855600539811
Audio model is fitting


  0%|          | 0/532 [00:00<?, ?it/s]

  0%|          | 0/766 [00:00<?, ?it/s]

Coefficients selection has started
0.5244360902255639 [0.0, 0.05, 0.05]
0.543233082706767 [0.0, 0.05, 0.07]
0.5469924812030075 [0.0, 0.05, 0.1]
0.5488721804511278 [0.0, 0.09000000000000001, 0.14]
0.5507518796992481 [0.0, 0.12000000000000001, 0.25000000000000006]
Best weights of 4-fold are found, best accuracy is 0.5507518796992481
4-fold val accuracy is 0.6437246963562753
5-fold has started
SVC is fitting
SVC test accuracy - 0.6372180451127819 and complete accuracy - 0.6372180451127819
SVC val accuracy - 0.6788124156545209 and complete accuracy - 0.6788124156545209
Random Forest is fitting
Random Forest test accuracy - 0.6240601503759399 and complete accuracy - 0.6240601503759399
Random Forest val accuracy - 0.6383265856950068 and complete accuracy - 0.6383265856950068
Audio model is fitting


  0%|          | 0/532 [00:00<?, ?it/s]

  0%|          | 0/766 [00:00<?, ?it/s]

Coefficients selection has started
0.6146616541353384 [0.0, 0.05, 0.05]
0.6259398496240601 [0.0, 0.05, 0.060000000000000005]
0.6353383458646616 [0.0, 0.05, 0.07]
0.6428571428571429 [0.0, 0.05, 0.08000000000000002]
0.6447368421052632 [0.0, 0.060000000000000005, 0.1]
0.6466165413533834 [0.0, 0.08000000000000002, 0.13]
0.6484962406015038 [0.03, 0.060000000000000005, 0.05]
0.650375939849624 [0.08, 0.19, 0.16000000000000003]
0.6522556390977443 [0.09, 0.19, 0.16000000000000003]
Best weights of 5-fold are found, best accuracy is 0.6522556390977443
5-fold val accuracy is 0.6248313090418354


In [70]:
rf_proba.shape, audio_proba.shape, linsvc_proba.shape

((533, 3), (533, 3), (533, 3))

____________________________

In [52]:
test_weights = np.array(test_weights)
mean_test_weights = np.mean(test_weights, axis=0)
mean_test_weights

array([0.054, 0.122, 0.144])

In [60]:
test_weights

array([[0.05, 0.05, 0.06],
       [0.09, 0.12, 0.06],
       [0.04, 0.13, 0.19],
       [0.  , 0.12, 0.25],
       [0.09, 0.19, 0.16]])

In [None]:
''' remove quotes in txt
    dir = r'C:\Users\User\Downloads\5_fold/'

    for i in range(1, 6):
        with open(dir + str(i) + '_train_fold.txt', mode='r') as txt:
            old_data = txt.read()

        new_data = old_data.replace("\'", '')

        with open (dir + str(i) + '_train_fold.txt', 'w') as txt:
            txt.write(new_data)
'''