In [1]:
import numpy  as np 
import pandas as pd
from processing import *
from scipy.stats import truncnorm
from fastai.tabular.all import *

In [2]:
def get_truncated_normal(mean=265, sd=265, low=0, upp=500):
    return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

In [3]:
fs = 256
interval = 20
k = 10

filenames = ['avatar1', 'avengers1', 'bbc1', 'bear1', 'bighero1', 'creed1', 'edgeoftmr11', 'gunviolence1', 'ironman1', 'joe1', 'lex1', 'vox1']
files = [pd.read_csv(('datasets/{}.csvprocessed.csv').format(name)) for name in filenames]

data_sets = pd.concat(files, ignore_index=True).dropna()

In [4]:
labels = data_sets["Interest"]
    
labels_norm = [1 if x == 4 or x == 5 else 0 for x in labels]

data_sets["Interest"] = labels_norm

interested = np.array(data_sets[data_sets['Interest'] == 1])
uninterested = np.array(data_sets[data_sets['Interest'] == 0])

# print(int(len(interested)/(fs*interval))*fs*interval)
# print(int(len(interested)/(fs*interval))) #53
# print(int(len(uninterested)/(fs*interval))) #41


In [5]:
# interestedN = np.array(np.array_split(interested[:int(len(interested)/(fs*interval))*fs*interval], int((len(interested)/(fs*interval))), axis=0))
interestedN1 = np.array(np.array_split(np.array(np.array_split(interested[:int(len(interested)/(fs*interval))*fs*interval], int((len(interested)/(fs*interval))), axis=0)), k, axis=1)).reshape(10*-1, 512, 6)[:, :, 1:5]

sortedN1 = interestedN1[np.argsort(interestedN1.reshape(-1, 512*4).sum(axis=1))]

print(sortedN1.shape)

mean = np.mean(sortedN1.reshape(-1, 512*4))
std = np.std(sortedN1.reshape(-1, 512*4))

mean_indx = find_nearest(np.mean(sortedN1.reshape(-1, 512*4), axis=1), mean)
std_indx = find_nearest(np.mean(sortedN1.reshape(-1, 512*4), axis=1), std)

r = get_truncated_normal(mean=mean_indx, sd=int(std_indx/2), low=0, upp=len(sortedN1))
print(int(r.rvs()))

new_pos_segments = []
for n in range(500):
    ind = [interestedN1[int(r.rvs())] for x in range(10)]
    new_pos_segments.append(np.concatenate(ind, axis=0))

new_pos_segments = np.array(new_pos_segments)
# print(new_pos_segments.shape)

(530, 512, 4)
52


In [6]:
uninterestedN1 = np.array(np.array_split(np.array(np.array_split(uninterested[:int(len(uninterested)/(fs*interval))*fs*interval], int((len(uninterested)/(fs*interval))), axis=0)), k, axis=1)).reshape(10*-1, 512, 6)[:, :, 1:5]

sortedUN1 = uninterestedN1[np.argsort(uninterestedN1.reshape(-1, 512*4).sum(axis=1))]

print(sortedUN1.shape)

Umean = np.mean(sortedUN1.reshape(-1, 512*4))
Ustd = np.std(sortedUN1.reshape(-1, 512*4))

Umean_indx = find_nearest(np.mean(sortedUN1.reshape(-1, 512*4), axis=1), Umean)
Ustd_indx = find_nearest(np.mean(sortedUN1.reshape(-1, 512*4), axis=1), Ustd)

Ur = get_truncated_normal(mean=Umean_indx, sd=int(Ustd_indx/2), low=0, upp=len(sortedUN1))
print(int(Ur.rvs()))


new_neg_segments = []
for n in range(500):
    ind = [uninterestedN1[int(Ur.rvs())] for x in range(10)]
    new_neg_segments.append(np.concatenate(ind, axis=0))

new_neg_segments = np.array(new_neg_segments)
# print(new_neg_segments.shape)

(410, 512, 4)
6


In [7]:
augmented_batches = np.concatenate((new_neg_segments, new_pos_segments), axis=0).reshape(-1*5120, 4, order='F')

print(augmented_batches.shape)

(5120000, 4)


In [8]:
labels = np.concatenate((np.zeros(500), np.full(500, 1)))
# print(labels.shape)

features = PSD(augmented_batches, fs, filtering=True)

# print(features)

normalized = pd.DataFrame(descriptive_stats(features))
normalized["label"] = labels

print(normalized.head())

procs = [Categorify, FillMissing, Normalize]
dls = TabularDataLoaders.from_df(df = normalized, procs=procs, cont_names=list(normalized.columns)[:-1], 
                                 y_names="label", y_block=CategoryBlock, bs=64)

  0%|          | 0/4 [00:00<?, ?it/s]

(5120000, 4)


100%|██████████| 4/4 [00:14<00:00,  3.62s/it]
100%|██████████| 1000/1000 [00:12<00:00, 77.51it/s]


          0         1         2         3         4         5         6  \
0 -1.012846 -1.183549 -0.952168 -1.057001 -0.975189 -1.065700  0.351425   
1 -0.786426 -0.908924  0.654961 -0.871849 -1.076684 -1.190552 -0.758977   
2 -1.096515 -1.338745  2.097281 -1.279174 -1.106541 -1.200076 -0.854891   
3 -1.027439 -0.970438  0.559958 -1.027470 -1.031350 -1.013526  0.780325   
4 -1.176215 -1.387097 -1.198844 -1.260419 -1.228658 -1.410788 -1.611914   

          7         8         9  ...        87        88        89        90  \
0 -1.032129 -1.255045 -1.248234  ... -1.120615 -1.308857 -1.247570 -1.218510   
1 -1.127477 -0.837460 -0.978745  ... -0.792518 -1.022640 -1.007990  2.090873   
2 -1.110816 -1.020694 -1.074304  ... -1.185136 -1.150918 -1.203454  0.747225   
3 -1.079907 -0.974324 -1.004701  ... -1.188314 -1.101107 -1.161597  2.005377   
4 -1.277766 -1.009178 -1.265779  ... -1.046440 -0.682643 -0.786641 -0.111249   

         91        92        93        94        95  label  
0 -1.18

In [9]:
f1_score = F1Score()
learn = tabular_learner(dls, metrics=[accuracy])


In [10]:
learn.fit_one_cycle(30) # cbs=EarlyStoppingCallback(min_delta=0.1, patience=2)
# learn.recorder.plot_losses()
# algo = svm_model(normalized, labels)
# search = random_search_svm(normalized, labels)
# print(search.best_params_)


epoch,train_loss,valid_loss,accuracy,time
0,0.733669,0.696128,0.47,00:00
1,0.72736,0.709919,0.485,00:00
2,0.709354,0.73045,0.445,00:00
3,0.690585,0.741675,0.46,00:00
4,0.669822,0.76908,0.45,00:00
5,0.647714,0.779813,0.46,00:00
6,0.634589,0.80568,0.45,00:00
7,0.612246,0.820162,0.47,00:00
8,0.592004,0.881848,0.44,00:00
9,0.569757,0.931046,0.465,00:00
