In [16]:
import numpy as np
import pandas as pd
import os
import pickle
import uuid
from sklearn.manifold import TSNE

In [17]:
# Relative locations of the two dataset versions used by this notebook:
# "v2" is assigned to dataset_directory and "v3" to output_directory.
dataset_directory, output_directory = (
    os.path.join("..", "api", "dataset", version) for version in ("v2", "v3")
)

In [18]:
def shortid(num):
    """Return a list of ``num`` distinct 8-character identifier strings.

    Each identifier is the first 8 characters of a random UUID4 string, i.e.
    8 hexadecimal digits. Truncating a UUID this far makes collisions
    possible, so repeated values are discarded and redrawn until ``num``
    different identifiers have been collected.

    Args:
        num: Number of identifiers to generate. Values <= 0 give an empty list.

    Returns:
        list of str: ``num`` unique identifiers, in the order they were drawn.
    """
    unique_ids = {}
    while len(unique_ids) < num:
        # A dict preserves insertion order and absorbs a repeated key silently,
        # so a colliding draw simply does not grow the collection.
        unique_ids.setdefault(str(uuid.uuid4())[:8], None)
    return list(unique_ids)
def label_encode(string):
    """Return ``string`` with two substitutions applied in order.

    First every backslash-apostrophe pair becomes a dot, then every space
    becomes an underscore. All other characters are left untouched.
    """
    substitutions = (("\\'", "."), (" ", "_"))
    encoded = string
    for old, new in substitutions:
        encoded = encoded.replace(old, new)
    return encoded

In [31]:
# Load the "birds" training split from the input dataset folder.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only point this at a train.pickle that comes from a trusted source.
with open(os.path.join(dataset_directory, "birds", "train.pickle"), "rb") as f:
    X_train, y_train, feature_names, label_names = pickle.load(f)

# Create the destination folder up front: DataFrame.to_pickle does not create
# missing parent directories, so the cell would otherwise fail on a fresh checkout.
train_output_directory = os.path.join(output_directory, "birds", "train")
os.makedirs(train_output_directory, exist_ok=True)

# One random short ID per row; the same IDs key both the feature and label tables.
index = shortid(X_train.shape[0])

# Densify the feature matrix into a DataFrame. Each column is named after the
# first element of the corresponding feature_names entry.
X_train = pd.DataFrame(
    X_train.todense(),
    columns=[feature_names[x][0] for x in range(X_train.shape[1])],
    index=index,
)
X_train["id"] = index  # also expose the row ID as a regular column
X_train.to_pickle(os.path.join(train_output_directory, "feature.pkl"))

# Same treatment for the label matrix; label names go through label_encode
# before being used as column names.
y_train = pd.DataFrame(
    y_train.todense(),
    columns=[label_encode(label_names[x][0]) for x in range(y_train.shape[1])],
    index=index,
)
y_train["id"] = index
y_train.to_pickle(os.path.join(train_output_directory, "label.pkl"))

In [32]:
# t-SNE embedding of the feature columns (the "id" column is dropped first
# because it is an identifier, not a feature).
# random_state is pinned so that re-running the notebook writes the same
# coordinates instead of a different random layout each time.
t_sne = TSNE(random_state=0)
t_sne.fit(X_train.drop("id", axis=1))

# Store the embedding as x/y coordinates keyed by the same row IDs as the features.
t_sne_df = pd.DataFrame(t_sne.embedding_, columns=["x", "y"], index=index)
t_sne_df["id"] = index
t_sne_df.to_pickle(os.path.join(output_directory, "birds", "train", "tsne_feature.pkl"))



In [33]:
# t-SNE embedding of the label columns (the "id" column is dropped first
# because it is an identifier, not a label).
# random_state is pinned so that re-running the notebook writes the same
# coordinates instead of a different random layout each time.
t_sne = TSNE(random_state=0)
t_sne.fit(y_train.drop("id", axis=1))

# Store the embedding as x/y coordinates keyed by the same row IDs as the labels.
t_sne_df = pd.DataFrame(t_sne.embedding_, columns=["x", "y"], index=index)
t_sne_df["id"] = index
t_sne_df.to_pickle(os.path.join(output_directory, "birds", "train", "tsne_label.pkl"))



In [29]:
X_train.drop(labels=["id"], axis="columns").shape  # (rows, feature columns) once the "id" column is removed

(322, 260)

In [30]:
y_train.drop(labels=["id"], axis="columns").shape  # (rows, label columns) once the "id" column is removed

(322, 19)

In [15]:
# One-row summary of the "birds" dataset; all counts exclude the "id" column.
feature_columns = X_train.drop(labels=["id"], axis="columns")
label_columns = y_train.drop(labels=["id"], axis="columns")
instance_count, feature_count = feature_columns.shape
label_count = label_columns.shape[1]

dataset_summary = pd.DataFrame([{
    "name": "birds",
    "domain": "audio",
    "instances": instance_count,
    "labels": label_count,
    "features": feature_count,
}])
dataset_summary.to_pickle(os.path.join(output_directory, "available.pkl"))

Unnamed: 0,audio-ssd1,audio-ssd2,audio-ssd3,audio-ssd4,audio-ssd5,audio-ssd6,audio-ssd7,audio-ssd8,audio-ssd9,audio-ssd10,...,cluster100,segments,mean_rect_width,std_rect_width,mean_rect_height,std_rect_height,mean_rect_volume,std_rect_volume,hasSegments,location
56e868a9,0.016521,0.039926,0.089632,0.134119,0.170470,0.176872,0.171546,0.182392,0.162482,0.159083,...,0.0,13.0,16.384615,20.617394,46.769231,71.863118,788.923077,1761.802180,1.0,2.0
d45bfb1b,0.006600,0.035984,0.089956,0.123214,0.172273,0.177068,0.165507,0.179655,0.161744,0.163678,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,2.0
abe225ce,0.006894,0.017722,0.048062,0.065802,0.103443,0.091397,0.084931,0.088666,0.075676,0.074408,...,0.0,2.0,24.000000,2.828427,28.000000,1.414214,674.000000,113.137085,1.0,2.0
47004b62,0.031046,0.127675,0.221428,0.272707,0.358743,0.349389,0.316029,0.330656,0.310752,0.306288,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,2.0
e0fae88f,0.064721,0.226644,0.304482,0.274662,0.346980,0.334063,0.307223,0.324666,0.297070,0.292258,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18d9fccf,0.009148,0.009075,0.015139,0.020908,0.037890,0.036355,0.038220,0.044481,0.041390,0.044327,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,6.0
6d073318,0.025508,0.011626,0.023700,0.030874,0.047864,0.040987,0.041066,0.045088,0.040161,0.044836,...,0.0,4.0,39.750000,17.571283,25.250000,18.567445,1085.000000,1177.918503,1.0,6.0
cd99c51f,0.332050,0.053668,0.123005,0.142725,0.178769,0.165174,0.161457,0.162597,0.124231,0.130416,...,0.5,2.0,38.000000,11.313708,13.500000,0.707107,517.000000,179.605122,1.0,6.0
c3fd4aaa,0.009871,0.014142,0.030270,0.044325,0.065054,0.060812,0.062368,0.061929,0.055983,0.057395,...,0.0,8.0,75.750000,141.370082,43.250000,80.693866,4249.625000,13831.376717,1.0,6.0
