In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import uuid
from sklearn.manifold import TSNE

In [2]:
dataset_directory=os.path.join("..","api","dataset","v2")
output_directory=os.path.join("..","api","dataset","v3")

In [5]:
def shortid(num):
    """Return a list of ``num`` distinct 8-character random id strings.

    Each id is the first 8 characters of a random UUID4 string. Truncation
    means two draws can collide, and the ids are used as DataFrame row
    labels, so duplicates are rejected and redrawn.

    Parameters
    ----------
    num : int
        Number of ids to generate. Zero or a negative value gives ``[]``.

    Returns
    -------
    list of str
        ``num`` unique 8-character strings, in generation order.
    """
    ids = []
    seen = set()
    while len(ids) < num:
        candidate = str(uuid.uuid4())[:8]
        # Skip the (rare) truncated-prefix collision instead of emitting a duplicate id.
        if candidate in seen:
            continue
        seen.add(candidate)
        ids.append(candidate)
    return ids
def label_encode(string):
    """Turn a raw label name into an identifier-friendly string.

    Every backslash-apostrophe pair (``\\'``) becomes ``.`` and every
    space becomes ``_``. All other characters are left as they are.
    """
    # First pass: collapse the escaped apostrophe sequence into a dot.
    without_escaped_quote = ".".join(string.split("\\'"))
    # Second pass: spaces become underscores.
    return "_".join(without_escaped_quote.split(" "))

In [4]:
# Load the training split: a 4-tuple of feature matrix, label matrix,
# feature name entries and label name entries.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
train_pickle_path = os.path.join(dataset_directory, "birds", "train.pickle")
with open(train_pickle_path, "rb") as train_file:
    X_train, y_train, feature_names, label_names = pickle.load(train_file)

In [5]:
# Convert the loaded training matrices into labelled DataFrames and persist them.
#
# NOTE: this cell rebinds X_train / y_train from the loaded matrices to
# DataFrames, so it cannot be re-run without re-running the load cell first
# (a DataFrame has no .todense()).
train_output_directory = os.path.join(output_directory, "birds", "train")
# to_pickle does not create missing parent directories; make sure the target exists.
if not os.path.isdir(train_output_directory):
    os.makedirs(train_output_directory)

# One random short id per row, shared by the feature and label frames so the
# two can be matched row-for-row.
index = shortid(X_train.shape[0])

X_train = pd.DataFrame(
    X_train.todense(),
    columns=[feature_names[x][0] for x in range(X_train.shape[1])],
    index=index,
)
# The id is stored both as the row label and as an explicit "id" column.
X_train["id"] = index
X_train.to_pickle(os.path.join(train_output_directory, "feature.pkl"))

y_train = pd.DataFrame(
    y_train.todense(),
    columns=[label_encode(label_names[x][0]) for x in range(y_train.shape[1])],
    index=index,
)
y_train["id"] = index
y_train.to_pickle(os.path.join(train_output_directory, "label.pkl"))

In [6]:
# 2-D t-SNE embeddings of the training features and of the training labels,
# each saved as a frame with columns "x", "y" and "id".
#
# t-SNE is stochastic: a fixed random_state makes the saved embeddings
# reproducible across kernel restarts.
TSNE_RANDOM_STATE = 42

for source_frame, output_name in ((X_train, "tsne_feature.pkl"), (y_train, "tsne_label.pkl")):
    # Take the row ids from the frame being embedded rather than from the global
    # `index`, which another cell reassigns; this keeps ids and rows in sync
    # regardless of cell execution order.
    row_ids = list(source_frame.index)
    t_sne = TSNE(random_state=TSNE_RANDOM_STATE)
    embedding = t_sne.fit_transform(source_frame.drop("id", axis=1))
    t_sne_df = pd.DataFrame(embedding, columns=["x", "y"], index=row_ids)
    t_sne_df["id"] = row_ids
    t_sne_df.to_pickle(os.path.join(output_directory, "birds", "train", output_name))



In [29]:
# Shape of the training features without the helper "id" column:
# (number of instances, number of features).
X_train.drop(labels=["id"], axis="columns").shape

(322, 260)

In [30]:
# Shape of the training labels without the helper "id" column:
# (number of instances, number of labels).
y_train.drop(labels=["id"], axis="columns").shape

(322, 19)

In [15]:
# One-row catalogue describing the dataset, written to available.pkl.
# Counts come from the training frames with the helper "id" column removed.
train_features_only = X_train.drop("id", axis=1)
train_labels_only = y_train.drop("id", axis=1)

dataset_summary = {
    "name": "birds",
    "domain": "audio",
    "instances": train_features_only.shape[0],
    "labels": train_labels_only.shape[1],
    "features": train_features_only.shape[1],
}
pd.DataFrame([dataset_summary]).to_pickle(os.path.join(output_directory, "available.pkl"))

In [10]:
# Attribute table: one row per feature with its name and a simplified type.
attribute = pd.DataFrame(feature_names, columns=["attribute", "type"])
# Anything whose raw type is not exactly "NUMERIC" is recorded as "nominal".
attribute["type"] = [
    "numeric" if raw_type == "NUMERIC" else "nominal"
    for raw_type in attribute["type"]
]
attribute.to_pickle(os.path.join(output_directory, "birds", "attribute.pkl"))

In [20]:
# Save the encoded label names (first element of each label_names entry) as a Series.
encoded_label_names = [label_encode(entry[0]) for entry in label_names]
pd.Series(encoded_label_names).to_pickle(os.path.join(output_directory, "birds", "labels.pkl"))

In [40]:
# Number of labels set on each training instance, as JSON keyed by row id.
labels_per_instance = y_train.drop("id", axis=1).sum(axis=1)
labels_per_instance.to_json(orient="index")

'{"03e9467e":2,"f0f1dd56":0,"7be96912":1,"fce2a4fd":0,"75fc09f9":0,"c6e6798a":0,"8be5880d":2,"dc9f517b":0,"f0305e2c":0,"c89ea271":0,"17d7eac5":1,"0f747ef5":1,"42e0873e":2,"2f3ad96f":0,"1115fc94":0,"0189a545":1,"ab92e77c":0,"b3bf5f30":1,"dd5650c8":0,"44dc157e":1,"033bf735":2,"528f20eb":2,"a087e8c7":3,"71548f84":2,"5e519ef6":0,"8d949ee3":1,"8a4c4747":0,"94628b0e":0,"20ec9e67":1,"4e2cd74e":1,"d6b1b041":1,"58701f90":0,"49b574f1":0,"88c16678":1,"5120ca55":0,"a83bf0c8":3,"7fad416e":1,"f0baa825":0,"c81d2c01":0,"ad72d873":2,"0798768f":1,"49cabec2":0,"132fba18":0,"1467d5d7":0,"bb7da2dd":2,"6cfb9955":2,"e76eddd3":1,"61fde72a":2,"f8178e9b":2,"5a8c85c9":0,"efdcf3c4":3,"a0825560":0,"18e0e657":2,"b6c158f9":1,"a20496a3":2,"22aa92c2":0,"f87d61dd":0,"ccdbdb18":1,"54f3be36":2,"d904faec":1,"36d5c5ed":0,"a9b08b27":0,"8a133d3e":0,"3718c4e3":3,"77a30f61":0,"4790a399":1,"c82a814a":3,"61b5634f":0,"360fa278":0,"18255ef8":1,"a2358e8f":0,"23e44b30":0,"f9136dcf":0,"bdff3917":0,"6d0782b6":3,"7122a0fb":3,"c7e3fb9a"

In [39]:
# Column sums of the training labels (one total per label), as JSON keyed by label name.
# NOTE(review): the recorded run of this cell raised a TypeError inside to_json; the cause
# is not visible in this notebook. Confirm against the installed pandas version.
y_train.drop("id",axis=1).sum(axis=0).to_json(orient="index")

TypeError: to_dict() got an unexpected keyword argument 'orient'

In [3]:
# Load the test split: a 4-tuple of feature matrix, label matrix,
# feature name entries and label name entries.
# NOTE: this rebinds feature_names / label_names, which the train cells also use.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
test_pickle_path = os.path.join(dataset_directory, "birds", "test.pickle")
with open(test_pickle_path, "rb") as test_file:
    X_test, y_test, feature_names, label_names = pickle.load(test_file)

In [6]:
# Convert the loaded test matrices into labelled DataFrames and persist them.
#
# NOTE: this cell rebinds X_test / y_test from the loaded matrices to
# DataFrames, so it cannot be re-run without re-running the load cell first
# (a DataFrame has no .todense()).
test_output_directory = os.path.join(output_directory, "birds", "test")
# to_pickle does not create missing parent directories; make sure the target exists.
if not os.path.isdir(test_output_directory):
    os.makedirs(test_output_directory)

# One random short id per row, shared by the feature and label frames so the
# two can be matched row-for-row. This rebinds the global `index`.
index = shortid(X_test.shape[0])

X_test = pd.DataFrame(
    X_test.todense(),
    columns=[feature_names[x][0] for x in range(X_test.shape[1])],
    index=index,
)
# The id is stored both as the row label and as an explicit "id" column.
X_test["id"] = index
X_test.to_pickle(os.path.join(test_output_directory, "feature.pkl"))

y_test = pd.DataFrame(
    y_test.todense(),
    columns=[label_encode(label_names[x][0]) for x in range(y_test.shape[1])],
    index=index,
)
y_test["id"] = index
y_test.to_pickle(os.path.join(test_output_directory, "label.pkl"))

In [7]:
# 2-D t-SNE embeddings of the test features and of the test labels,
# each saved as a frame with columns "x", "y" and "id".
#
# t-SNE is stochastic: a fixed random_state makes the saved embeddings
# reproducible across kernel restarts.
TSNE_RANDOM_STATE = 42

for source_frame, output_name in ((X_test, "tsne_feature.pkl"), (y_test, "tsne_label.pkl")):
    # Take the row ids from the frame being embedded rather than from the global
    # `index`, which another cell reassigns; this keeps ids and rows in sync
    # regardless of cell execution order.
    row_ids = list(source_frame.index)
    t_sne = TSNE(random_state=TSNE_RANDOM_STATE)
    embedding = t_sne.fit_transform(source_frame.drop("id", axis=1))
    t_sne_df = pd.DataFrame(embedding, columns=["x", "y"], index=row_ids)
    t_sne_df["id"] = row_ids
    t_sne_df.to_pickle(os.path.join(output_directory, "birds", "test", output_name))

