In [1]:
import numpy as np
import pandas as pd
import pymongo
import os
import pickle
import uuid
import ujson
from sklearn.manifold import TSNE

In [2]:
# Client for the MongoDB server on localhost; the loader functions below
# reach their databases through this object.
MONGODB_URI = "mongodb://localhost:27017/"
myclient = pymongo.MongoClient(MONGODB_URI)

In [3]:
# Folder holding the pickled datasets: <parent of the working dir>/dataset/v2.
dataset_directory = os.path.join(os.pardir, "dataset", "v2")
def shortid(num):
    """Return ``num`` distinct 8-character hexadecimal identifiers.

    Each identifier is the first 8 characters of a random UUID4 string.
    Only 32 random bits are kept, so two draws can collide; because the
    ids are used as primary keys, a colliding draw is discarded and
    re-drawn so that the returned list never contains duplicates.

    Parameters
    ----------
    num : int
        Number of identifiers to generate. Zero or a negative value
        yields an empty list.

    Returns
    -------
    list of str
        ``num`` unique 8-character strings, in generation order.
    """
    seen = set()
    ids = []
    while len(ids) < num:
        candidate = str(uuid.uuid4())[:8]
        if candidate in seen:
            # Truncated UUID collided with an earlier one in this batch.
            continue
        seen.add(candidate)
        ids.append(candidate)
    return ids
def label_encode(string):
    """Normalise a raw label name.

    Every backslash-apostrophe pair becomes a dot and every space becomes
    an underscore; all other characters are returned unchanged.
    """
    substitutions = (
        ("\\'", "."),
        (" ", "_"),
    )
    encoded = string
    for old, new in substitutions:
        encoded = encoded.replace(old, new)
    return encoded

In [4]:
def _tsne_embedding_frame(data, index=None, random_state=None):
    """Fit a t-SNE on ``data`` and return its embedding as a v1/v2 DataFrame."""
    data = np.asarray(data)
    # scikit-learn requires perplexity < n_samples; the default of 30 would be
    # rejected for small inputs (e.g. a handful of unique label combinations).
    perplexity = min(30.0, max(data.shape[0] - 1, 1))
    t_sne = TSNE(perplexity=perplexity, random_state=random_state)
    t_sne.fit(data)
    return pd.DataFrame(t_sne.embedding_, columns=["v1", "v2"], index=index)


def dataset_to_mongodb(dataset_name, dataset_type, random_state=None):
    """Load one pickled dataset split and store it, with t-SNE views, in MongoDB.

    The pickle ``<dataset_directory>/<dataset_name>/<dataset_type>.pickle`` is
    expected to hold a 4-tuple ``(X, y, feature_names, label_names)``.
    Everything is written to the database named
    ``<dataset_name>_<dataset_type>`` in these collections:

    * ``features``                - one document per sample, one field per feature
    * ``labels``                  - one document per sample, one field per label
    * ``tsne_features``           - t-SNE embedding (v1, v2) of the features
    * ``tsne_labels``             - t-SNE embedding (v1, v2) of the label vectors
    * ``tsne_labels_combination`` - t-SNE embedding of the distinct label
      combinations; ``_member`` lists the ids of the samples carrying each one

    Documents in the first four collections share the same generated ``_id``.

    Parameters
    ----------
    dataset_name : str
        Sub-folder of ``dataset_directory`` that contains the pickle.
    dataset_type : str
        Pickle file stem; the notebook uses ``"train"`` and ``"test"``.
    random_state : int or None, optional
        Seed forwarded to every t-SNE fit. ``None`` (default) keeps the
        embeddings non-deterministic, as before.

    Notes
    -----
    Documents are only ever inserted, so calling this twice for the same
    split stores the data twice.
    """
    database = myclient[dataset_name + "_" + dataset_type]

    # open file
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    pickle_path = os.path.join(dataset_directory, dataset_name, dataset_type + ".pickle")
    with open(pickle_path, 'rb') as f:
        X_train, y_train, feature_names, label_names = pickle.load(f)

    # assign ids
    index = shortid(X_train.shape[0])

    # insert feature data
    # assumes X_train / y_train are SciPy sparse matrices - TODO confirm.
    # toarray() gives a plain ndarray rather than the np.matrix from todense().
    feature_values = X_train.toarray()
    feature_columns = [feature_names[x][0] for x in range(feature_values.shape[1])]
    features = pd.DataFrame(feature_values, columns=feature_columns, index=index)
    features["_id"] = index
    database["features"].insert_many(features.to_dict('records'))

    # insert label data
    label_values = y_train.toarray()
    label_columns = [label_encode(label_names[x][0]) for x in range(label_values.shape[1])]
    labels = pd.DataFrame(label_values, columns=label_columns, index=index)
    labels["_id"] = index
    database["labels"].insert_many(labels.to_dict('records'))

    # insert feature tsne
    t_sne_df = _tsne_embedding_frame(feature_values, index=index, random_state=random_state)
    t_sne_df["_id"] = index
    database["tsne_features"].insert_many(t_sne_df.to_dict('records'))

    # insert label tsne
    t_sne_df = _tsne_embedding_frame(label_values, index=index, random_state=random_state)
    t_sne_df["_id"] = index
    database["tsne_labels"].insert_many(t_sne_df.to_dict('records'))

    # insert label combination tsne
    # return_inverse maps every sample to the row of y_unique it equals, so
    # the members of each combination are gathered in one pass instead of
    # re-scanning all samples for every unique row.
    y_unique, inverse = np.unique(label_values.astype(int), axis=0, return_inverse=True)
    member_list = [[] for _ in range(len(y_unique))]
    for sample_id, combination in zip(index, np.ravel(inverse)):
        member_list[combination].append(sample_id)
    t_sne_df = _tsne_embedding_frame(y_unique, random_state=random_state)
    t_sne_df["_member"] = member_list
    database["tsne_labels_combination"].insert_many(t_sne_df.to_dict('records'))

In [5]:
def label_to_mongodb(dataset_name):
    """Register a dataset and its encoded label names in the config database.

    Reads ``train.pickle`` of ``dataset_name``, passes the first element of
    every ``label_names`` entry through ``label_encode``, prints the resulting
    list and inserts ``{"dataset_name", "label_names"}`` as one document.
    """
    pickle_path = os.path.join(dataset_directory, dataset_name, "train.pickle")
    with open(pickle_path, 'rb') as f:
        # Only the label names are needed; the other three items are discarded.
        _, _, _, label_names = pickle.load(f)

    encoded_labels = [label_encode(entry[0]) for entry in label_names]

    print(encoded_labels)

    document = {"dataset_name": dataset_name, "label_names": encoded_labels}
    myclient["config"]["avaliable_dataset"].insert_one(document)

In [9]:
# Store both splits of the bibtex dataset.
for split in ("train", "test"):
    dataset_to_mongodb("bibtex", split)



In [6]:
# Store both splits of the emotions dataset.
for split in ("train", "test"):
    dataset_to_mongodb("emotions", split)



In [7]:
# Register the emotions label names in the config database.
label_to_mongodb(dataset_name="emotions")

['amazed-suprised', 'happy-pleased', 'relaxing-calm', 'quiet-still', 'sad-lonely', 'angry-aggresive']


In [41]:
# Register the bibtex label names in the config database.
label_to_mongodb(dataset_name="bibtex")

['TAG_2005', 'TAG_2006', 'TAG_2007', 'TAG_agdetection', 'TAG_algorithms', 'TAG_amperometry', 'TAG_analysis', 'TAG_and', 'TAG_annotation', 'TAG_antibody', 'TAG_apob', 'TAG_architecture', 'TAG_article', 'TAG_bettasplendens', 'TAG_bibteximport', 'TAG_book', 'TAG_children', 'TAG_classification', 'TAG_clustering', 'TAG_cognition', 'TAG_collaboration', 'TAG_collaborative', 'TAG_community', 'TAG_competition', 'TAG_complex', 'TAG_complexity', 'TAG_compounds', 'TAG_computer', 'TAG_computing', 'TAG_concept', 'TAG_context', 'TAG_cortex', 'TAG_critical', 'TAG_data', 'TAG_datamining', 'TAG_date', 'TAG_design', 'TAG_development', 'TAG_diffusion', 'TAG_diplomathesis', 'TAG_disability', 'TAG_dynamics', 'TAG_education', 'TAG_elearning', 'TAG_electrochemistry', 'TAG_elisa', 'TAG_empirical', 'TAG_energy', 'TAG_engineering', 'TAG_epitope', 'TAG_equation', 'TAG_evaluation', 'TAG_evolution', 'TAG_fca', 'TAG_folksonomy', 'TAG_formal', 'TAG_fornepomuk', 'TAG_games', 'TAG_granular', 'TAG_graph', 'TAG_hci', 'TA

In [8]:
# Store both splits of the delicious dataset, then register its label names.
for split in ("train", "test"):
    dataset_to_mongodb("delicious", split)
label_to_mongodb("delicious")



['TAG_.imported', 'TAG_.net', 'TAG_2.0', 'TAG_2007', 'TAG_3d', 'TAG_??', 'TAG_???', 'TAG_????', 'TAG_academia', 'TAG_academic', 'TAG_access', 'TAG_accessibility', 'TAG_accessories', 'TAG_accounts', 'TAG_actionscript', 'TAG_activism', 'TAG_ad', 'TAG_addon', 'TAG_addons', 'TAG_admin', 'TAG_administration', 'TAG_adobe', 'TAG_ads', 'TAG_adsense', 'TAG_adult', 'TAG_advertising', 'TAG_advice', 'TAG_affiliate', 'TAG_agencies', 'TAG_agency', 'TAG_aggregator', 'TAG_agile', 'TAG_ai', 'TAG_air', 'TAG_airfare', 'TAG_airline', 'TAG_airlines', 'TAG_ajax', 'TAG_album', 'TAG_algorithm', 'TAG_algorithms', 'TAG_alternative', 'TAG_amazing', 'TAG_amazon', 'TAG_america', 'TAG_analysis', 'TAG_analytics', 'TAG_angst', 'TAG_animals', 'TAG_animation', 'TAG_anime', 'TAG_anonymous', 'TAG_anthropology', 'TAG_apache', 'TAG_apartment', 'TAG_api', 'TAG_app', 'TAG_apparel', 'TAG_apple', 'TAG_application', 'TAG_applications', 'TAG_apps', 'TAG_architecture', 'TAG_archive', 'TAG_archives', 'TAG_art', 'TAG_arte', 'TAG_ar