In [34]:
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import os
import scipy.sparse as sp
import graphviz

In [33]:
def add_to_path(directory):
    """Append ``directory`` to the process ``PATH`` unless it is already listed.

    The check makes the cell idempotent: re-running it does not keep growing
    ``PATH`` with duplicate entries.  A missing or empty ``PATH`` is treated
    as having no entries.
    """
    current = os.environ.get("PATH", "")
    entries = current.split(os.pathsep) if current else []
    if directory not in entries:
        os.environ["PATH"] = os.pathsep.join(entries + [directory])


# Directory holding the Graphviz executables.  The default is the original
# machine-specific location; set the GRAPHVIZ_BIN environment variable to
# point somewhere else without editing the notebook.
GRAPHVIZ_BIN = os.environ.get(
    "GRAPHVIZ_BIN",
    'C:\\Users\\Vanda\\Downloads\\graphviz-2.38\\release\\bin',
)
add_to_path(GRAPHVIZ_BIN)

In [10]:
# Threshold tag; a string because it is concatenated into the word-concept
# directory name below.
THD = "40"
# Value passed as ``test_size`` to train_test_split.
TEST_SIZE = 0.3
EMBEDDING = "../../data/sparse_matrices/word_base/embeddings/filtered/glove300d_l_0.3.emb_f_microsoft_concept_graph_w_10.json.npz"

# Embedding file name with everything after its last "." removed.
_embedding_file = os.path.basename(EMBEDDING).strip()
EMBEDDING_NAME = embedding_name = ".".join(_embedding_file.split(".")[:-1])

# Text after the last "_f_" in the embedding name, tagged with the threshold.
WORD_CONCEPT_NAME = "{}_t{}".format(EMBEDDING_NAME.strip().split("_f_")[-1], THD)
WORD_CONCEPT = os.path.join(
    "../../data/sparse_matrices/word_concept/",
    EMBEDDING_NAME,
    WORD_CONCEPT_NAME,
    "word_concept_mtx.npz",
)

print(EMBEDDING, WORD_CONCEPT)

../../data/sparse_matrices/word_base/embeddings/filtered/glove300d_l_0.3.emb_f_microsoft_concept_graph_w_10.json.npz ../../data/sparse_matrices/word_concept/glove300d_l_0.3.emb_f_microsoft_concept_graph_w_10.json\microsoft_concept_graph_w_10.json_t40\word_concept_mtx.npz


In [11]:
def preprocess(embedding_path, word_concept_path, base=0):
    """Load the feature matrix and a thresholded target column.

    Parameters
    ----------
    embedding_path : str
        Path of a SciPy ``.npz`` sparse matrix; column ``base`` of it becomes
        the target.
    word_concept_path : str
        Path of a SciPy ``.npz`` sparse matrix returned unchanged as features.
    base : int, default 0
        Index of the embedding column to use.

    Returns
    -------
    tuple
        ``(C, y)`` where ``C`` is the loaded word-concept matrix and ``y`` is a
        dense array of shape ``(n_rows, 1)`` in which every strictly positive
        entry of the selected column has been replaced by 1 (other entries are
        left as they are).

    The target and both shapes are printed as a side effect.
    """
    embedding = sp.load_npz(embedding_path)
    concept_matrix = sp.load_npz(word_concept_path)

    # getcol() yields an (n_rows, 1) sparse column; densifying it already
    # gives the column-vector layout that is returned.
    target = embedding.getcol(base).toarray()
    target[target > 0] = 1

    print(target)
    print("y: ", target.shape)
    print("X: ", concept_matrix.shape)
    return concept_matrix, target

In [12]:
def delete_rows_csr(mat, indices):
    """
    Return a new matrix holding the rows of the CSR sparse matrix ``mat``
    that are not listed in ``indices``.

    Raises ``ValueError`` when ``mat`` is not a ``scipy.sparse.csr_matrix``.
    """
    if isinstance(mat, sp.csr_matrix):
        drop = np.zeros(mat.shape[0], dtype=bool)
        drop[list(indices)] = True
        # Boolean row indexing keeps the surviving rows in their original order.
        return mat[~drop]
    raise ValueError("works only for CSR format -- use .tocsr() first")


In [13]:
def balance(X,y):
    """Undersample the larger class so labels 0 and 1 occur equally often.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix
        Feature matrix with one row per sample (``delete_rows_csr`` rejects
        other types).
    y : array-like
        One label per row of ``X``; may be flat or a column vector.  Only the
        values 0 and 1 are counted, rows with any other label are kept as-is.

    Returns
    -------
    tuple
        ``(X_balanced, y_balanced)``: ``X`` without the dropped rows and the
        matching labels as a flat 1-D array.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows of ``X``.

    The surplus samples are removed from the *end* of the larger class, so the
    result is deterministic.  Data that is already balanced is returned with
    no rows removed.
    """
    labels = np.asarray(y).ravel()
    if labels.shape[0] != X.shape[0]:
        raise ValueError(
            "X has %d rows but y has %d labels" % (X.shape[0], labels.shape[0])
        )

    neg_ind = np.flatnonzero(labels == 0)
    pos_ind = np.flatnonzero(labels == 1)
    surplus = neg_ind.size - pos_ind.size

    # An explicit branch per case: slicing with ``[-0:]`` would select the
    # whole class instead of nothing, so the balanced case must not reach it.
    if surplus > 0:
        ind_to_drop = neg_ind[-surplus:]
    elif surplus < 0:
        ind_to_drop = pos_ind[surplus:]
    else:
        ind_to_drop = neg_ind[:0]

    X_balanced = delete_rows_csr(X, ind_to_drop)
    y_balanced = np.delete(labels, ind_to_drop)
    return X_balanced, y_balanced


In [25]:
def classifier(X,y, random_state=None):
    """Fit a decision tree on the given training data.

    Parameters
    ----------
    X : feature matrix accepted by ``DecisionTreeClassifier.fit``
    y : training labels
    random_state : int, RandomState instance or None, default None
        Forwarded to ``DecisionTreeClassifier``.  Pass an integer to get the
        same tree on every run; the default keeps the previous unseeded
        behaviour.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return tree.DecisionTreeClassifier(random_state=random_state).fit(X, y)

In [17]:
def visualize(clf, filename="test", view=True):
    """Render a fitted decision tree with Graphviz.

    Parameters
    ----------
    clf : fitted tree estimator accepted by ``sklearn.tree.export_graphviz``
    filename : str, default "test"
        Name handed to ``graphviz.Source.render`` for the output.  The default
        is the name that used to be hard-coded here.
    view : bool, default True
        Forwarded to ``render``; the default matches the previous behaviour
        of asking Graphviz to open the rendered file.

    Returns
    -------
    None
    """
    # out_file=None makes export_graphviz return the DOT source as a string.
    dot_data = tree.export_graphviz(clf, out_file=None)
    graphviz.Source(dot_data).render(filename, view=view)

In [37]:
def classify(base=0):
    """Train and evaluate a decision tree for one embedding column.

    Loads the data for column ``base`` with ``preprocess``, balances it with
    ``balance``, splits it with a fixed seed, fits a tree on the training part
    and prints the result of ``precision_recall_fscore_support`` for the
    held-out part.

    Returns the fitted classifier.
    """
    features, labels = balance(*preprocess(EMBEDDING, WORD_CONCEPT, base=base))
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=TEST_SIZE,
        shuffle=True,
        random_state=41,
    )
    clf = classifier(X_train, y_train)
    print(precision_recall_fscore_support(y_test, clf.predict(X_test)))
    return clf

In [38]:
# Train and evaluate a tree on embedding column 1; the fitted estimator is
# kept in ``clf`` because the rendering cell further down reads it.
clf = classify(base=1)
# Last expression of the cell: displays the class of the returned estimator.
type(clf)

[[0.]
 [1.]
 [1.]
 ...
 [1.]
 [0.]
 [1.]]
y:  (41105, 1)
X:  (41105, 865)
(array([0.70957309, 0.66249374]), array([0.61942405, 0.746614  ]), array([0.66144106, 0.70204298]), array([1771, 1772], dtype=int64))


sklearn.tree.tree.DecisionTreeClassifier

In [39]:
# Export the fitted tree held in ``clf`` as DOT source (out_file=None returns
# it as a string instead of writing a file).
dot_data = tree.export_graphviz(decision_tree=clf, out_file=None)
graph = graphviz.Source(source=dot_data)
# Render under the name "test" and open the result; as the last expression of
# the cell, the value returned by render() is displayed.
graph.render(filename="test", view=True)

'test.pdf'