In [None]:
using Pkg; for p in ("Embeddings","DataStructures","DataFrames","FileIO","LinearAlgebra","Knet","FileIO"); haskey(Pkg.installed(),p) || Pkg.add(p); end
using DataStructures,DataFrames,FileIO,Embeddings,LinearAlgebra,FileIO
using Knet: Knet, AutoGrad, param, param0, mat, RNN, relu, Data, adam, progress, nll, zeroone

In [None]:
"""
    node(word)

Mutable record describing one entry of a parsed sentence.

`node(nothing)` builds a placeholder whose fields are all `nothing` except
`is_word`, which is `0`. Any other `word` builds a regular entry with empty
child/parent containers, zeroed indices, an empty label and `ind == -1`.
"""
mutable struct node
    word         # token text; other cells compare it against "ROOT"
    kidsword     # container for child words — presumably filled by the tree builder; confirm
    kidsindex    # keys of this entry's children inside the sentence container
    parent       # starts as an empty array for regular entries
    finished     # 0 for a freshly created regular entry
    is_word      # 1 for a regular entry, 0 for the `nothing` placeholder
    selfindex    # 0 until assigned
    parentindex  # key of the parent entry inside the sentence container
    label        # "" until assigned
    ind          # -1 until assigned

    function node(word)
        # `===` is an identity test: it never dispatches to a user-defined `==`
        # and never yields `missing`, so the branch condition is always a Bool.
        if word === nothing
            return new(nothing, nothing, nothing, nothing, nothing, 0,
                       nothing, nothing, nothing, nothing)
        end
        return new(word, [], [], [], 0, 1, 0, 0, "", -1)
    end
end

In [None]:
"""
    get_split(size=5952, train_size=5452)

Return a `Dict` mapping every example index `1:size` to a split id:
`1` for the first `train_size` examples and `2` for the rest.

# Arguments
- `size`: total number of examples.
- `train_size`: number of leading examples assigned to split `1`.

With the defaults this yields 5452 entries in split `1` and 500 in split `2`
(presumably the TREC train/test partition — confirm against the data file).
"""
function get_split(size=5952, train_size=5452)
    split_of = Dict{Int,Int}()
    for i in 1:size
        # Indices are 1-based, so the boundary example `train_size` itself
        # still belongs to split 1 (hence `<=`, not `<`).
        split_of[i] = i <= train_size ? 1 : 2
    end
    return split_of
end

In [None]:
"""
    get_labels(fn)

Read the file at path `fn`, one integer label per line, and return a `Dict`
mapping the 1-based line number to the parsed label.

Throws whatever `parse(Int, line)` throws when a line is not an integer.
"""
function get_labels(fn)
    labels = Dict{Int,Int}()
    # The do-block form closes the handle even if a line fails to parse.
    open(fn, "r") do io
        for (index, line) in enumerate(eachline(io))
            labels[index] = parse(Int, line)
        end
    end
    return labels
end

In [None]:
"""
    clean_str(string, TREC=false)

Tokenization/string cleaning for all datasets except for SST.

Applies a fixed sequence of regex substitutions (drop unsupported characters,
split off English clitics, pad punctuation with spaces, collapse whitespace),
then strips the result. The text is lower-cased unless `TREC` is `true`.
"""
function clean_str(string, TREC=false)
    # Order matters: each rule runs on the output of the previous one.
    # The parenthesis and question-mark rules emit a literal backslash in
    # front of the symbol.
    rewrites = (
        r"[^A-Za-z0-9(),!?\'\`]" => s" ",
        r"\'s" => s" 's",
        r"\'ve" => s" 've",
        r"n\'t" => s" n't",
        r"\'re" => s" 're",
        r"\'d" => s" 'd",
        r"\'ll" => s" 'll",
        r"," => s" , ",
        r"!" => s" ! ",
        r"\(" => s" \\( ",
        r"\)" => s" \\) ",
        r"\?" => s" \\? ",
        r"\s{2,}" => s" ",
    )

    cleaned = string
    for rule in rewrites
        cleaned = replace(cleaned, rule)
    end

    trimmed = strip(cleaned)
    if TREC
        return trimmed
    end
    return lowercase(trimmed)
end

In [None]:
"""
    build_data_cv(file, split_dict, label_dict, clean_string=false)

Load one example per line from `file` and attach its label and split.

# Arguments
- `file`: path of the text file, one sentence per line.
- `split_dict`: indexable by 1-based line number, gives the split id.
- `label_dict`: indexable by 1-based line number, gives the label.
- `clean_string`: when `true`, each stripped line is passed through `clean_str`.

# Returns
`(revs, vocab)` where `revs` is a vector of `Dict{String,Any}` records with the
keys `"y"`, `"text"`, `"num_words"` and `"split"`, and `vocab` is a
`DefaultDict` counting, per word, the number of lines that contain it.
"""
function build_data_cv(file, split_dict, label_dict, clean_string=false)
    revs = []
    vocab = DefaultDict(0)#https://juliacollections.github.io/DataStructures.jl/latest/default_dict.html

    # The do-block form guarantees the handle is closed, also on errors.
    open(file, "r") do io
        for (index, line) in enumerate(eachline(io))
            text = String(strip(line))
            if clean_string
                text = clean_str(text)
            end

            tokens = split(text)
            # Document frequency: count each distinct word once per line.
            for word in Set(tokens)
                vocab[word] += 1
            end

            # Values are typed Any so later cells can add further keys
            # (e.g. "tree") holding arbitrary objects.
            datum = Dict{String,Any}("y" => label_dict[index],
                                     "text" => text,
                                     "num_words" => length(tokens),
                                     "split" => split_dict[index])
            push!(revs, datum)
        end
    end

    return revs, vocab
end

In [None]:
"""
    sibling2(sents, opt)

Build one five-entry feature row per entry of the parsed sentence `sents`
(the entry stored under key `0` is skipped).

Each row holds, in order: the entry's word, its parent's word, two sibling
words, and the grandparent's word. Siblings are taken from the parent's
`kidsindex`: those with a smaller key when the entry's key is smaller than its
parent's key, otherwise those with a larger key. They are consumed from the end
of that filtered list; missing siblings are filled with `"*START*"` (smaller-key
side) or `"*STOP*"` (larger-key side).

`opt` is accepted but not used.
"""
function sibling2(sents, opt)
    rows = []

    for (key, current) in sents
        key == 0 && continue

        parent_key = current.parentindex
        parent = sents[parent_key]
        row = Any[current.word, parent.word]

        # Pick the sibling side and the filler used when that side runs out.
        before_parent = key < parent_key
        filler = before_parent ? "*START*" : "*STOP*"
        candidates = before_parent ?
            [i for i in parent.kidsindex if i < key] :
            [i for i in parent.kidsindex if i > key]

        # Two sibling slots, each taking the last remaining candidate.
        for _ in 1:2
            if isempty(candidates)
                push!(row, filler)
            else
                push!(row, sents[pop!(candidates)].word)
            end
        end

        push!(row, sents[parent.parentindex].word)
        push!(rows, row)
    end

    return rows
end
                                                    

In [None]:
"""
    set_sibling2(tree, labels_dict, max_len)

Build the padded sibling representation for every parsed sentence in `tree`.

For each sentence the rows returned by `sibling2` are padded with
`"*ZERO*"` rows until there are at least `max_len` of them, and one final row
holding the sentence label repeated to the row width is appended.

# Arguments
- `tree`: iterable of parsed sentences (each accepted by `sibling2`).
- `labels_dict`: indexable by the 1-based sentence position, gives the label.
- `max_len`: minimum number of feature rows before the label row.

# Returns
A vector with one list of rows per sentence.

Throws a `BoundsError` if `sibling2` returns no rows for a sentence.
"""
function set_sibling2(tree, labels_dict, max_len)
    doc_list = []
    for (ind, sents) in enumerate(tree)
        # The second argument is ignored by `sibling2` as defined in this file.
        sent_list = sibling2(sents, 6)
        dummy_len = length(sent_list[1])
        dummy = repeat(["*ZERO*"], dummy_len)
        while length(sent_list) < max_len
            push!(sent_list, dummy)
        end
        # `repeat` gives a row of `dummy_len` labels; `[label] * dummy_len`
        # would instead multiply the label by the length (one-element vector).
        class_dummy = repeat([labels_dict[ind]], dummy_len)
        push!(sent_list, class_dummy)
        push!(doc_list, sent_list)
    end
    return doc_list
end

In [None]:
"""
    set_conv_sent(tree, labels_dict, max_len)

Build the padded ancestor-path representation for every parsed sentence in
`tree`.

For each entry of a sentence (the one stored under key `0` is skipped) a window
of `conv_length = 5` words is collected by starting at the entry and repeatedly
moving to its parent; once `"ROOT"` is reached it is repeated. Four header rows
derived from the first window are prepended, the list is padded with all-`"ROOT"`
rows up to at least `max_len` rows, and a final row holding the sentence label
repeated five times is appended.

# Arguments
- `tree`: iterable of parsed sentences; each must support iteration as
  `(key, entry)` pairs and lookup by key, with entries exposing `word` and
  `parentindex`.
- `labels_dict`: indexable by the 1-based sentence position, gives the label.
- `max_len`: minimum number of rows before the label row.

# Returns
A vector with one list of rows per sentence. The number of sentences is printed
before and after processing via `@show`.
"""
function set_conv_sent(tree, labels_dict, max_len)
    conv_length = 5
    @show sent_num = length(tree)
    doc_list = []
    for (ind, sents) in enumerate(tree)
        sent_list = []
        for (key, start_node) in sents
            key == 0 && continue
            cursor = start_node
            word_list = []
            # `1:n` works on every Julia 1.x release; the two-positional
            # `range(1, n)` form is only accepted from Julia 1.7 on.
            for _ in 1:conv_length
                push!(word_list, cursor.word)
                # Climb towards the root; stay on "ROOT" once it is reached.
                if cursor.word != "ROOT"
                    cursor = sents[cursor.parentindex]
                end
            end
            push!(sent_list, word_list)
        end

        dummy = repeat(["ROOT"], conv_length)
        header = []
        # Header row i = (conv_length - i - 1) "ROOT" fillers followed by the
        # first i + 1 entries of the first window, so the last header row equals
        # the first window itself.
        # NOTE(review): confirm this offset against the reference implementation.
        for i in 1:conv_length-1
            push!(header, vcat(dummy[1:conv_length-i-1], sent_list[1][1:i+1]))
        end
        sent_list = vcat(header, sent_list)

        while length(sent_list) < max_len
            push!(sent_list, dummy)
        end

        class_dummy = repeat([labels_dict[ind]], conv_length)
        push!(sent_list, class_dummy)
        push!(doc_list, sent_list)
    end
    @show length(doc_list)
    return doc_list
end

In [None]:
"""
    add_tree2vocab(sent, vocab)

Register in `vocab` every field of `sent` that is not there yet.

All rows of `sent` except the last one are scanned. A field is skipped when it
is already a key of `vocab`, equals `0`, or equals `"ROOT"`; otherwise its
entry in `vocab` is incremented by one. `vocab` must therefore supply a default
for absent keys. Returns `nothing`.
"""
function add_tree2vocab(sent, vocab)
    for row in sent[1:end-1]
        for field in row
            skip = (field in keys(vocab)) || field == 0 || field == "ROOT"
            if !skip
                vocab[field] += 1
            end
        end
    end
    return nothing
end

In [None]:
"""
    merge_two(revs, tree)

Store `tree[i]` under the key `"tree"` of the `i`-th record of `revs`
(modifying the records in place) and return `revs`.
"""
function merge_two(revs, tree)
    for (position, record) in enumerate(revs)
        record["tree"] = tree[position]
    end
    return revs
end

In [None]:
"""
    load_bin_vec(fname, vocab)

Load L2-normalized word vectors for the words of `vocab` from the binary file
`fname`.

The file is read as: one text header line with two integers (number of entries
and vector length), followed per entry by the word's bytes terminated by a
single space and then the vector as consecutive `Float32` values. Newline bytes
in front of a word are ignored. Values are assumed to be stored little-endian
(as the reference word2vec tool writes them — confirm for other producers).

# Arguments
- `fname`: path of the binary vector file.
- `vocab`: dictionary whose keys are the words to keep.

# Returns
A `Dict{String,Vector{Float32}}` with one unit-length vector per word found in
both the file and `vocab`. All-zero vectors are stored unchanged.

Throws `EOFError` if the file ends before all announced entries are read.
"""
function load_bin_vec(fname, vocab)
    word_vecs = Dict{String,Vector{Float32}}()
    open(fname, "r") do f
        header = readline(f)
        vocab_size, layer1_size = parse.(Int, split(header))
        binary_len = sizeof(Float32) * layer1_size

        for _ in 1:vocab_size
            # Collect the word byte by byte up to the separating space.
            word_bytes = UInt8[]
            while true
                ch = read(f, UInt8)
                ch == UInt8(' ') && break
                ch != UInt8('\n') && push!(word_bytes, ch)
            end
            word = String(word_bytes)

            if word in keys(vocab)
                w = Vector{Float32}(undef, layer1_size)
                read!(f, w)
                w .= ltoh.(w)
                vec_norm = norm(w)
                # Avoid dividing by zero for an all-zero vector.
                word_vecs[word] = vec_norm > 0 ? w ./ vec_norm : w
            else
                # Not needed: jump over this entry's vector.
                skip(f, binary_len)
            end
        end
    end
    return word_vecs
end

In [None]:
"""
    add_unknown_words(word_vecs, vocab, min_df=1, k=300)

For words that occur in at least `min_df` documents and have no entry in
`word_vecs` yet, create a separate random word vector.

Each new vector has `k` components drawn uniformly from `[-0.25, 0.25)`.
According to the original note, 0.25 is chosen so the unknown vectors have
(approximately) the same variance as pre-trained ones.

# Arguments
- `word_vecs`: dictionary word => vector; modified in place.
- `vocab`: dictionary word => document frequency.
- `min_df`: minimum document frequency for a word to receive a vector.
- `k`: length of the generated vectors.

# Returns
`word_vecs`, after insertion of the new vectors.
"""
function add_unknown_words(word_vecs, vocab, min_df=1, k=300)
    for word in keys(vocab)
        # `haskey` tests the keys; `in`/`∉` on a dictionary would test pairs.
        if !haskey(word_vecs, word) && vocab[word] >= min_df
            word_vecs[word] = rand(k) .* 0.5 .- 0.25
        end
    end
    return word_vecs
end

In [None]:
"""
    get_W(word_vecs, k=300)

Get the word matrix. `W[i, :]` is the vector for the word indexed by `i`.

# Arguments
- `word_vecs`: dictionary word => vector of length `k`.
- `k`: vector length (number of columns of `W`).

# Returns
`(W, word_idx_map)` where `W` has `length(word_vecs) + 1` rows and `k` columns
and `word_idx_map` maps each word to its row. Row `1` is left as an all-zero
row and words occupy rows `2:length(word_vecs)+1`.

NOTE(review): keeping row 1 as the zero row mirrors the explicit zeroing of the
first row in the code this replaces; confirm that consumers treat index 1 as
padding.
"""
function get_W(word_vecs, k=300)
    vocab_size = length(word_vecs)
    word_idx_map = Dict{keytype(word_vecs),Int}()
    W = zeros(vocab_size + 1, k)   # row 1 stays zero
    for (offset, (word, vector)) in enumerate(word_vecs)
        row = offset + 1
        W[row, :] = vector
        word_idx_map[word] = row
    end
    return W, word_idx_map
end

In [None]:
# Driver cell: loads labels, splits, sentences and parsed trees, builds the two
# per-sentence representations, joins them row by row, and attaches the result
# to the sentence records. Later cells read the globals defined here
# (revs, vocab, max_l, data_tree, ...).
#execfile("preindex.py")
# Input locations (absolute paths on the author's machine).
w2v_file = "/Users/abdulrhmanalabrash/Google Drive/KU/DL/paper/DCNN/data/google_w2v.bin"   
sent_file = "/Users/abdulrhmanalabrash/Google Drive/KU/DL/paper/DCNN/TREC/TREC_all.txt"
# NOTE(review): tree_file is never used below; the trees are loaded from
# "data.jld2" instead — confirm which file is intended.
tree_file = "/Users/abdulrhmanalabrash/Google Drive/KU/DL/paper/DCNN/TREC/TREC_all_tree.jld2" # JLD2 file
label_file = "/Users/abdulrhmanalabrash/Google Drive/KU/DL/paper/DCNN/TREC/label_all.txt"
# Line number => label, and example index => split id.
label_dict = get_labels(label_file);
split_dict = get_split(5952) ;
       
# One record per sentence plus the word => document-frequency table.
revs, vocab = build_data_cv(sent_file, split_dict, label_dict);
# Accessor used to extract the word count of a record.
function dfun(d::Dict);return d["num_words"];end
# max_l is the largest word count among the records, maxIndex its position in revs.
max_l,maxIndex = findmax(map(dfun, revs))
    
# Parsed sentences stored under the key "data" of a file in the working directory.
all_tree = load("data.jld2","data"); # use jld2 method
# Both representations are padded to max_l+8 rows; the reason for the +8 margin
# is not stated in this file — TODO confirm.
data_sibling = set_sibling2(all_tree,label_dict,max_l+8);
data_tree = set_conv_sent(all_tree,label_dict,max_l+8); 
#summary.(data_tree)

# Join the two representations: row ind2 of sentence ind becomes the
# ancestor-path row followed by the sibling row.
# Assumes data_sibling[ind] has at least as many rows as data_tree[ind] — TODO confirm.
new_data_tree = []
for (ind,l) in enumerate(data_tree)
    new_list=[]
    for (ind2,l2) in enumerate(l);push!(new_list,vcat(data_tree[ind][ind2],data_sibling[ind][ind2]));end
    push!(new_data_tree,new_list)
end
data_tree = new_data_tree
#@show length.(new_data_tree)
# Add the fields of the joined rows that are not yet in the vocabulary.
for i in data_tree;add_tree2vocab(i, vocab);end
@show length(vocab)
# Store each joined representation under the "tree" key of its record.
revs = merge_two(revs,data_tree);

In [None]:
    # Summary of what the preprocessing cells produced.
    println("data loaded!")
    println("number of sentences: $(length(revs))")
    println("vocab size: $(length(vocab))")
    println("max sentence length: $(max_l)")

    # The actual word2vec loading step is currently disabled; only the
    # progress messages are printed.
    println("loading word2vec vectors...")
   # w2v = load_bin_vec(w2v_file, vocab)
    println("word2vec loaded!")
    #println("num words already in word2vec: ",length(w2v)))

    # Give every special marker token a vocabulary entry with count 1.
    for marker in ("ROOT", "*START*", "*STOP*", "*ZERO*",
                   "*STARTWE*", "*STOPWE*", "*ZEROWE*")
        vocab[marker] = 1
    end

    # Remaining (disabled) steps: build the embedding matrices and save them.
#     add_unknown_words(w2v, vocab)
#     W, word_idx_map = get_W(w2v)
#     rand_vecs = Dict()
#     add_unknown_words(rand_vecs, vocab)
#     W2, _ = get_W(rand_vecs)
#save("TREC_sib.jld2","datas",[revs, W, W2, word_idx_map, vocab])  
#     println("dataset created!")