In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


# TF-IDF extraction

In [None]:
from collections import Counter
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def tfidf_extract(raw_texts_train):
    """
    Calculate TF-IDF for the input texts.

    Each document is cleaned in three steps before vectorisation: the listed
    punctuation characters are replaced by a space, digit runs are removed,
    and the text is lower-cased.

    Args:
        raw_texts_train (List[Str]): training corpus documents

    Returns:
        tfidf_df (Dataframe): TFIDF matrix for the word tokens in the training corpus
            (one row per document, one column per vocabulary token)
        cleaned_docs (List[Str]): cleaned training corpus documents
    """
    # The hyphen is escaped on purpose: unescaped, ";-=" is a character *range*
    # (';', '<', '='), which silently left '-' in the text and stripped '<'.
    punctuation = re.compile(r"[.,!?:;\-='@#_]")
    digits = re.compile(r"\d+")

    cleaned_docs = [
        digits.sub('', punctuation.sub(' ', doc)).lower()
        for doc in raw_texts_train
    ]

    tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
    tfidf_wm = tfidfvectorizer.fit_transform(cleaned_docs)
    tfidf_tokens = tfidfvectorizer.get_feature_names_out()

    # toarray() yields a plain ndarray; no need to round-trip through nested lists.
    tfidf_df = pd.DataFrame(tfidf_wm.toarray(), columns=tfidf_tokens)
    return tfidf_df, cleaned_docs

# POS, NER extraction

In [None]:
import spacy
def pos_ner_extract(tfidf_df, cleaned_docs, spacy_model="en_core_web_sm"):
    """
    Tag every vocabulary token with the POS and NER labels spaCy assigns to it.

    Args:
        tfidf_df (Dataframe): TFIDF matrix for the word tokens in the training corpus
        cleaned_docs (List[Str]): cleaned training corpus documents
        spacy_model (str, optional): spacy model name, default as 'en_core_web_sm'

    Returns:
        pos_df (Dataframe): same columns/index as tfidf_df; cell (i, word) holds the
            fine-grained tag of `word` in document i, or '-' when no tag was recorded
        ner_df (Dataframe): same columns/index as tfidf_df; cell (i, word) holds the
            NER tag of `word` in document i (PERSON/ORG/GPE/LOC, otherwise MISC for
            any other entity), or '-' when the token is not part of an entity
    """
    nlp = spacy.load(spacy_model)

    vocabulary = tfidf_df.columns
    pos_df = pd.DataFrame('-', columns=vocabulary, index=tfidf_df.index)
    ner_df = pd.DataFrame('-', columns=vocabulary, index=tfidf_df.index)

    kept_entity_types = ["PERSON", "ORG", "GPE", "LOC"]

    for row, text in enumerate(cleaned_docs):
        for token in nlp(text):
            word = token.text
            # Only tokens that survived into the TF-IDF vocabulary are recorded.
            if word not in vocabulary:
                continue

            pos_df.loc[row, word] = token.tag_

            entity_type = token.ent_type_
            if entity_type in kept_entity_types:
                ner_df.loc[row, word] = entity_type
            elif token.ent_iob_ != "O":
                # Inside some entity, but not one of the kept types.
                ner_df.loc[row, word] = "MISC"

    return pos_df, ner_df

# Make fired rule extraction script

In [None]:
def _leading_count(line, char):
    """Return how many consecutive `char` characters `line` starts with."""
    return len(line) - len(line.lstrip(char))


def make_fired_rule_script(dt_folder_path):
    """
    Generate the fired rule extraction script based on the decision tree rules

    Reads `/{dt_folder_path}/outputs/rules/rules.py`, keeps only its `if`/`elif`
    and leaf `return` lines, and writes `/{dt_folder_path}/outputs/rules/rule_extraction.py`.
    The generated module defines `extraction(X_test, branch_dict)`, which walks every row
    of `X_test` through the same conditions and appends the row position to the
    `branch_dict` entry of each branch the row passes through.

    Nodes that test the same feature more than once are told apart with a numeric
    suffix (`vec_val_3`, `vec_val_3_1`, ...). Leaves are named `L<n>_neg` for
    `return '0'` and `L<n>_pos` for `return '1'`.

    Args:
        dt_folder_path (str): path where the decision tree python script is saved, and where the fired rule extraction script will be saved

    Returns:
        branch_dict (dict), where each key is a (parent, child, edge) triplet for the decision tree structure, and the value is an empty list to store the document index belonging to the child node.
        edges_differentiated (List[List[Str]]), which saves the [parent, child]
        edge_labels_differentiated (dict), where each key is a (parent, child) tuple, and the value is the edge label of the tuple

    Raises:
        ValueError: if the rules file has fewer than two usable lines, if its
            indentation step cannot be determined, or if a leaf returns a label
            other than '0' or '1'.
    """
    rule_script_path = f"/{dt_folder_path}/outputs/rules/rules.py"
    fired_rule_script_path = f"/{dt_folder_path}/outputs/rules/rule_extraction.py"

    with open(rule_script_path, 'r') as tree_file:
        content_by_line = tree_file.read().split('\n')

    # Keep only the condition lines and the leaf returns that follow them.
    cleaned_content = []
    for i, line in enumerate(content_by_line):
        if not line.strip():
            # After split('\n') a blank line is '' (never '\n'), so test for emptiness.
            continue
        if 'def' in line or 'else' in line or '#' in line:
            continue
        if 'return' in line and i > 0 and 'else' in content_by_line[i-1]:
            # Return belonging to a two-line `else:` fallback.
            continue
        cleaned_content.append(line)

    if len(cleaned_content) < 2:
        raise ValueError(f"No decision rules found in {rule_script_path}")

    # Work out one indentation level from the first two kept lines, looking only
    # at leading whitespace so spaces inside a condition cannot skew the result.
    if _leading_count(cleaned_content[1], '\t') - _leading_count(cleaned_content[0], '\t') == 1:
        indent_char, indent_step = '\t', 1
    else:
        indent_char = ' '
        indent_step = _leading_count(cleaned_content[1], ' ') - _leading_count(cleaned_content[0], ' ')
        if indent_step <= 0:
            raise ValueError(f"Cannot determine the indentation step of {rule_script_path}")
    tab = indent_char * indent_step

    # Header of the generated module; the rule lines are nested inside the row loop.
    output_lines = [
        "def extraction(X_test,branch_dict):",
        tab + "instance_no = len(X_test.index)",
        tab + "for i in range(instance_no):",
        tab * 2 + "obj = X_test.iloc[i]",
    ]

    parent_counter = {}  # how many outgoing branches were seen per feature node name
    edges_differentiated = []  # edges_differentiated[0][0] is the root node
    edge_labels_differentiated = {}
    branch_dict = {}  # (parent, child, edge) -> documents that reach `child`

    def add_branch(idx, child):
        """Register the branch opened by condition line `idx` and emit its script lines."""
        line = cleaned_content[idx]
        condition = line.split('if ')[1]
        parent = "vec_val_" + condition.split(']')[0].split('[')[1]
        edge = condition.split(']')[1].split(':')[0]

        parent_counter[parent] = parent_counter.get(parent, 0) + 1
        # Every node has two outgoing branches, so each further pair of branches
        # seen for the same feature belongs to a new node testing that feature.
        parent_repeat = (parent_counter[parent] - 1) // 2
        child_repeat = parent_counter.get(child, 0) // 2
        if parent_repeat > 0:
            parent = f"{parent}_{parent_repeat}"
        if child_repeat > 0:
            child = f"{child}_{child_repeat}"

        edges_differentiated.append([parent, child])
        edge_labels_differentiated[(parent, child)] = edge
        branch_dict[(parent, child, edge)] = []

        # Emit the condition one level deeper (inside the row loop), followed by
        # the bookkeeping statement at the depth of the condition's body.
        output_lines.append(tab + line)
        body_depth = _leading_count(cleaned_content[idx + 1], indent_char) // indent_step
        # repr() keeps the generated key valid even when the edge text contains quotes.
        output_lines.append(tab * (body_depth + 1) + f"branch_dict[{(parent, child, edge)!r}].append(i)")

    leaf_count = 0
    # The last kept line has no successor, so it cannot open a branch.
    for i, line in enumerate(cleaned_content[:-1]):
        if 'if' not in line:
            continue
        next_line = cleaned_content[i + 1]

        if 'if' in next_line:
            # Internal node: the child is the feature tested on the next line.
            child_no = next_line.split('if ')[1].split(']')[0].split('[')[1]
            add_branch(i, "vec_val_" + child_no)

        if 'return' in next_line:
            # Leaf node: name it after the predicted class.
            if "return '0'" in next_line:
                child = f"L{leaf_count}_neg"
            elif "return '1'" in next_line:
                child = f"L{leaf_count}_pos"
            else:
                raise ValueError(f"Unsupported leaf label in rule line: {next_line.strip()!r}")
            leaf_count += 1
            add_branch(i, child)

    output_lines.append(tab + "return branch_dict")

    with open(fired_rule_script_path, 'w') as out_f:
        out_f.write('\n'.join(output_lines) + '\n')

    return branch_dict, edges_differentiated, edge_labels_differentiated

# fired rule extraction for train set

In [None]:
import os

def fired_rule_extraction(dt_folder_path, X_train, branch_dict):
    """
    Run the generated fired rule extraction script on the training set.

    The extraction script is (re)generated from the decision tree rules first and
    then loaded from the exact file that was written, so a stale or missing
    `rule_extraction` module can never be picked up. The working directory is
    changed to `dt_folder_path` as a side effect.

    Args:
        dt_folder_path (str): where the fired rule extraction script is saved
        X_train (Dataframe): training set features
        branch_dict (Dict): accepted for interface compatibility; it is replaced by the freshly built dict returned by `make_fired_rule_script`, where each key is a (parent, child, edge) triplet for the decision tree structure.

    Returns:
        branch_dict (dict), where fired rule information is saved,  each key is a (parent, child, edge) triplet for the decision tree structure, and the value is a list of the document indices belonging to the child node.
        edges_differentiated (List[List[Str]]), which saves the [parent, child]
        edge_labels_differentiated (dict), where each key is a (parent, child) tuple, and the value is the edge label of the tuple
    """
    import importlib.util

    os.chdir(dt_folder_path)

    # Generate the script before loading it; importing first would execute
    # whatever (possibly outdated) version happened to be on disk.
    branch_dict, edges_differentiated, edge_labels_differentiated = make_fired_rule_script(dt_folder_path)

    # Same location make_fired_rule_script writes to.
    script_path = f"/{dt_folder_path}/outputs/rules/rule_extraction.py"
    spec = importlib.util.spec_from_file_location("rule_extraction", script_path)
    rule_extraction = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(rule_extraction)

    branch_dict = rule_extraction.extraction(X_train, branch_dict)
    return branch_dict, edges_differentiated, edge_labels_differentiated

# collate node information from fired rules

In [None]:
import numpy as np
import pandas as pd
import json
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from wordcloud import STOPWORDS

import matplotlib.pyplot as plt

In [None]:
def _collect_node_words(tfidf_rows, pos_rows, ner_rows, stopwords, top_n=100):
    """Return the `top_n` highest-TFIDF non-stopword entries for one node's documents."""
    # Summing over documents gives one weight per word; highest weights first.
    weights = tfidf_rows.sum(axis=0).sort_values(ascending=False)
    weights = weights[~weights.index.isin(list(stopwords))]

    words_info = []
    for word in weights.index[:top_n]:
        # '-' is the placeholder for "no tag recorded" and is not a real tag.
        pos_tags = set(pos_rows[word].tolist()) - {'-'}
        ner_tags = set(ner_rows[word].tolist()) - {'-'}
        words_info.append({
            "text": word,
            "weight": float(weights.loc[word]),
            # Sorted so the output does not depend on set iteration order.
            "POS": sorted(pos_tags, key=str),
            "NER": sorted(ner_tags, key=str),
        })
    return words_info


def _collect_node_children(node_name, edge_labels_differentiated):
    """Return the {"edge", "node_name"} entries of every edge leaving `node_name`."""
    return [
        {"edge": edge, "node_name": child}
        for (parent, child), edge in edge_labels_differentiated.items()
        if parent == node_name
    ]


def collate_tree_rendering_info(tfidf_df,pos_df,ner_df,branch_dict,edges_differentiated,edge_labels_differentiated,output_path):
    """
    Based on the tree structure and the training documents that each node should have, collate the top important words based on their TFIDF values, and their corresponding POS and NER tags for rendering later.

    The root node (taken from `edges_differentiated[0][0]`) is described using all
    documents; every other node is described using the documents listed for it in
    `branch_dict`. The result is written to `{output_path}.json` and, wrapped in a
    `var nodes_info=...;` assignment, to `{output_path}.js`.

    Args:
        tfidf_df (Dataframe): TFIDF matrix for the word tokens in the training corpus
        pos_df (Dataframe): possible POS tags for the word tokens in the training corpus context
        ner_df (Dataframe): possible NER tags for the word tokens in the training corpus context
        branch_dict (dict), where fired rule information is saved,  each key is a (parent, child, edge) triplet for the decision tree structure, and the value is a list of the document indices belonging to the child node.
        edges_differentiated (List[List[Str]]), which saves the [parent, child]
        edge_labels_differentiated (dict), where each key is a (parent, child) tuple, and the value is the edge label of the tuple
        output_path (str): where the tree rendering info should be saved

    """
    # Build a local stop-word set: STOPWORDS.update() returns None and would
    # permanently modify the set shared with every other wordcloud user.
    added_stopwords = ['s', 'S', 'said', 'will', 'u','movie','film']
    stopwords = set(STOPWORDS) | set(added_stopwords)

    # Root node: described by the whole training corpus.
    root_name = edges_differentiated[0][0]
    nodes_info = [{
        "node_name": root_name,
        "contain_words": _collect_node_words(tfidf_df, pos_df, ner_df, stopwords),
        "children": _collect_node_children(root_name, edge_labels_differentiated),
    }]

    # Every other node: described by the documents that reached it.
    for key, doc_ids in branch_dict.items():
        node_name = key[1]
        nodes_info.append({
            "node_name": node_name,
            "contain_words": _collect_node_words(
                tfidf_df.iloc[doc_ids, :],
                pos_df.iloc[doc_ids, :],
                ner_df.iloc[doc_ids, :],
                stopwords,
            ),
            "children": _collect_node_children(node_name, edge_labels_differentiated),
        })

    final_json = [nodes_info]
    with open(f"{output_path}.json", "w") as outfile:
        json.dump(final_json, outfile)
    with open(f"{output_path}.js", 'w') as f:
        # json.dumps emits a valid JavaScript literal; Python's repr is only
        # valid JavaScript by coincidence.
        f.write(f"var nodes_info={json.dumps(final_json)};")

# Decision path for test set

In [None]:
import os

def decision_path_extraction(dt_folder_path, df_test, output_path, raw_texts_test=None, tokenized_docs_test=None):
    """
    Extract, for every test document, the decision path it follows through the tree.

    The fired rule extraction script is (re)generated and loaded from the file that
    was written, then run on the test features (all columns of `df_test` except the
    last one). Each document's path is the list of child nodes it passes through,
    each prefixed with "item_". The result is written to `{output_path}.json` and,
    wrapped in a `var precomp_testset_info=...;` assignment, to `{output_path}.js`.
    The working directory is changed to `dt_folder_path` as a side effect.

    Args:
        dt_folder_path (str): where the fired rule extraction script is saved
        df_test (Dataframe): extracted features for the test set
        output_path (str): where the decision path of the each test set document should be saved.
        raw_texts_test (List[Str], optional): test set documents. When omitted, a
            module-level variable of the same name is used if one exists.
        tokenized_docs_test (List[List[Str]], optional): tokenized test set documents.
            When omitted, a module-level variable of the same name is used if one exists.

    Raises:
        ValueError: if the raw or tokenized test documents are neither passed in nor
            available as module-level variables.
    """
    import importlib.util

    # Earlier versions read these two names from the notebook's global scope;
    # keep that working for callers that still rely on it.
    if raw_texts_test is None:
        raw_texts_test = globals().get("raw_texts_test")
    if tokenized_docs_test is None:
        tokenized_docs_test = globals().get("tokenized_docs_test")
    if raw_texts_test is None or tokenized_docs_test is None:
        raise ValueError("raw_texts_test and tokenized_docs_test must be provided")

    os.chdir(dt_folder_path)

    # Generate the script before loading it so the freshly written rules are used.
    branch_dict, edges_differentiated, edge_labels_differentiated = make_fired_rule_script(dt_folder_path)

    # Same location make_fired_rule_script writes to.
    script_path = f"/{dt_folder_path}/outputs/rules/rule_extraction.py"
    spec = importlib.util.spec_from_file_location("rule_extraction", script_path)
    rule_extraction = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(rule_extraction)

    # The last column of df_test is not a feature and is left out.
    no_feats = len(df_test.columns) - 1
    X_test = df_test.iloc[:,:no_feats]
    branch_dict = rule_extraction.extraction(X_test, branch_dict)

    all_test_doc_info = [
        {
            "doc_id": idx,
            "raw_text": raw_texts_test[idx],
            "tokenized_text": tokenized_docs_test[idx],
            "decision_path": [],
        }
        for idx in range(len(df_test))
    ]

    # Each branch lists the documents that reached its child node; branches are
    # visited in the order they were registered in branch_dict.
    for key, vals in branch_dict.items():
        child_at_each_step = key[1]
        for idx in vals:
            all_test_doc_info[idx]["decision_path"].append("item_"+child_at_each_step)

    with open(f"{output_path}.json", "w") as outfile:
        json.dump(all_test_doc_info, outfile)
    with open(f"{output_path}.js", 'w') as f:
        # json.dumps emits a valid JavaScript literal; Python's repr is only
        # valid JavaScript by coincidence.
        f.write(f"var precomp_testset_info={json.dumps(all_test_doc_info)};")

# similar word look up table extraction

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

In [None]:
import json

def construct_word_synonym_dict(input_file_path, output_file_path, save_as_js=True):
    """
    Based on WordNet knowledge, collate the synonym sets for words in the wordclouds of the decision tree.

    Args:
        input_file_path (str): where the tree node words are saved. e.g. '/content/drive/My Drive/Decision_Tree/doc_emb_tree/demo_json/MR_CNN_70_roberta_depth_3.json'
        output_file_path (str): where the synonym look up dict should be saved. e.g. '/content/drive/My Drive/Decision_Tree/doc_emb_tree/demo_json/mr_cnn_roberta_depth_3_wn_synonyms.js'
        save_as_js (bool, optional): a flag to decide whether to save output as js file or not, default as True

    Returns:
        word_wn_synonyms (Dict): the extracted similar word look up table, mapping each
            word to its lower-cased WordNet lemma names (the word itself and duplicates
            excluded). It is also written to `output_file_path` as a
            `var synsets=...;` assignment when `save_as_js` is True.
    """
    with open(input_file_path, 'r') as f:
        data = json.load(f)

    # One entry per distinct word, in first-seen order across the tree nodes.
    word_wn_synonyms = {}
    for node_dict in data[0]:
        for word_dict in node_dict['contain_words']:
            word_wn_synonyms.setdefault(word_dict['text'], [])

    for word, synonyms in word_wn_synonyms.items():
        seen = set()
        for synset in wn.synsets(word):
            for lemma in synset.lemmas():
                # Compare the lower-cased form, since that is what gets stored;
                # otherwise capitalised lemmas are added once per synset.
                name = lemma.name().lower()
                if name != word and name not in seen:
                    seen.add(name)
                    synonyms.append(name)

    if save_as_js:
        with open(output_file_path, 'w') as f:
            # json.dumps emits a valid JavaScript literal; Python's repr is only
            # valid JavaScript by coincidence.
            f.write(f"var synsets={json.dumps(word_wn_synonyms)};")

    return word_wn_synonyms