In [1]:
import pandas as pd
import scipy.stats as stats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load the spaCy pipeline once at module level. attr_rel_similarity() further
# down reads this shared `nlp` object to parse short phrases and score them
# with Doc.similarity(), so this cell must run before that function is called.
nlp = spacy.load("en_core_web_md")

In [4]:
def jaccard_similarity(x, y):
    """Return the Jaccard index |X & Y| / |X | Y| of two collections.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
        x: Iterable of hashable items.
        y: Iterable of hashable items.

    Returns:
        A float in [0.0, 1.0]. When both inputs are empty the union is empty
        and the ratio is undefined; 0.0 is returned in that case (instead of
        raising ZeroDivisionError), in line with mann_whitney_test(), which
        also falls back to 0 on degenerate input.
    """
    set_x, set_y = set(x), set(y)
    union_cardinality = len(set_x | set_y)
    if union_cardinality == 0:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    return len(set_x & set_y) / float(union_cardinality)


def tfidf_cosine_similarity(x, y):
    """Cosine similarity between the TF-IDF vectors of two value collections.

    The distinct values of each collection are joined with spaces into one
    lower-cased document; a TfidfVectorizer is fitted on just those two
    documents and the cosine of the two resulting rows is returned.

    Args:
        x: Iterable of strings.
        y: Iterable of strings.

    Returns:
        The cosine similarity as a float. 0.0 is returned when either side
        produces no tokens; without this guard, fit_transform() raises
        ValueError (empty vocabulary) if *neither* document yields a token,
        e.g. two empty inputs or inputs made only of one-character values.
    """
    docx = " ".join(set(x)).lower()
    docy = " ".join(set(y)).lower()

    # Cheap guard: a blank document can never share a term with anything.
    if not docx.strip() or not docy.strip():
        return 0.0

    vectorizer = TfidfVectorizer()
    # Use the vectorizer's own analyzer so "has no tokens" is judged by exactly
    # the same tokenisation rules that fit_transform() will apply.
    analyzer = vectorizer.build_analyzer()
    if not analyzer(docx) or not analyzer(docy):
        return 0.0

    tfidf_matrix = vectorizer.fit_transform([docx, docy])
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

def get_freq_dict(data):
    """Count how often each element occurs in ``data``.

    Args:
        data: Iterable of hashable items. It is traversed exactly once, so
            one-shot iterators and generators are counted correctly (a
            two-pass approach would exhaust them while collecting the keys
            and report every count as 0).

    Returns:
        A plain dict mapping each distinct element to its number of
        occurrences, with keys in order of first appearance.
    """
    freq_dict = {}
    for ele in data:
        freq_dict[ele] = freq_dict.get(ele, 0) + 1
    return freq_dict


def get_distribution(data):
    """Turn ``data`` into a rank histogram scaled to roughly 100 samples.

    Distinct values are ranked by descending frequency (rank 0 is the most
    frequent). Each rank is repeated ``int(count * 100 / len(data))`` times,
    i.e. once per whole percentage point of the data that value accounts for.
    A value whose share truncates to zero contributes no entries but still
    uses up its rank number.

    Args:
        data: Sized collection of hashable items.

    Returns:
        A list of integer ranks in non-decreasing order; empty when ``data``
        is empty.
    """
    counts_desc = sorted(get_freq_dict(data).values(), reverse=True)
    return [
        rank
        for rank, count in enumerate(counts_desc)
        for _ in range(int(count * 100 / len(data)))
    ]


def mann_whitney_test(x, y):
    """Mann-Whitney U p-value between the rank histograms of ``x`` and ``y``.

    Both inputs are first converted with get_distribution(). The test is only
    run when each histogram has more than one entry and ends in a non-zero
    rank; because ranks are emitted in increasing order, a trailing 0 means
    the histogram holds nothing but rank 0.

    Args:
        x: Sized collection of hashable items.
        y: Sized collection of hashable items.

    Returns:
        Element [1] of ``stats.mannwhitneyu``'s result (the p-value), or the
        integer 0 when either histogram is degenerate.
    """
    dist_x = get_distribution(x)
    dist_y = get_distribution(y)

    # Guard clauses: bail out before indexing into a too-short histogram.
    if len(dist_x) <= 1 or len(dist_y) <= 1:
        return 0
    if dist_x[-1] == 0 or dist_y[-1] == 0:
        return 0

    return stats.mannwhitneyu(dist_x, dist_y)[1]


def attr_rel_similarity(attr_name, rel):
    """Sum of the spaCy similarities between an attribute name and two relation names.

    The attribute name is split into words, each relation name has its first
    underscore-separated piece dropped, and the module-level ``nlp`` pipeline
    is used to compare the attribute phrase with each relation phrase.

    Args:
        attr_name: Attribute/column name, e.g. ``"birthDate"``.
        rel: Indexable whose items 0 and 1 are underscore-separated relation
            names; only those two items are read.

    Returns:
        ``similarity(attr, rel[0]) + similarity(attr, rel[1])``.
    """

    def attr_seg(name):
        """Split a space-free identifier into space-separated words."""
        if " " in name:
            # Already contains spaces: passed through unchanged.
            return name
        pieces = []
        for c in name:
            if c.isupper():
                # Word boundary in front of every uppercase letter.
                pieces.append(" ")
            # Anything that is not a letter (digits, "_", "-", ...) becomes a space.
            pieces.append(c if c.isalpha() else " ")
        # Collapse runs of spaces (e.g. "birth_Date" would otherwise become
        # "birth  Date") so no stray whitespace is handed to the pipeline.
        return " ".join("".join(pieces).split())

    def rel_seg(r):
        """Drop the first "_"-separated piece and join the rest with spaces."""
        # NOTE(review): a name without any "_" becomes "" here - presumably
        # relation names always carry a prefix; confirm against callers.
        return " ".join(r.split("_")[1:])

    # Parse the attribute phrase once and reuse the Doc for both comparisons
    # instead of running the pipeline twice on identical text.
    attr_doc = nlp(attr_seg(attr_name))

    return attr_doc.similarity(nlp(rel_seg(rel[0]))) + attr_doc.similarity(nlp(rel_seg(rel[1])))


def ulan_search():
    """Unimplemented stub: takes no arguments and returns ``None``.

    NOTE(review): the name suggests a ULAN lookup was planned - confirm the
    intended behaviour before relying on this function.
    """
    return None

0.5