In [50]:
from collections import defaultdict
from sklearn.metrics import mutual_info_score
import numpy as np
import operator
import re

In [26]:
txt_data = [
    "It has a long battery life. this is a very good thing to buy if you don't want your child to have a phone, but they want to be able to contact their friends, listen to music, or play games. They don't break by themselves, so if something goes wrong, there is a defect, it was handled incorrectly, or you/ your child did something to mess it up. I recommend a protective case and a screen protector. You can download about 10-15 apps, and a lot of songs. But it does have little storage, so make sure you put the photos on your computer. if you are using it for taking pictures and videos or wants a lot of apps or songs, i recommend 36 or 64 gb. enjoy!",
    "So far I'm extremely disappointed with this iPod Touch. I find it much more difficult to use. I do not find it user friendly. I feel that I should have simply purchased an iPhone instead of this product. I upgrade from the classic iPod which is still in my opinion amazing and I will probably continue to use. So far I am not happy with this iPod Touch and would not recommend it to anyone",
    "received this iPod for Christmas (2016)! It works great and takes perfect photos. I love the fact that I can facetime with any other Apple phone. I'm still getting familiar with all the things I can do with it. The only downside is that it did not come with a charging block. Luckily, we have plenty extras. So far it works great and I am very pleased. The gold display is perfect. So, I am going to give this iPod five stars. Maddy 11 years old.",
    "I love my iPod touch so much! I use it for basically everything-even as a mini iPhone. But I do think the screen is a little small, but for now it will work-I'm getting an iPad-but I simply love it! If you are looking for a great device for your littler children or teens or tweens or even an adult I highly reccomend-and I have a tip for you-for your littler children, you probably don't want them to be on the internet sometimes when you can't supervise, there's a way to take Safari on and off. I really recommend this iPod to everyone, especially children!",
    "Battery life is really long. I just bought this iPod touch today and it's my first real iPod. I have a friend who has had this iPod for a long time and he loves his too. It is very small which can be a pro to some and a con to others (for me it's a con but I'll get used to it). I love that I can clear up the space from my iPhone 6s Plus by moving all my music and games over to this iPod and using the phone just for texting and calling."
]

In [27]:
vocab = set()
doc_term = defaultdict(dict)
for txt in txt_data:
    terms = [re.sub(r'[^\w\s]','',s).lower() for s in txt.split(' ')]
    vocab = vocab.union(terms)
    
for word in vocab:
    for doc_id in range(len(txt_data)):
        if doc_id not in doc_term.keys():
            doc_term[doc_id] = defaultdict(int)
        doc_term[doc_id][word] = 0

for word in vocab:
    for doc_id, txt in enumerate(txt_data):
        terms = [re.sub(r'[^\w\s]','',s).lower() for s in txt.split(' ')]
        if word in terms:
            doc_term[doc_id][word] = 1

In [39]:
w1 = 'battery'
N = len(doc_term.keys())

info_pairs = []
for w2 in vocab:
    p_X_w1_1 = 0
    p_X_w2_1 = 0
    p_X_w1_1_w2_1 = 0
    for doc_id in doc_term.keys():
        w1_present = False
        w2_present = False
        if doc_term[doc_id][w1] > 0:
            w1_present = True
            p_X_w1_1 += 1
        if doc_term[doc_id][w2] > 0:
            w2_present = True
            p_X_w2_1 += 1
        if w1_present and w2_present:
            p_X_w1_1_w2_1 += 1

    # P(X_w1 = 1)
    p_X_w1_1 = (p_X_w1_1 + 0.05) / (float(N) + 1)

    # P(X_w2 = 1)
    p_X_w2_1 = (p_X_w2_1 + 0.05) / (float(N) + 1)

    # P(X_w1=1, X_w2=1)
    p_X_w1_1_w2_1 = (p_X_w1_1_w2_1 + 0.025) / (float(N) + 1)

    # P(X_w1 = 0) = 1 - P(X_w1 = 1)
    p_X_w1_0 = 1. - p_X_w1_1

    # P(X_w2 = 0) = 1 - P(X_w2 = 1)
    p_X_w2_0 = 1. - p_X_w2_1

    # P(X_w1=1, X_w2=0) + P(X_w1=1, X_w2=1) = P(X_w1=1)
    p_X_w1_1_w2_0 = p_X_w1_1 - p_X_w1_1_w2_1

    # P(X_w1=0, X_w2=1) + P(X_w1=1, X_w2=1) = P(X_w2=1)
    p_X_w1_0_w2_1 = p_X_w2_1 - p_X_w1_1_w2_1

    # P(X_w1=0, X_w2=0) + P(X_w1=0, X_w2=1) = P(X_w1=0)
    p_X_w1_0_w2_0 = p_X_w1_0 - p_X_w1_0_w2_1
    
    I_w1_w2 = (p_X_w1_1_w2_1 * np.log2(p_X_w1_1_w2_1 / (p_X_w1_1 * p_X_w2_1))) +\
            (p_X_w1_0_w2_0 * np.log2(p_X_w1_0_w2_0 / (p_X_w1_0 * p_X_w2_0))) +\
            (p_X_w1_1_w2_0 * np.log2(p_X_w1_1_w2_0 / (p_X_w1_1 * p_X_w2_0))) +\
            (p_X_w1_0_w2_1 * np.log2(p_X_w1_0_w2_1 / (p_X_w1_0 * p_X_w2_1)))
    
    info_pairs.append((I_w1_w2, w2))

In [40]:
sorted(info_pairs, key=lambda x:x[0], reverse=True)[:10]

[(0.85751585655980955, 'has'),
 (0.85751585655980955, 'using'),
 (0.85751585655980955, 'battery'),
 (0.85751585655980955, 'by'),
 (0.85751585655980955, 'games'),
 (0.85751585655980955, 'long'),
 (0.85751585655980955, 'music'),
 (0.85751585655980955, 'life'),
 (0.85751585655980955, 'up'),
 (0.45039734091734007, 'do')]

In [30]:
w1 = 'christmas'
N = len(doc_term.keys())

# 1 0
#1
#0
info_pairs = []
for w2 in vocab:
    cont_mat = np.array([[0, 0], [0, 0]])
    for doc_id in doc_term.keys():
        if doc_term[doc_id][w1] > 0 and doc_term[doc_id][w2] > 0:
            cont_mat[0][0] += 1
        elif doc_term[doc_id][w1] > 0 and doc_term[doc_id][w2] == 0:
            cont_mat[0][1] += 1
        elif doc_term[doc_id][w1] == 0 and doc_term[doc_id][w2] > 0:
            cont_mat[1][0] += 1
        else:
            cont_mat[1][1] += 1
    info_pairs.append((mutual_info_score(None, None, contingency=cont_mat), w2))

In [31]:
sorted(info_pairs, key=lambda x:x[0], reverse=True)[:10]

[(0.50040242353818787, 'gold'),
 (0.50040242353818787, 'years'),
 (0.50040242353818787, 'pleased'),
 (0.50040242353818787, 'perfect'),
 (0.50040242353818787, 'only'),
 (0.50040242353818787, 'going'),
 (0.50040242353818787, 'familiar'),
 (0.50040242353818787, 'five'),
 (0.50040242353818787, 'works'),
 (0.50040242353818787, 'extras')]

In [32]:
N = len(doc_term.keys())

# 1 0
#1
#0
info_pairs = defaultdict(dict)
for w1 in vocab:
    for w2 in vocab:
        cont_mat = np.array([[0, 0], [0, 0]])
        for doc_id in doc_term.keys():
            if doc_term[doc_id][w1] > 0 and doc_term[doc_id][w2] > 0:
                cont_mat[0][0] += 1
            elif doc_term[doc_id][w1] > 0 and doc_term[doc_id][w2] == 0:
                cont_mat[0][1] += 1
            elif doc_term[doc_id][w1] == 0 and doc_term[doc_id][w2] > 0:
                cont_mat[1][0] += 1
            else:
                cont_mat[1][1] += 1
        mut_w1_w2 = mutual_info_score(None, None, contingency=cont_mat)
        info_pairs[w1][w2] = mut_w1_w2

In [65]:
N = len(doc_term.keys())

info_pairs = defaultdict(dict)
for w1 in vocab:
    for w2 in vocab:
        p_X_w1_1 = 0
        p_X_w2_1 = 0
        p_X_w1_1_w2_1 = 0
        for doc_id in doc_term.keys():
            w1_present = False
            w2_present = False
            if doc_term[doc_id][w1] > 0:
                w1_present = True
                p_X_w1_1 += 1
            if doc_term[doc_id][w2] > 0:
                w2_present = True
                p_X_w2_1 += 1
            if w1_present and w2_present:
                p_X_w1_1_w2_1 += 1

        # P(X_w1 = 1)
        p_X_w1_1 = (p_X_w1_1 + 0.5) / (float(N) + 1)

        # P(X_w2 = 1)
        p_X_w2_1 = (p_X_w2_1 + 0.5) / (float(N) + 1)

        # P(X_w1=1, X_w2=1)
        p_X_w1_1_w2_1 = (p_X_w1_1_w2_1 + 0.25) / (float(N) + 1)

        # P(X_w1 = 0) = 1 - P(X_w1 = 1)
        p_X_w1_0 = 1. - p_X_w1_1

        # P(X_w2 = 0) = 1 - P(X_w2 = 1)
        p_X_w2_0 = 1. - p_X_w2_1

        # P(X_w1=1, X_w2=0) + P(X_w1=1, X_w2=1) = P(X_w1=1)
        p_X_w1_1_w2_0 = p_X_w1_1 - p_X_w1_1_w2_1

        # P(X_w1=0, X_w2=1) + P(X_w1=1, X_w2=1) = P(X_w2=1)
        p_X_w1_0_w2_1 = p_X_w2_1 - p_X_w1_1_w2_1

        # P(X_w1=0, X_w2=0) + P(X_w1=0, X_w2=1) = P(X_w1=0)
        p_X_w1_0_w2_0 = p_X_w1_0 - p_X_w1_0_w2_1

        I_w1_w2 = (p_X_w1_1_w2_1 * np.log2(p_X_w1_1_w2_1 / (p_X_w1_1 * p_X_w2_1))) +\
                (p_X_w1_0_w2_0 * np.log2(p_X_w1_0_w2_0 / (p_X_w1_0 * p_X_w2_0))) +\
                (p_X_w1_1_w2_0 * np.log2(p_X_w1_1_w2_0 / (p_X_w1_1 * p_X_w2_0))) +\
                (p_X_w1_0_w2_1 * np.log2(p_X_w1_0_w2_1 / (p_X_w1_0 * p_X_w2_1)))

        info_pairs[w1][w2] = I_w1_w2

In [67]:
search_q = 'christmas'
search_terms = search_q.split(' ')
for s_t in search_terms:
    sorted_mut_infos = sorted(info_pairs[s_t].items(), key=lambda x:x[1], reverse=True)[:10]
    mut_assoc_words = [c[0] for c in sorted_mut_infos]
    print (mut_assoc_words)

['years', 'pleased', 'apple', 'perfect', 'gold', 'only', 'going', 'familiar', 'five', 'works']
