In [1]:
import pandas as pd
import numpy as np
import time

In [3]:
path = "E:/CS_Master_Degree_UIUC/CS410_Text_Information_system/Project/Project Submission/CourseProject/Dataset/"
context_units = pd.read_csv(path+"DBLP2000_context_units.csv")

In [4]:
context_units

Unnamed: 0,pattern,transaction_index,pattern_type
0,Ralf Steinmetz,"[70, 1323, 1325, 3362]",author
1,Cheng-Wen Wu,"[194, 196, 288, 1909]",author
2,Thomas S. Huang,"[652, 835, 1012, 1123]",author
3,Maja J. Mataric,"[660, 669, 2266, 2271]",author
4,K. Suzanne Barber,"[707, 1311, 2129, 2444, 2448]",author
...,...,...,...
175,robocup,"[1381, 2200, 2202, 2819]",title
176,contextu,"[138, 156, 793, 993]",title
177,intension,"[272, 312, 990, 1360]",title
178,nest,"[636, 2289, 3280, 3402]",title


In [5]:
D = pd.read_csv(path + "DBLP2000.csv") # Read the original transaction dataset
D

Unnamed: 0,author,title
0,Lothar Breuer,Operator-Geometric Solutions for the M/G/k Que...
1,"Christopher Lusena, Judy Goldsmith, Martin Mun...",Nonapproximability Results for Partially Obser...
2,"János Komlós, Ali Shokoufandeh, Miklós Simonov...",The Regularity Lemma and Its Applications in G...
3,Vijay V. Vazirani,Primal-Dual Schema Based Approximation Algorit...
4,"Isabel Fernández-Anta, Eva Millán, José-Luis P...",Adaptation and Generation in a Web-Based Lisp ...
...,...,...
3999,"Marek Teichmann, Bud Mishra",Probabilistic Algorithms for Efficient Graspin...
4000,Robert E. Filman,Arachnoid Tourist: 2020 Hindsight.
4001,"Lassaâd Gannoun, Julien Francioli, Stanislav C...",Domain Name eXchange: A Mobile-Agent-Based Sha...
4002,"Heecheol Jeon, Charles J. Petrie, Mark R. Cutk...",JATLite: A Java Agent Infrastructure with Mess...


In [6]:
# Suppose we use the context units themselves as FPs to 
# find out their individual weight vectors in the space made of themselves by 
# building weight matrix of pairwised context units. 
# Each weight between the pair of context unit patterns is the mutual information calculated by the formula per paper
# using the probabilities of four cases: 
# p11: prob of unit1 and unit2 both present in the transaction dataset D
# p10: prob of unit1 present and unit2 absent in the transaction dataset D
# p01: prob of unit1 absent and unit2 present in the transaction dataset D
# p00: of prob of neither unit1 nor unit2 prepsent in the transaction dataset D

In [7]:
D_size = len(D)
C_size = len(context_units)

# Initialize the matrix 
W = np.zeros([C_size, C_size])

for i in range(C_size):
    for j in range(C_size):
        ind1 = context_units['transaction_index'].iloc[i]
        ind2 = context_units['transaction_index'].iloc[j]
        intersection = set(ind1).intersection(ind2)
        #Calculate the probabilities with laplace smoothing
        p11 = (len(intersection)+0.25)/(D_size + 1)
        p01 = (len(ind2)- len(intersection) + 0.25)/(D_size + 1)
        p10 = (len(ind1)- len(intersection) + 0.25)/(D_size + 1)
        p00 = 1 - p11 - p01 - p10
        
        su1 = len(ind1)/D_size #support of u1
        su2 = len(ind2)/D_size #support of u2
        nu1 = 1-su1
        nu2 = 1-su2
        
        MI = p11*np.log10(p11/su1/su2) + \
             p01*np.log10(p01/nu1/su2) + \
             p10*np.log10(p10/su1/nu2) + \
             p00*np.log10(p00/nu1/nu2)
        W[i,j] = MI

In [8]:
W

array([[0.0050075 , 0.00327757, 0.0043857 , ..., 0.00445343, 0.00317912,
        0.00264941],
       [0.00327757, 0.00516234, 0.00385108, ..., 0.00390995, 0.00438971,
        0.00322972],
       [0.0043857 , 0.00385108, 0.0050075 , ..., 0.00385108, 0.00373678,
        0.00264941],
       ...,
       [0.00445343, 0.00390995, 0.00385108, ..., 0.00516234, 0.00379559,
        0.00322972],
       [0.00317912, 0.00438971, 0.00373678, ..., 0.00379559, 0.00486237,
        0.00313133],
       [0.00264941, 0.00322972, 0.00264941, ..., 0.00322972, 0.00313133,
        0.00368134]])

In [9]:
Weights = pd.DataFrame(W, columns = context_units["pattern"], index = context_units["pattern"])
Weights

pattern,Ralf Steinmetz,Cheng-Wen Wu,Thomas S. Huang,Maja J. Mataric,K. Suzanne Barber,Josef Kittler,Gerald Sommer,Edwin R. Hancock,Masaru Kitsuregawa,Roberto Gorrieri,...,warp,movement,trade,reusabl,phylogeni,robocup,contextu,intension,nest,distribut resourc
pattern,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ralf Steinmetz,0.005007,0.003278,0.004386,0.003792,0.003458,0.003737,0.004386,0.004574,0.004262,0.004524,...,0.003227,0.003914,0.003119,0.004322,0.003792,0.003134,0.003914,0.004453,0.003179,0.002649
Cheng-Wen Wu,0.003278,0.005162,0.003851,0.003851,0.004071,0.002692,0.004453,0.004651,0.004330,0.004026,...,0.003851,0.002292,0.003688,0.003796,0.003851,0.003743,0.003382,0.003910,0.004390,0.003230
Thomas S. Huang,0.004386,0.003851,0.005007,0.003227,0.003458,0.003737,0.004386,0.004574,0.004262,0.004524,...,0.003792,0.003331,0.003119,0.003737,0.003792,0.003684,0.003914,0.003851,0.003737,0.002649
Maja J. Mataric,0.003792,0.003851,0.003227,0.004386,0.003458,0.002649,0.004386,0.004003,0.003684,0.003958,...,0.002690,0.002779,0.003119,0.004322,0.003792,0.003134,0.003331,0.004453,0.003179,0.002649
K. Suzanne Barber,0.003458,0.004071,0.003458,0.003458,0.004684,0.003402,0.004003,0.004684,0.004434,0.004092,...,0.003458,0.002531,0.003241,0.003402,0.004574,0.003880,0.003579,0.004071,0.003940,0.003402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
robocup,0.003134,0.003743,0.003684,0.003134,0.003880,0.003086,0.003684,0.003880,0.003576,0.003835,...,0.003134,0.002190,0.002550,0.003086,0.004262,0.004138,0.003238,0.003743,0.003629,0.003086
contextu,0.003914,0.003382,0.003914,0.003331,0.003579,0.003858,0.004526,0.004734,0.003806,0.004684,...,0.003914,0.004035,0.003759,0.003858,0.003914,0.003238,0.005328,0.003973,0.003283,0.002738
intension,0.004453,0.003910,0.003851,0.004453,0.004071,0.003230,0.004453,0.004651,0.004330,0.004601,...,0.003278,0.003382,0.003178,0.004390,0.004453,0.003743,0.003973,0.005162,0.003796,0.003230
nest,0.003179,0.004390,0.003737,0.003179,0.003940,0.002609,0.003737,0.004502,0.004198,0.003895,...,0.004322,0.002222,0.003064,0.003131,0.003737,0.003629,0.003283,0.003796,0.004862,0.003131


In [None]:
output_path = path
Weights.to_csv(output_path + "Context_units_weights.csv")