In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

In [3]:
data = pd.read_excel("EFTSTUDY16.xlsx", sheet_name='Sheet1')

In [4]:
data.columns

Index(['ID's', 'EFT/ERT', 'AUC EFT/ERT', 'BAUC', 'DELAY', 'IP', 'IP value',
       'cue original', 'cue_spellcheck'],
      dtype='object')

In [5]:
IP_values = pd.DataFrame(data['IP value'])
all_cues = pd.DataFrame(data['cue_spellcheck'])
category = pd.DataFrame(data['EFT/ERT'])

In [6]:
eft_data = []
ert_data = []


In [7]:
for i in category.index:
    if category['EFT/ERT'][i] == "EFT":
        eft_data.append([all_cues['cue_spellcheck'][i], IP_values['IP value'][i]])
    else:
        ert_data.append([all_cues['cue_spellcheck'][i], IP_values['IP value'][i]])

In [8]:
len(eft_data), len(ert_data)

(219, 228)

In [9]:
def preprocess(individual_data):
    tokens = []
    for item in individual_data:
        temp = word_tokenize(item)
        for word in temp:
            tokens.append(word)
        
    words = [w.lower() for w in tokens]
    
    porter = nltk.PorterStemmer()
    stemmed_tokens = [porter.stem(t) for t in words]
    
    #Removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in stemmed_tokens if not w in stop_words]
    
    #count words
    count = nltk.defaultdict(int)
    for word in filtered_tokens:
        count[word] += 1
    
    return count

In [10]:
def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    
    return dot_product/(norm_a * norm_b)

In [20]:
def getSimilarity(dict1, dict2):
    all_words_list = []
    for key in dict1:
        all_words_list.append(key)
    for key in dict2:
        all_words_list.append(key)
    
    
    v1 = np.zeros(len(all_words_list), dtype=np.int)
    v2 = np.zeros(len(all_words_list), dtype=np.int)
    
    i=0
    
    for (key) in all_words_list:
        v1[i] = dict1.get(key, 0)
        v2[i] = dict2.get(key, 0)
        i = i+1
    
#     print(v1, "\n\n\n", v2)
    return cos_sim(v1, v2)

In [23]:
similarity_index = []

for i in range(0, len(eft_data), i+3):
    ind_eft_data = []
    ind_ert_data = []
    IP_eft_values = []
    IP_ert_values = []
    
    for j in range(3):
        ind_eft_data.append(eft_data[i+j][0])
        ind_ert_data.append(ert_data[i+j][0])
        IP_eft_values.append(eft_data[i+j][1])
        IP_ert_values.append(ert_data[i+j][1])

    cue_eft_dict = preprocess(ind_eft_data)
    cue_ert_dict = preprocess(ind_ert_data)
    

#     print("EFT:  \n",IP_eft_values, "\n\n\n")
#     print("ERT: \n", IP_ert_values, "\n\n\n")
    
    cue_similarity = getSimilarity(cue_eft_dict, cue_ert_dict)
    IP_similarity = cos_sim(IP_eft_values, IP_ert_values)
    print(cue_similarity, IP_similarity, "\n\n")
    similarity_index.append(list([i, cue_similarity, IP_similarity]))

0.7320654489029711 0.961477958664592 


0.5389485627682215 0.843258436122533 


0.2169033791350711 0.8474210602882235 


0.28072655189149287 0.9967600557071877 


0.7201516119631453 0.9475557965597522 


0.6694000304026254 0.6501795057173233 


0.5328653210997697 0.9556377987087168 


0.3893661821726164 0.8367714208646314 


0.3700919289219636 0.8586119247748355 


0.6196834557538381 0.9540113953516025 


0.6626516174931566 0.9478012804403075 


0.6945370831998118 0.8574455208722735 


0.6591078530690873 0.865606073316307 


0.4244887253698285 0.893261999923505 


0.5535340874468364 0.9808393721203469 


0.6719944224097083 0.8773287975215743 


0.5806073971876431 0.8523734421066558 


0.19743348163446187 0.994803831137775 


0.2343718211502857 0.9999923249670243 


0.33348105827675406 0.9033442856673265 


0.5346752700174289 0.9486612027854415 


0.566646199504299 0.7124783878237405 


0.22892103967034522 0.889599803863362 


0.2617892890103688 0.9078646092777198 


0.5329193371241866 

In [24]:
similarity_index = np.array(similarity_index, dtype=np.float32)
similarity_index.shape

(73, 3)

In [25]:
print(eft_data[0][1],eft_data[1][1],eft_data[2][1], "\n\n", ert_data[0][1], ert_data[1][1], ert_data[2][1])

49.22 49.22 11.72 

 49.22 46.1 32.04


In [26]:
df = pd.DataFrame(similarity_index)

In [27]:
df.columns = ['Individual S/No', 'EFT-ERT Cosine Similarity', 'IP Values Cosine Similarity']
df

Unnamed: 0,Individual S/No,EFT-ERT Cosine Similarity,IP Values Cosine Similarity
0,0.0,0.732065,0.961478
1,3.0,0.538949,0.843258
2,6.0,0.216903,0.847421
3,9.0,0.280727,0.996760
4,12.0,0.720152,0.947556
...,...,...,...
68,204.0,0.511041,0.989963
69,207.0,0.736949,0.986317
70,210.0,0.249486,0.902403
71,213.0,0.345537,0.976359


In [28]:
print ("Max values: \n")
print(df.idxmax(axis=0), "\n\n", df.max(axis=0))

Max values: 

Individual S/No                72
EFT-ERT Cosine Similarity      69
IP Values Cosine Similarity    18
dtype: int64 

 Individual S/No                216.000000
EFT-ERT Cosine Similarity        0.736949
IP Values Cosine Similarity      0.999992
dtype: float32


In [32]:
eft_cues = []
IP_eft_all = []
ert_cues = []
IP_ert_all = []

for i in range(len(eft_data)):
    eft_cues.append(eft_data[i][0])
    IP_eft_all.append(eft_data[i][1])
    ert_cues.append(ert_data[i][0])
    IP_ert_all.append(ert_data[i][1])

In [35]:
all_eft_dict = preprocess(eft_cues)
all_ert_dict = preprocess(ert_cues)

similar = getSimilarity(all_eft_dict, all_ert_dict)
IP_sim = cos_sim(IP_eft_all, IP_ert_all)

In [38]:
print("{0:.3f}" .format(similar), "  {0:.3f}" .format(IP_sim))

0.696   0.816
