In [1]:
import csv
import pickle
from collections import defaultdict

import editdistance
import numpy as np
import pandas as pd
from minisom import MiniSom
from scipy.spatial import distance
from sklearn import preprocessing
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import os
# os.chdir("C:\\Users\\ameet.chaubal\\Documents\\source\\text-anal")

In [3]:
x=6
y=6
iterations=5000
normalized_yn="N"
edit_distance = {}
cosine_distance = {}
euc_distance = {}

In [4]:

dataFeaturePath = './data/DataFeatures_Train.pickle'
test_featurePath = './data/DataFeatures_Match.pickle'

In [5]:
DataFeatures = pickle.load(open(dataFeaturePath, 'rb'))
TestFeatures = pickle.load(open(test_featurePath, 'rb'))
dims=len(list(DataFeatures.values())[0])

In [6]:
def convert_to_np(dict_vals):
    f = list(dict_vals.values())
    print ("Rows:{} cols:{}".format(len(f),len(f[0])))
    n_f = np.array(f)
    print (">>np features:{}".format(n_f.shape))
    return n_f

In [7]:
np_train_features = convert_to_np(DataFeatures)
np_test_features = convert_to_np(TestFeatures)

Rows:85 cols:20
>>np features:(85, 20)
Rows:76 cols:20
>>np features:(76, 20)


### Choose normalized or raw data

In [8]:
if normalized_yn=="N":
    normalized_feat = np_train_features
else:
    min_max_scaler = preprocessing.MinMaxScaler()
    normalized_feat = min_max_scaler.fit_transform(np_train_features)
    print (">>normalized features:{}".format(normalized_feat.shape))

In [9]:
# Combine keys and normalized features into dict
normalized_features_dict=dict(zip(DataFeatures.keys(),normalized_feat))
print ("Normalized features: rows={} cols={}".
       format(len(normalized_features_dict),len(normalized_features_dict['tr_hcp_footnote'])))

Normalized features: rows=85 cols=20


In [10]:
som = MiniSom(x, y, dims, sigma=0.3, learning_rate=0.5)
print ("Training...")
som.train_random(normalized_feat, iterations,verbose=True)
print ("...ready!")

Training...
 [    0 / 5000 ]   0% - ? it/s [    0 / 5000 ]   0% - ? it/s [    1 / 5000 ]   0% - 0:00:05 left  [    2 / 5000 ]   0% - 0:00:05 left  [    3 / 5000 ]   0% - 0:00:03 left  [    4 / 5000 ]   0% - 0:00:02 left  [    5 / 5000 ]   0% - 0:00:03 left  [    6 / 5000 ]   0% - 0:00:02 left  [    7 / 5000 ]   0% - 0:00:02 left  [    8 / 5000 ]   0% - 0:00:02 left  [    9 / 5000 ]   0% - 0:00:02 left  [   10 / 5000 ]   0% - 0:00:02 left  [   11 / 5000 ]   0% - 0:00:02 left  [   12 / 5000 ]   0% - 0:00:02 left  [   13 / 5000 ]   0% - 0:00:02 left  [   14 / 5000 ]   0% - 0:00:02 left  [   15 / 5000 ]   0% - 0:00:02 left  [   16 / 5000 ]   0% - 0:00:03 left  [   17 / 5000 ]   0% - 0:00:02 left  [   18 / 5000 ]   0% - 0:00:02 left  [   19 / 5000 ]   0% - 0:00:02 left  [   20 / 5000 ]   0% - 0:00:02 left  [   21 / 5000 ]   0% - 0:00:02 left  [   22 / 5000 ]   0% - 0:00:02 left  [   23 / 5000 ]   0% - 0:00:02 left  [   24 / 5000 ]   0% - 0:00:02 left  [   25 / 500

#### Predicting Cluster ID for each feature key & generating map of cluster ID => list of feature value vectors

In [11]:
attribute_cluster_map={}
cluster_data_vector_map=defaultdict(list)
cluster_attrib_map=defaultdict(list)
predicted_clusters=[]
for key,data in normalized_features_dict.items():
    winid = som.winner(data)
    clusterid=np.ravel_multi_index(winid,(x,y))
    predicted_clusters.append(clusterid)
    attribute_cluster_map[key]=clusterid
    cluster_attrib_map[clusterid].append(key)
    cluster_data_vector_map[clusterid].append(data)

In [12]:
print("Total uniq clusters:{}".format(len(cluster_data_vector_map)))
cluster_center={}
for k,v in cluster_data_vector_map.items():
    center  = [sum(j)/len(v) for j in zip(*v)]
    cluster_center[k]=center

Total uniq clusters:17


In [13]:
# for k,v in cluster_attrib_map.items():
#     print("{} =>{}".format(k,v))

#### Finding a match for each Test feature

In [14]:
test_attrib_clusterid_map={}
for k,v in TestFeatures.items():
    eudistance = []
    min_dist = 9000000
    for centerID, center in cluster_center.items():
        eudistance.append(distance.euclidean(v, center))
        min_d = min(eudistance)
        if min_d < min_dist:
            min_dist = min_d
            test_attrib_clusterid_map[k] = centerID

#### Map of test attribute and corresponding train attributes possibility

In [15]:
# for k,v in test_attrib_clusterid_map.items():
#     print ("{} => {}:{}\n".format(k,v, cluster_attrib_map[v]))

In [16]:
sil_score= silhouette_score(list(DataFeatures.values()),predicted_clusters)

In [17]:
def calculate_edit_distance(test_name, train_names_features):
    global edit_distance
    edit_distance[test_name] = {}
    for name in train_names_features:
        edit_distance[test_name][name] = editdistance.eval(test_name, name)


In [18]:
def cosine_euc_distance(test_name, test_feature, train_name_features):
    cosine_distance[test_name] = {}
    euc_distance[test_name] = {}
    for name in train_name_features.keys():
        # print "shape of test_features:%s train_name_features:%s => name:%s" % 
        # test_feature.shape, train_name_features[
        #     name].shape, name
        test_nd = np.asarray(test_feature).reshape(1, -1)
        train_nd = np.asarray(train_name_features[name]).reshape(1, -1)
        cosine_distance[test_name][name] = cosine_similarity(test_nd,train_nd)[0][0]
        euc_distance[test_name][name] = distance.euclidean(
                                                test_feature, train_name_features[name])


In [19]:
def cal_probability(distance_dic, dic2, dic3):
    prob_list=[]
    file_path = open('./results/avg_prob__'+str(x)+"_"+str(y)+".csv", 'w')
    Out = csv.writer(file_path, delimiter=',')
    new_row = ['test_attribute', 'train_attribute', 'distance', 'avg_probability']
    Out.writerow(new_row)
    prob_list.append(new_row)
    for test_attribute in distance_dic.keys():
        total = sum(distance_dic[test_attribute].values())
        total2 = sum(dic2[test_attribute].values())
        total3 = sum(dic3[test_attribute].values())

        for train_attribute in distance_dic[test_attribute].keys():
            prob = 1 - (distance_dic[test_attribute][train_attribute] / float(total))
            prob2 = 1 - (dic2[test_attribute][train_attribute] / float(total2))
            prob3 = 0 if total3==0 else 1 - (dic3[test_attribute][train_attribute] / float(total3))
            avg_dist = (distance_dic[test_attribute][train_attribute]+
                        dic2[test_attribute][train_attribute]+
                        dic3[test_attribute][train_attribute])/float(3)
            avg_prob = (prob+prob2+prob3)/float(3.0)
            new_row = [test_attribute, train_attribute, avg_dist, avg_prob]
            prob_list.append(new_row)
            Out.writerow(new_row)
    file_path.close()
    return prob_list

In [20]:
def pick_winner(prob_list, file_name):
    probDF = pd.DataFrame(prob_list[1:], columns=prob_list[0])
    print ("Probability Dataframe imported:{} ".format(probDF.shape))
    min_distDF = probDF[probDF['distance'] == probDF.groupby(['test_attribute'])['distance'].transform(min)][
        ['test_attribute', 'train_attribute', 'distance']].rename(columns={'train_attribute': 'mind_train_attribute'})
    max_probDF = \
        probDF[probDF['avg_probability'] == probDF.groupby(['test_attribute'])['avg_probability'].transform(max)][
            ['test_attribute', 'train_attribute', 'avg_probability']].rename(
            columns={'train_attribute': 'prob_train_attribute'})
    allDF = pd.merge(max_probDF, min_distDF, on=['test_attribute'])
    allDF['possible_train_attribute'] = allDF.apply(lambda r:
                                                    r['mind_train_attribute']
                                                    if r['prob_train_attribute'] == r[
                                                        'mind_train_attribute']
                                                    else ','.join(
                                                        [r['mind_train_attribute'], r['prob_train_attribute']]), axis=1)
    allDF = allDF[['test_attribute', 'distance', 'avg_probability', 'possible_train_attribute']]
    allDF.to_csv(file_name, index=False, sep=',')    

In [21]:
print ("test_attrib size:{} train attrib_clus size:{}".
       format(len(test_attrib_clusterid_map), len(attribute_cluster_map)))  
for key, val in test_attrib_clusterid_map.items():
    train_names = []
    train_name_features = {}
    for k, v in attribute_cluster_map.items():
        if val == v:
            train_names.append(k)
            train_name_features[k] = DataFeatures[k]
#     print("for key:{} \ntrainnames:{}\ntrain_feature:{}".
#           format(key,train_names,train_name_features))
    calculate_edit_distance(key, train_names)
    cosine_euc_distance(key, TestFeatures[key], train_name_features)

test_attrib size:72 train attrib_clus size:85


#### Calculate distance based probabilities

In [22]:
avg_prob = cal_probability(edit_distance, cosine_distance, euc_distance)

### Results

In [23]:
print("Cluster:[{},{}]\n{}\nTotal Uniq Clusters:{}\nSilhouetteScore:{}".
      format(x,y,"="*20,len(cluster_data_vector_map),sil_score))

Cluster:[6,6]
Total Uniq Clusters:17
SilhouetteScore:0.7788908797943188


### Final match file

In [24]:
file_name="./results/test_train_match__"+str(x)+"-"+str(y)+".csv"
matchDF = pick_winner(avg_prob,file_name)

Probability Dataframe imported:(816, 4) 
