In [75]:
import os;
import sys;
import numpy as np;
from sklearn.linear_model import LinearRegression;
from sklearn.linear_model import ElasticNet;
from sklearn.linear_model import Lasso;

In [2]:
def read_sudha_format( fname):
    """Load sense vectors from a space-separated text file.

    The first line of the file is skipped. Every following non-blank line
    must have the form ``word@pos@sid v1 v2 ...``: the first token is split
    on ``'@'`` into exactly three parts and the remaining tokens are parsed
    as floats.

    Long part-of-speech names are mapped to single letters
    (noun -> n, adjective/adj -> a, verb -> v, adverb/adv -> r); any other
    tag is kept unchanged.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Returns
    -------
    dict
        Maps ``'<sid>_<pos>'`` to a 1-D numpy float array. If the same key
        occurs more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a key does not split into three '@'-separated parts or a value
        token is not a valid float.
    """
    pos_map = {
        'noun': 'n',
        'adjective': 'a', 'adj': 'a',
        'verb': 'v',
        'adverb': 'r', 'adv': 'r',
    }
    model = dict()
    # The context manager closes the file even when a line fails to parse.
    with open( fname, 'r') as f:
        next( f, None)  # skip the first line
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no vector.
                continue
            tokens = stripped.split( ' ')
            word, pos, sid = tokens[ 0].split( '@')
            pos = pos_map.get( pos, pos)
            model[ sid + '_' + pos] = np.asarray( [ float( token) for token in tokens[ 1:]])
    return model

In [3]:
# Load the English sense-vector table.
fprefix = '/home/development/kevin/workspace/linking/data/'
fname = os.path.join( fprefix, 'english-sense-vector_sb.txt')
eng_m = read_sudha_format( fname)

In [4]:
# Number of entries in the English table.
print( len( eng_m));

48397


In [5]:
# Remove English vectors whose L2 norm is below 1e-6.
# Keys are collected first because a dict cannot shrink while it is iterated.
to_remove = []
for key, val in eng_m.items():
    if np.linalg.norm( val) < 1e-6:
        to_remove.append( key)
for elem in to_remove:
    eng_m.pop( elem)

In [6]:
# Number of entries currently in the English table.
print( len( eng_m));

48358


In [7]:
# Load the Hindi sense-vector table.
fprefix = '/home/development/kevin/workspace/linking/data/'
fname = os.path.join( fprefix, 'hindi_cbow_50_5_10_sense.txt')
hin_m = read_sudha_format( fname)

In [8]:
# Number of entries in the Hindi table.
print( len( hin_m));

29884


In [9]:
# Remove Hindi vectors whose L2 norm is below 1e-6.
# Keys are collected first because a dict cannot shrink while it is iterated.
to_remove = []
for key, val in hin_m.items():
    if np.linalg.norm( val) < 1e-6:
        to_remove.append( key)
for elem in to_remove:
    hin_m.pop( elem)

In [10]:
# Number of entries currently in the Hindi table.
print( len( hin_m));

29884


In [13]:
# One empty sub-table per part-of-speech tag, for each language.
pos_list = [ 'a', 'n', 'r', 'v']
hin_dict, eng_dict = {}, {}
for pos in pos_list:
    hin_dict[ pos], eng_dict[ pos] = {}, {}

In [14]:
# Distribute every vector into its per-POS sub-table.
# The tag is taken from the text after the LAST underscore of the key rather
# than by a substring test over the whole key, so a sense id that happens to
# contain one of the letters a/n/r/v is not filed under the wrong (or under
# several) tags. Keys whose tag has no sub-table are left out.
for model, by_pos in ( ( hin_m, hin_dict), ( eng_m, eng_dict)):
    for key, vec in model.items():
        pos_tag = key.rsplit( '_', 1)[ -1]
        if pos_tag in by_pos:
            by_pos[ pos_tag][ key] = vec

In [17]:
def read_link_file( lfname):
    """Read a tab-separated link file into a list of id pairs.

    For every non-blank line the first and third tab-separated columns are
    kept; the second column is ignored.

    Parameters
    ----------
    lfname : str
        Path of the link file.

    Returns
    -------
    list of tuple
        ``(column_0, column_2)`` string pairs in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three columns.
    """
    link_data = []
    # The context manager closes the file even if a malformed line raises.
    with open( lfname, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) hold no link.
                continue
            tokens = stripped.split( '\t')
            link_data.append( ( tokens[ 0], tokens[ 2]))
    return link_data

In [18]:
# Load the direct noun links.
lfprefix = '/home/development/kevin/workspace/linking/data/links/'
lfname = os.path.join( lfprefix, 'noun_direct.links')
dn_link_list = read_link_file( lfname)

In [20]:
# Number of link pairs read from the noun link file.
print( len( dn_link_list));

11493


In [47]:
def filter_link_list( link_list, src_model, tgt_model, pos):
    """Keep the links whose two endpoints both have a vector.

    Each link's first and second elements are converted to the key format
    ``'<id>_<pos>'``. A link survives only if its source key is in
    ``src_model`` and its target key is in ``tgt_model``.

    Returns a list of ``(source_key, target_key)`` tuples in input order.
    """
    keyed_pairs = (
        ( str( link[ 0]) + '_' + pos, str( link[ 1]) + '_' + pos)
        for link in link_list
    )
    return [
        pair for pair in keyed_pairs
        if pair[ 0] in src_model and pair[ 1] in tgt_model
    ]

In [51]:
# Keep only the noun links whose source key is in the Hindi noun table and
# whose target key is in the English noun table.
filtered_dn_link_list = filter_link_list( dn_link_list, hin_dict[ 'n'], eng_dict[ 'n'], 'n');

In [52]:
# Number of links that survived the filtering.
print( len( filtered_dn_link_list));

7063


In [73]:
def train_s2t( train_list, inp_m, out_m, pos = 'n', bias_p = False):
    """Fit one ElasticNet regressor per output dimension.

    Parameters
    ----------
    train_list : sequence of (input_key, output_key) pairs
        Training links; each key is looked up in ``inp_m`` / ``out_m``.
    inp_m, out_m : mapping
        Key -> vector tables for the input and output side.
    pos : str
        Accepted for signature compatibility; not used here.
    bias_p : bool
        Whether each regressor fits an intercept term. It is forwarded as
        ``fit_intercept``; previously the flag was accepted but ignored.

    Returns
    -------
    list
        Fitted ``ElasticNet`` models; element ``i`` predicts output
        dimension ``i`` from a full input vector.

    Raises
    ------
    ValueError
        If ``train_list`` is empty.
    KeyError
        If a key of ``train_list`` is missing from its table.
    """
    if len( train_list) == 0:
        raise ValueError( 'train_list is empty; nothing to fit')
    i_data = np.asarray( [ inp_m[ k1] for k1, _ in train_list], dtype = 'f8')
    o_data = np.asarray( [ out_m[ k2] for _, k2 in train_list], dtype = 'f8')
    dim = len( o_data[ 0])
    dmodel_list = []
    print( 'Training', flush = True)
    for i in range( dim):
        # Each output coordinate is regressed independently on the whole
        # input vector. (LinearRegression and Lasso( alpha = 0.001) were the
        # alternatives tried here before.)
        cur_model = ElasticNet( alpha = 0.01, l1_ratio = 0.7, fit_intercept = bias_p)
        cur_model.fit( i_data, o_data[ :, i])
        dmodel_list.append( cur_model)
    return dmodel_list

In [67]:
def apply_model( model_list, test_list, inp_m, out_m, pos = 'n'):
    """Predict output vectors for the input side of ``test_list``.

    Parameters
    ----------
    model_list : sequence
        One fitted model per output dimension; each must offer
        ``predict( X)`` returning one value per row of ``X``.
    test_list : sequence of (input_key, output_key) pairs
        Links to translate. The output vectors are only used to determine
        the output dimensionality.
    inp_m, out_m : mapping
        Key -> vector tables for the input and output side.
    pos : str
        Accepted for signature compatibility; not used here.

    Returns
    -------
    numpy.ndarray
        Array of shape ``( len( test_list), dim)``; column ``i`` holds the
        predictions of ``model_list[ i]``. The result is 2-D even when
        ``dim`` is 1.

    Raises
    ------
    KeyError
        If a key of ``test_list`` is missing from its table.
    IndexError
        If ``test_list`` is empty or ``model_list`` has fewer than ``dim``
        entries.
    """
    i_data = np.array( [ inp_m[ k1] for k1, _ in test_list], dtype = 'f8')
    ovlist = [ out_m[ k2] for _, k2 in test_list]
    dim = len( ovlist[ 0])
    print( 'Translating', flush = True)
    ypred = [ model_list[ i].predict( i_data) for i in range( dim)]
    # Stack all columns in one call: no quadratic re-copying from growing
    # the matrix column by column, and a single column still yields 2-D.
    return np.column_stack( ypred)

In [68]:
def get_acc( test_list, inp_m, out_m, pred_data, pos = 'n'):
    """Rank every target vector by cosine similarity and report top-k accuracy.

    Every row of ``pred_data`` is compared with all vectors of ``out_m``;
    the 0-based position of the true target (second element of the matching
    ``test_list`` entry) in the descending similarity order is its rank.

    The fraction of rows with rank below k is printed for
    k = 1, 3, 5, 8, 10, 20, 50, 100 (one value per line), followed by a line
    with the best rank, the worst rank and the number of candidate vectors.

    Parameters
    ----------
    test_list : sequence
        Entries whose element ``[ 1]`` is the key of the true target.
    inp_m : mapping
        Accepted for signature compatibility; not used here.
    out_m : mapping
        Key -> vector table that forms the candidate set.
    pred_data : numpy.ndarray
        2-D array with one predicted vector per ``test_list`` entry.
    pos : str
        Accepted for signature compatibility; not used here.

    Returns
    -------
    dict
        Maps each k to the corresponding top-k accuracy (the same values
        that are printed), so callers can aggregate over folds.
    """
    print( 'Sim Scoring', flush = True)
    keys = list( out_m)
    k2id = { key: idx for idx, key in enumerate( keys)}
    o_data = np.asarray( [ out_m[ key] for key in keys], dtype = 'f8')
    # Unit-normalise the rows so that the dot product is cosine similarity.
    no_data = o_data / np.linalg.norm( o_data, axis = 1, keepdims = True)
    npred_data = pred_data / np.linalg.norm( pred_data, axis = 1, keepdims = True)
    sim_mat = npred_data.dot( no_data.T)
    # Candidate indices per row, most similar first.
    trim_ind_mat = np.argsort( sim_mat)[ :, ::-1]
    print( 'Ranking', flush = True)
    indx = []
    for i in range( sim_mat.shape[ 0]):
        true_id = k2id[ test_list[ i][ 1]]
        indx.append( int( np.nonzero( trim_ind_mat[ i] == true_id)[ 0][ 0]))
    total = len( indx)
    acc = dict()
    for k in ( 1, 3, 5, 8, 10, 20, 50, 100):
        acc[ k] = sum( rank < k for rank in indx) / total
        print( acc[ k])
    print( min( indx), max( indx), len( keys))
    return acc

In [39]:
def cross_validation( inp_m, out_m, link_list, pos = 'n', bias_p = False, n_folds = 10):
    """Run k-fold cross-validation of the vector-translation model.

    ``link_list`` is cut into ``n_folds`` contiguous folds of
    ``len( link_list) // n_folds`` items each; the last fold also takes the
    remainder. The list is used in the given order (no shuffling). For each
    fold a model is trained on the other folds with ``train_s2t``, applied
    to the held-out fold with ``apply_model`` and scored with ``get_acc``,
    which prints the results.

    Parameters
    ----------
    inp_m, out_m : mapping
        Key -> vector tables for the input and output side.
    link_list : sequence of (input_key, output_key) pairs
        All available links.
    pos : str
        Forwarded to the helper functions.
    bias_p : bool
        Forwarded to ``train_s2t``.
    n_folds : int
        Number of folds; the default of 10 matches the previous fixed value.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError( 'n_folds must be at least 1')
    total = len( link_list)
    fold_size = total // n_folds
    for fold_ptr in range( n_folds):
        print( 'Running fold ', fold_ptr, flush = True)
        beg = fold_size * fold_ptr
        # The last fold absorbs the items left over by the integer division.
        end = total if fold_ptr == n_folds - 1 else beg + fold_size
        test_list = list( link_list[ beg : end])
        train_list = list( link_list[ : beg]) + list( link_list[ end :])
        dmodel_list = train_s2t( train_list, inp_m, out_m, pos, bias_p)
        pred_data = apply_model( dmodel_list, test_list, inp_m, out_m, pos)
        get_acc( test_list, inp_m, out_m, pred_data, pos)

In [74]:
# Cross-validate on the filtered noun links: Hindi noun vectors are the
# input side, English noun vectors the output side.
p = 'n';
cross_validation( hin_dict[ p], eng_dict[ p], filtered_dn_link_list, p, bias_p=True);

Running fold  0
Training
Translating
Sim Scoring
Ranking
0.0028328611898017
0.00424929178470255
0.007082152974504249
0.0113314447592068
0.012747875354107648
0.018413597733711047
0.04107648725212465
0.06940509915014165
0 27429 28056
Running fold  1
Training
Translating
Sim Scoring
Ranking
0.00141643059490085
0.0028328611898017
0.0028328611898017
0.0028328611898017
0.0056657223796034
0.019830028328611898
0.049575070821529746
0.06232294617563739
0 27976 28056
Running fold  2
Training
Translating
Sim Scoring
Ranking
0.0
0.0
0.00424929178470255
0.0056657223796034
0.0056657223796034
0.007082152974504249
0.0169971671388102
0.026912181303116147
3 27403 28056
Running fold  3
Training
Translating
Sim Scoring
Ranking
0.00141643059490085
0.0028328611898017
0.007082152974504249
0.0084985835694051
0.009915014164305949
0.0113314447592068
0.015580736543909348
0.031161473087818695
0 28050 28056
Running fold  4
Training
Translating
Sim Scoring
Ranking
0.0028328611898017
0.0028328611898017
0.004249291784

In [72]:
# Same call as in the preceding cell. NOTE(review): this cell's recorded
# execution count (72) is lower than that of the train_s2t definition cell
# (73), so the output stored below presumably came from an earlier version
# of the training code and is not reproduced by a top-to-bottom run.
p = 'n';
cross_validation( hin_dict[ p], eng_dict[ p], filtered_dn_link_list, p, bias_p=True);

Running fold  0
Training
Translating
Sim Scoring
Ranking
0.026912181303116147
0.049575070821529746
0.0708215297450425
0.10764872521246459
0.11614730878186968
0.1671388101983003
0.2507082152974504
0.32152974504249293
0 25566 28056
Running fold  1
Training
Translating
Sim Scoring
Ranking
0.0339943342776204
0.06373937677053824
0.08073654390934844
0.10056657223796034
0.11614730878186968
0.15722379603399433
0.22946175637393768
0.311614730878187
0 25684 28056
Running fold  2
Training
Translating
Sim Scoring
Ranking
0.0113314447592068
0.029745042492917848
0.04107648725212465
0.05807365439093484
0.06373937677053824
0.09348441926345609
0.14305949008498584
0.19688385269121814
0 25909 28056
Running fold  3
Training


KeyboardInterrupt: 

In [69]:
p = 'n';
# Same experiment with bias_p left at its default. NOTE(review): recorded
# execution count 69 is lower than that of the train_s2t definition cell
# (73), so the stored output presumably came from an earlier version of the
# training code.
cross_validation( hin_dict[ p], eng_dict[ p], filtered_dn_link_list, p);

Running fold  0
Training
Translating
Sim Scoring
Ranking
0.0
0.0
0.0
0.0
0.0
0.0
0.0028328611898017
0.0084985835694051
26 28026 28056
Running fold  1
Training
Translating
Sim Scoring
Ranking
0.0
0.0
0.0
0.0
0.0
0.0
0.00424929178470255
0.00424929178470255
23 28044 28056
Running fold  2
Training
Translating
Sim Scoring
Ranking
0.0
0.0
0.00141643059490085
0.00141643059490085
0.00141643059490085
0.00141643059490085
0.00141643059490085
0.0056657223796034
4 27942 28056
Running fold  3
Training
Translating
Sim Scoring
Ranking
0.0
0.0
0.0
0.0
0.0
0.0
0.0028328611898017
0.00424929178470255
24 28048 28056
Running fold  4
Training
Translating
Sim Scoring
Ranking
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.00141643059490085
60 28012 28056
Running fold  5
Training


KeyboardInterrupt: 

In [76]:
# Print a single key of the English table to inspect the key format.
for key in eng_m:
    print( key);
    break;

1053920_n
