In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle
import numpy as np
from utilities import pickleFixLoad
from DCS import *
from sentences import *
from romtoslp import rom_slp

In [69]:
def SentencePreprocess(sentenceObj):
    """
    Flatten the candidate analyses of a sentence into parallel node lists.

    Only the first lemma of every analysis is used as the node name
    (word forms or cngs could be used instead).

    Returns the 5-tuple (chunkDict, wordList, revMap2Chunk, qu, cngList):
      chunkDict    -- {chunk id: {position: [node ids]}}
      wordList     -- rom_slp() of the first lemma of every kept analysis
      revMap2Chunk -- (chunk id, position) of every node id
      qu           -- node ids that are the only analysis of a chunk that has
                      a single position (the query nodes)
      cngList      -- wordTypeCheck() of the first (form, config) pair of every
                      kept analysis whose first forms dict is non-empty
    """
    chunkDict, wordList, cngList = {}, [], []
    revMap2Chunk, qu = [], []

    for cid, chunk in enumerate(sentenceObj.chunk):
        chunkDict[cid] = {}

        # 0: chunk has several positions (segmentable)
        # 1: chunk has one position only (unsegmentable)
        # 2: unsegmentable and that position carries exactly one analysis
        queryState = 1 if len(chunk.chunk_words.keys()) == 1 else 0

        for pos in chunk.chunk_words.keys():
            nodesHere = []
            chunkDict[cid][pos] = nodesHere
            analyses = chunk.chunk_words[pos]
            if queryState == 1 and len(analyses) == 1:
                queryState = 2

            for analysis in analyses:
                if len(analysis.lemmas) <= 0:
                    # Some pickle files carry analyses without a lemma: skip them
                    continue

                wordList.append(rom_slp(analysis.lemmas[0]))

                # Only the first (form, config) pair of the first forms dict is used
                firstPair = next(iter(analysis.forms[0].items()), None)
                if firstPair is not None:
                    form, config = firstPair
                    cngList.append(wordTypeCheck(form, config[0]))

                nodeId = len(wordList) - 1
                nodesHere.append(nodeId)
                revMap2Chunk.append((cid, pos))
                if queryState == 2:
                    # The word has a lemma and no alternative: make it a query
                    qu.append(nodeId)

    return (chunkDict, wordList, revMap2Chunk, qu, cngList)

In [111]:
# Load the two pickles for sentence 3: dcsObj is later printed with SeeDCS,
# sentenceObj is printed with SeeSentence and fed to SentencePreprocess.
# NOTE(review): the two paths spell the parent directory differently
# ('Text Segmentation' with a space vs 'TextSegmentation') and the second one
# contains a doubled slash -- confirm both locations are intended.
dcsObj = pickleFixLoad('../Text Segmentation/DCS_pick/3.p')
sentenceObj = pickleFixLoad('../TextSegmentation/Pickles//3.p')

In [112]:
# Flatten the loaded sentence into node lists and show the ones used below:
# the lemma per node, the query node ids, and the cng value per node.
(chunkDict, wordList, revMap2Chunk, qu, cngList) = SentencePreprocess(sentenceObj)

# NOTE(review): pprint is only referenced by the disabled lines below; this
# import sits outside the notebook's import cell.
from pprint import pprint
# pprint(chunkDict)      # disabled: full chunk -> position -> node id mapping
print(wordList)
# pprint(revMap2Chunk)   # disabled: (chunk id, position) of every node
print(qu)
print(cngList)

['vajra', 'ca', 'ca', 'mOktka', 'ca', '', 'eva', 'mÄ\x81á¹\x87kya', 'nÄ«la', 'eva', 'ca', 'ca']
[0, 3, 7, 8, 9]
[69, 2, 2, 71, 2, -44, 2, 71, 69, 2, 2, 2]


In [113]:
# Print the DCS object; the recorded output shows the sentence text followed
# by one list of lemmas and one list of numeric tags per chunk.
SeeDCS(dcsObj)

vajraM ca mOktikaM cEva mARikyaM nIlam eva ca   
[['vajra'], ['ca'], ['mauktika'], ['ca', 'eva'], ['mÄ\x81á¹\x87ikya'], ['nÄ«la'], ['eva'], ['ca']]
[['31'], ['1'], ['31'], ['1', '1'], ['31'], ['31'], ['1'], ['1']]


In [114]:
# Print every candidate analysis of the sentence; the recorded output lists,
# per chunk, the position, surface form, lemma list and morphological forms.
SeeSentence(sentenceObj)

vajraM ca mOktikaM cEva mARikyaM nIlam eva ca   
Analyzing  vajram
0 :  vajram ['vajra'] [{'noun': ['acc. sg. m.', 'acc. sg. n.', 'nom. sg. n.']}]
Analyzing  ca
0 :  ca ['ca'] [{'indeclinable': ['conj.']}]
0 :  ca ['ca'] [{'indeclinable': ['conj.']}]
Analyzing  mOktikam
0 :  mOktikam ['mOktka'] [{'noun': ['acc. sg. n.', 'nom. sg. n.']}]
Analyzing  cEva
0 :  ca ['ca'] [{'indeclinable': ['conj.']}]
1 :  Eva [''] [{'verb': ['impft. [2] ac. du. 1']}]
1 :  eva ['eva'] [{'indeclinable': ['prep.']}]
Analyzing  mARikyam
0 :  mÄá¹ikyam ['mÄ\x81á¹\x87kya'] [{'noun': ['acc. sg. n.', 'nom. sg. n.']}]
Analyzing  nIlam
0 :  nÄ«lam ['nÄ«la'] [{'noun': ['acc. sg. m.', 'acc. sg. n.', 'nom. sg. n.']}]
Analyzing  eva
0 :  eva ['eva'] [{'indeclinable': ['prep.']}]
Analyzing  ca
0 :  ca ['ca'] [{'indeclinable': ['conj.']}]
0 :  ca ['ca'] [{'indeclinable': ['conj.']}]


In [115]:
# Load the lemma-level resources consumed by getCo_occurMat:
#   fullCo_ocMat   -- pairwise lemma scores, read as fullCo_ocMat[i][j]
#   word2IndexDict -- lemma string -> index into fullCo_ocMat
# The files are opened in `with` blocks so the handles are closed right after
# loading instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('extras/all_dcs_lemmas_matrix.p', 'rb') as pickle_file:
    fullCo_ocMat = pickle.load(pickle_file)
with open('dcsLemma2index.p', 'rb') as pickle_file:
    word2IndexDict = pickle.load(pickle_file)

In [116]:
def getCo_occurMat(wordList, fullCo_ocMat, word2IndexDict):
    """
    Build a row-normalised transition matrix over the words of one sentence.

    Entry (i, j), i != j, starts as fullCo_ocMat[idx_i][idx_j] where idx_* is
    the index of the word in word2IndexDict; every row is then divided by its
    sum. Rows without any positive mass become uniform over the other nodes.
    The diagonal is always 0.

    Parameters
    ----------
    wordList : sequence of words (the nodes of the sentence graph).
    fullCo_ocMat : container indexed as fullCo_ocMat[row][col].
    word2IndexDict : mapping word -> index into fullCo_ocMat.

    Returns
    -------
    numpy.ndarray of shape (len(wordList), len(wordList)).

    Notes
    -----
    * Words missing from word2IndexDict contribute 0 both as row and as
      column. They used to be looked up with index -1, which silently read
      the LAST row/column when fullCo_ocMat is a list or an array.
    * A single-word sentence returns [[0.]] instead of raising
      ZeroDivisionError.
    """
    nodeCount = len(wordList)

    # None marks a word that has no index (out of vocabulary)
    wordIndexList = []
    for w in wordList:
        try:
            wordIndexList.append(word2IndexDict[w])
        except KeyError:
            wordIndexList.append(None)

    TransitionMat = np.zeros((nodeCount, nodeCount))

    # FIXME (open questions kept from the original author):
    # 1. HOW TO DO SMOOTHING?
    # 2. HOW TO CONVERT WORD2VEC SIM. TO PROB.

    for row in range(nodeCount):
        rowIndex = wordIndexList[row]
        if rowIndex is not None:
            for col in range(nodeCount):
                colIndex = wordIndexList[col]
                if row == col or colIndex is None:
                    continue  # diagonal and unknown words stay at 0
                try:
                    TransitionMat[row][col] = fullCo_ocMat[rowIndex][colIndex]
                except KeyError:
                    # Pair absent from a dict-based matrix: keep 0
                    TransitionMat[row][col] = 0

        row_sum = np.sum(TransitionMat[row, :])
        if row_sum > 0:
            TransitionMat[row, :] /= row_sum
        elif nodeCount > 1:
            # No evidence for this word: spread uniformly over the other nodes
            TransitionMat[row, :] = 1.0 / (nodeCount - 1)

        TransitionMat[row, row] = 0
    return TransitionMat

In [117]:
# Sentence-level lemma transition matrix for the nodes in wordList.
co_ocMat = getCo_occurMat(wordList, fullCo_ocMat, word2IndexDict)

In [12]:
# Load the pickled model and print its summary (recorded output: a Word2Vec
# model with vocab=66936, size=100).
# NOTE(review): pickleFixLoad is already imported in the notebook's import
# cell; printProgress and validatePickleName are not used in the visible cells.
# This cell's execution count (12) is lower than the cells above it, so the
# notebook was not run top to bottom.
from utilities import printProgress, validatePickleName, pickleFixLoad
model_cbow = pickleFixLoad('extras/modelpickle10.p')
print(model_cbow)

Word2Vec(vocab=66936, size=100, alpha=0.025)


In [13]:
# Compare, for the pair ('sam', 'tad'), the model similarity with the value
# stored in fullCo_ocMat for the same pair.
print(model_cbow.similarity('sam', 'tad'))
print(fullCo_ocMat[word2IndexDict['sam']][word2IndexDict['tad']])

0.0557379937173
0.030479256073288212


In [14]:
# Load the count-only variant of the lemma matrix and the per-lemma counts.
# The files are opened in `with` blocks so the handles are closed right after
# loading instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('extras/all_dcs_lemmas_matrix_countonly.p', 'rb') as pickle_file:
    fullCo_ocMat_counts = pickle.load(pickle_file)
with open('extras/counts_of_uniq_lemmas.p', 'rb') as pickle_file:
    unigram_counts = pickle.load(pickle_file)

In [15]:
# Inspect the count resources for 'sam' / 'tad': the pair entry, the 'sam'
# entry of unigram_counts, and the number of rows of the count matrix.
# NOTE(review): the recorded pair value (944739.0) is far larger than the
# recorded unigram value (43) -- confirm that unigram_counts is indexed with
# the same indices as word2IndexDict.
print(fullCo_ocMat_counts[word2IndexDict['sam']][word2IndexDict['tad']])
print(unigram_counts[word2IndexDict['sam']])
len(fullCo_ocMat_counts)

944739.0
43


66914

In [16]:
# Index assigned to the lemma 'sam' (recorded output: 51640).
word2IndexDict['sam']

51640

In [118]:
# Load the cng-level resources:
#   cng_ordered_list -- loaded from 'list_of_uniq_cngs.p'
#   cng2cngFullMat   -- pairwise cng matrix, read as cng2cngFullMat[i, j]
#   cng_uni_count    -- loaded from 'counts_of_uniq_cngs.p'
#   cng2index_dict   -- str(cng) -> index into cng2cngFullMat
# The files are opened in `with` blocks so the handles are closed right after
# loading. np.asmatrix replaces np.mat: same behaviour, but np.mat was removed
# in NumPy 2.0.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('extras/list_of_uniq_cngs.p', 'rb') as pickle_file:
    cng_ordered_list = pickle.load(pickle_file)
cng2cngFullMat = np.asmatrix(pickleFixLoad('extras/all_dcs_cngs_matrix_countonly.p'))
with open('extras/counts_of_uniq_cngs.p', 'rb') as pickle_file:
    cng_uni_count = pickle.load(pickle_file)

with open('cng2index_dict.p', 'rb') as pickle_file:
    cng2index_dict = pickle.load(pickle_file)

In [119]:
def get_cng2cng_mat(cng2cngFullMat, cngList, cng2index_dict):
    """
    Build a row-normalised transition matrix over the cngs of one sentence.

    Entry (i, j), i != j, starts as cng2cngFullMat[idx_i, idx_j] where idx_* is
    cng2index_dict[str(cng)]; every row is then divided by its sum. Rows
    without any positive mass become uniform over the other nodes. The
    diagonal is always 0.

    Parameters
    ----------
    cng2cngFullMat : container indexed as cng2cngFullMat[row, col].
    cngList : sequence of cng values, one per node.
    cng2index_dict : mapping str(cng) -> index into cng2cngFullMat.

    Returns
    -------
    numpy.ndarray of shape (len(cngList), len(cngList)).

    Raises
    ------
    KeyError : if str(cng) of some node is not a key of cng2index_dict.

    Notes
    -----
    The resolved index list is printed (kept from the original behaviour).
    A single-node input returns [[0.]] instead of raising ZeroDivisionError.
    """
    nodeCount = len(cngList)
    cngIndexList = [cng2index_dict[str(cng)] for cng in cngList]
    print(cngIndexList)
    TransitionMat = np.zeros((nodeCount, nodeCount))

    # FIXME (open questions kept from the original author):
    # 1. HOW TO DO SMOOTHING?
    # 2. HOW TO CONVERT WORD2VEC SIM. TO PROB.

    for row in range(nodeCount):
        for col in range(nodeCount):
            if row == col:
                continue  # diagonal stays at 0
            try:
                TransitionMat[row][col] = cng2cngFullMat[cngIndexList[row], cngIndexList[col]]
            except KeyError:
                # Pair absent from a dict-based matrix: keep 0
                TransitionMat[row][col] = 0

        row_sum = np.sum(TransitionMat[row, :])
        if row_sum > 0:
            TransitionMat[row, :] /= row_sum
        elif nodeCount > 1:
            # No evidence for this cng: spread uniformly over the other nodes
            TransitionMat[row, :] = 1.0 / (nodeCount - 1)

        TransitionMat[row, row] = 0
    return TransitionMat

# Display the cng transition matrix for the current sentence (the function
# also prints the resolved cng indices first).
get_cng2cng_mat(cng2cngFullMat, cngList, cng2index_dict)

[168, 46, 46, 258, 46, 25, 46, 258, 168, 46, 46, 46]


array([[  0.00000000e+00,   1.33101129e-01,   1.33101129e-01,
          3.41404102e-02,   1.33101129e-01,   1.12793743e-05,
          1.33101129e-01,   3.41404102e-02,   0.00000000e+00,
          1.33101129e-01,   1.33101129e-01,   1.33101129e-01],
       [  2.59887503e-01,   0.00000000e+00,   0.00000000e+00,
          2.40079461e-01,   0.00000000e+00,   6.60708544e-05,
          0.00000000e+00,   2.40079461e-01,   2.59887503e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.59887503e-01,   0.00000000e+00,   0.00000000e+00,
          2.40079461e-01,   0.00000000e+00,   6.60708544e-05,
          0.00000000e+00,   2.40079461e-01,   2.59887503e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.67501360e-02,   1.32355369e-01,   1.32355369e-01,
          0.00000000e+00,   1.32355369e-01,   1.21415805e-05,
          1.32355369e-01,   0.00000000e+00,   3.67501360e-02,
          1.32355369e-01,   1.32355369e-01,   1.32355369e-01],
    

In [100]:
# Spot-check one diagonal entry of the full cng matrix (recorded output: 0.0).
cng2cngFullMat[2,2]


0.0

In [None]:
# Not executed yet (no execution count): reloads the sentence pickle for
# sentence 3, this time with a single slash in the path.
sentenceObj = pickleFixLoad('../TextSegmentation/Pickles/3.p')
