Converts the sparse reference term-document matrix provided by Hoffman from Matrix Market format into a gensim-compatible format.

Applies an LSI projection to the term-document matrix in order to calculate document vectors and saves the vectors to a column-based raw text file.

In [12]:
from pathlib import Path
import pandas as pd
from semsim.constants import SEMD_DIR

# Working directory of this notebook: every file read or written by the cells below
# is addressed relative to this path.
dirpath = SEMD_DIR / "document_vectors"

In [2]:
# Load the reference matrix as a plain space-separated table of coordinate triples.
# Lines beginning with '%' are treated as comments and skipped, so the first
# remaining line is consumed by read_csv as the header row. As the displayed frame
# shows, the column labels of `mm` are therefore three numbers taken from the file
# (presumably the Matrix Market size line: rows, columns, non-zeros), not real names.
# Later cells rely on this: the labels are written back out as the header line.
filename = "tdm.mm"
filepath = dirpath / filename
mm = pd.read_csv(filepath, sep=' ', comment='%')
mm

Unnamed: 0,74100,112798,16323790
0,51,1,0.573678
1,115,1,0.601497
2,171,1,0.300871
3,186,1,0.804711
4,208,1,0.296410
...,...,...,...
16323785,59391,112798,0.388743
16323786,60722,112798,0.361460
16323787,65987,112798,0.160685
16323788,66170,112798,0.235783


In [3]:
# Exchange the first two columns of the triple table while leaving the value column
# last. The column labels travel with the data, so the header is swapped as well.
# NOTE: this rebinds `mm`, so running the cell a second time swaps the columns back.
c = mm.columns
swapped_order = [c[position] for position in (1, 0, 2)]
mm = mm.loc[:, swapped_order]
mm

Unnamed: 0,112798,74100,16323790
0,1,51,0.573678
1,1,115,0.601497
2,1,171,0.300871
3,1,186,0.804711
4,1,208,0.296410
...,...,...,...
16323785,112798,59391,0.388743
16323786,112798,60722,0.361460
16323787,112798,65987,0.160685
16323788,112798,66170,0.235783


In [4]:
# Write the column-swapped triples to a new Matrix Market file.
# Layout of the output: the banner line, then the frame's header row (the swapped
# column labels), then one "a b value" triple per line.
filename = "tdm_gensim_format.mm"
filepath = dirpath / filename
# newline='' follows the pandas guidance for text handles handed to to_csv: without
# it, the line terminators pandas emits are translated a second time on platforms
# where os.linesep is not '\n', corrupting the row separators.
with open(filepath, 'w', newline='') as fp:
    fp.write("%%MatrixMarket matrix coordinate real general\n")
    # index=False (the documented boolean) keeps the row index out of the file.
    mm.to_csv(fp, index=False, sep=' ')

In [5]:
# Load tdm_transpose.mm from the same directory as a space-separated table.
# '%' lines are skipped as comments and the first remaining line becomes the header,
# so the column labels of `mm_t` are numbers read from the file rather than names.
filename = "tdm_transpose.mm"
filepath = dirpath / filename
mm_t = pd.read_csv(filepath, sep=' ', comment='%')
mm_t

Unnamed: 0,112798,74100,16323790
0,133,1,1.657571
1,139,1,0.552524
2,156,1,1.105047
3,212,1,0.552524
4,276,1,0.875729
...,...,...,...
16323785,49644,74100,0.782991
16323786,57199,74100,0.782991
16323787,69170,74100,0.782991
16323788,70372,74100,0.782991


In [6]:
# Show the transposed triples ordered by their first column, then their second.
# The result is only displayed, not stored; presumably it is meant to be compared
# with the column-swapped `mm` frame shown earlier.
sort_keys = list(mm_t.columns[:2])
mm_t.sort_values(by=sort_keys)

Unnamed: 0,112798,74100,16323790
4492,1,51,0.573678
9644,1,115,0.601497
14090,1,171,0.300871
19897,1,186,0.804711
21233,1,208,0.296410
...,...,...,...
12781116,112798,59391,0.388743
13042485,112798,60722,0.361460
14221492,112798,65987,0.160685
14379297,112798,66170,0.235783


In [7]:
from gensim.corpora import MmCorpus
from semsim.metric.semantic_diversity import lsi_projection_sklearn

In [8]:
# Open tdm_gensim_format.mm (the file written earlier in this notebook) as a gensim
# corpus. The path is converted to a plain string before being handed to MmCorpus.
filename = "tdm_gensim_format.mm"
filepath = dirpath / filename
corpus = MmCorpus(str(filepath))

In [9]:
# Run the project's LSI helper on the corpus. The second argument, 300, shows up in
# the helper's log output as the number of topics. The helper returns three values;
# only the middle one (the document vectors) is kept, the other two are discarded.
_, document_vectors, _ = lsi_projection_sklearn(corpus, 300)

Size of train_set=112798
Training LSI model with 300 topics


In [10]:
# Sanity check before saving: there must be exactly one vector per corpus document.
# The message makes a mismatch diagnosable without re-running the projection.
assert len(document_vectors) == len(corpus), (
    f"expected {len(corpus)} document vectors, got {len(document_vectors)}"
)

# Plain string literal: the file name contains no placeholders to interpolate.
file_path = dirpath / 'tdm_lsi_sklearn_document_vectors.csv'
print(f"Saving document vectors to {file_path}")
# Written with to_csv defaults, so the row index is stored as the first column.
document_vectors.to_csv(file_path)

Saving document vectors to /home/andreas/workspace/github/semsim/data/SemD/document_vectors/tdm_lsi_sklearn_document_vectors.csv


In [11]:
# Display the computed document vectors (the notebook shows a truncated view:
# the first/last rows and columns with "..." in between).
document_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,5.661233,1.597329,-0.288752,1.787849,-0.249814,1.454051,0.202908,-0.606764,1.605352,0.195044,...,0.364749,0.247401,0.106645,0.165027,0.230202,-0.453065,-0.220881,-0.145809,0.245389,0.109229
1,7.134197,5.034968,2.597040,-2.629931,0.850380,1.944177,1.464680,-0.227673,-1.601640,0.692621,...,-0.524651,-0.302095,-0.107206,-0.188711,0.422112,0.281842,0.243042,-0.256614,0.598067,0.265421
2,7.711031,4.189832,1.736546,0.328510,0.225244,2.491384,-0.409053,0.050252,0.161229,-0.230116,...,0.329266,-0.197374,-0.496967,0.205771,-0.155806,-0.285419,0.602155,0.015906,-0.464941,-0.072496
3,0.756073,0.472403,0.099208,-0.377460,-0.091720,0.134921,-0.164273,-0.367873,-0.065432,-0.064822,...,-0.051666,-0.027596,-0.034111,0.007439,0.056977,-0.037387,0.011589,0.029340,0.023976,-0.102909
4,10.301142,-1.185442,3.300113,2.691332,-1.272150,-0.259554,-0.815327,1.550996,-1.144567,1.737661,...,0.010338,-0.631568,-0.090557,0.244185,0.581132,0.031367,1.033529,1.016773,0.472640,-0.607561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112793,1.256355,-0.361180,0.414874,0.495923,-0.193530,-0.172498,-0.739371,-0.353276,-0.410648,-0.485349,...,0.022854,0.028769,0.025791,0.036223,-0.019592,-0.003228,-0.030137,0.017751,0.058494,-0.091201
112794,0.204222,-0.033498,-0.107541,-0.074893,-0.099965,-0.102337,-0.101517,-0.084214,0.005515,-0.008868,...,-0.044121,0.071223,-0.031646,0.041503,-0.045194,0.106216,0.070345,0.032405,0.050307,-0.028758
112795,0.226754,-0.100169,-0.156836,0.035767,-0.082209,0.096347,-0.009232,-0.094390,0.037921,-0.223808,...,0.004104,0.035109,-0.026210,0.012526,0.001839,-0.007567,0.002319,-0.028798,-0.001864,-0.016641
112796,0.155547,0.007962,-0.047609,-0.120525,-0.078881,-0.061303,-0.094549,-0.118931,-0.016058,-0.036082,...,0.027202,0.019875,0.016551,0.039482,-0.030710,-0.021131,0.014979,-0.013671,-0.002602,0.035692
