# text_to_embeddings - quick overview 

In [1]:
import text_to_x as ttx

In [2]:
# Load the sample text used throughout this overview.
# The encoding is passed explicitly: the file holds Danish text with non-ASCII
# letters, and open() otherwise falls back to a platform-dependent default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("test_data/fyrtårnet.txt", "r", encoding="utf-8") as f:
    sample_text = f.read()

## preprocess

In [3]:
# Build a Danish ('da') tokenizer backed by stanza. Lemmatization is switched on
# because the embedding class used further down reads the "lemma" column by default.
Tokenizer = ttx.TextToTokens(
    lang='da',
    method='stanza',
    lemmatize=True,
    stem=False,
    pos=True,
    mwf=True,
    depparse=False,
    casing=False,
    silent=False,
)

# Run the tokenizer over the sample text.
preprocessed_text = Tokenizer.texts_to_tokens(texts=sample_text)

2020-03-25 16:10:29 INFO: Loading these models for language: da (Danish):
| Processor | Package |
-----------------------
| tokenize  | ddt     |
| pos       | ddt     |
| lemma     | ddt     |

2020-03-25 16:10:29 INFO: Use device: gpu
2020-03-25 16:10:29 INFO: Loading: tokenize
2020-03-25 16:10:32 INFO: Loading: pos
2020-03-25 16:10:33 INFO: Loading: lemma
2020-03-25 16:10:33 INFO: Done loading processors!


## train pmi-svd word embeddings

Documentation and a tutorial are in the making.
For now, here are the docstrings:

In [4]:
# Create the PMI-SVD embeddings object from the preprocessed tokens,
# then print the documentation of its class.
SvdTest = ttx.SvdEmbeddings(texts=preprocessed_text)
help(SvdTest)

Help on SvdEmbeddings in module text_to_x.text_to_embeddings object:

class SvdEmbeddings(builtins.object)
 |  train and operate with PMI-SVD embeddings
 |  
 |  Examples:
 |  - training with a manually set class instance
 |      your_instance = SvdEmbeddings(texts)
 |      your_instance.train(front_window = int, back_window = int, embedding_dim = int)
 |  
 |  - training with default settings:
 |      your_instance = svd2vec_run_default(texts)
 |      
 |  - visualizing the fit
 |  
 |  Methods defined here:
 |  
 |  __init__(self, texts, texts_colname='lemma')
 |      texts (iterable | TextToToken): input list of dataframes acquired from text_to_x.text_to_token. The "lemma" column is used. 
 |      texts_colname (string): column from TextToToken DataFrame to process ("lemma" / "stem" / "word")
 |  
 |  find(self, query)
 |      input word, get vector representation
 |      
 |      Example:
 |      In: self.find('hpv')
 |      
 |      Out: array([ 0.00435369, ...,  0.332069  ])
 |  

__Method__:
skipgram counts -> cooccurrence matrix -> smoothed positive pointwise mutual information -> singular value decomposition

In [11]:
# Fit the embeddings with front/back windows of 2 and 10 embedding dimensions.
SvdTest.train(front_window=2, back_window=2, embedding_dim=10)

# Look up the vector representation of a single word.
SvdTest.find('soldat')

array([-0.07508417,  0.08030948,  0.10994217, -0.00910836,  0.15851683,
       -0.19071693,  0.14430531,  0.09943838, -0.0446294 , -0.17512717])

## operations with embeddings
similarity = cosine similarity between vectors

In [6]:
# Peek at the vocabulary: the first 10 rows of the unigram count table.
# NOTE(review): the displayed rows come out in descending count order, so these
# look like the most frequent tokens — confirm the table is always sorted.
SvdTest.unigram_counts.head(10)

Unnamed: 0,index,token,count
9,9,",",270
18,18,og,124
12,12,han,85
11,11,!,81
2,2,en,77
44,44,"""",71
23,23,være,71
30,30,så,65
29,29,.,64
48,48,du,61


In [7]:
# Cosine similarity between the vectors of two words; displayed as a single score.
SvdTest.similarity('soldat', 'hund')

0.0260258004513567

In [8]:
# Words most similar to the query word.
# The result is displayed as a list of (similarity, word) pairs.
SvdTest.similar_to_query('hund')

[(0.10186149256779237, 'på'),
 (0.06650964729432995, 'han'),
 (0.05290202073497395, 'en'),
 (0.04972814516464932, 'al'),
 (0.04509147007380693, 'med')]

In [9]:
# Vector arithmetic: subtract the 'soldat' vector from the 'hund' vector,
# then list the words most similar to the resulting difference vector.
hund_minus_soldat = SvdTest.find('hund') - SvdTest.find('soldat')
SvdTest.similar_to_vec(hund_minus_soldat)

[(0.047091805822587894, 'en'),
 (0.04421838463625482, 'sin'),
 (0.04148780702089092, 'han'),
 (0.040723195133524466, 'have'),
 (0.03837001348076083, 'støvle'),
 (0.03748138021485341, 'på')]

In [12]:
# Reduce the embeddings to two dimensions and show the resulting coordinates.
# NOTE(review): the method name suggests UMAP, which is typically stochastic, and no
# seed is passed here — confirm whether the 2-D coordinates are reproducible across runs.
SvdTest.reduce_dim_umap()
# NOTE(review): this displays the whole coordinate array; consider slicing it for display.
SvdTest.model2d

array([[ 3.17229462e+00, -3.97640848e+00],
       [-1.13207877e+00, -1.22743690e+00],
       [ 2.65527821e+00, -2.88217831e+00],
       [ 5.55772841e-01,  2.92299151e-01],
       [ 8.05695951e-01, -6.56559408e-01],
       [ 8.60549510e-01, -7.75441766e-01],
       [ 7.46654391e-01, -7.10263610e-01],
       [ 6.25759602e-01, -9.30485249e-01],
       [-2.86595440e+00,  2.14058781e+00],
       [ 4.89108801e-01, -2.96910667e+00],
       [ 3.22912312e+00, -1.94064450e+00],
       [ 5.44184685e+00, -3.41471148e+00],
       [-1.47435784e+00, -2.38832903e+00],
       [ 3.47279477e+00, -1.81873214e+00],
       [ 5.99328578e-01, -2.57898211e+00],
       [ 6.64423525e-01, -2.67509341e+00],
       [-2.82447577e+00, -2.65601254e+00],
       [-2.59149861e+00, -2.80893493e+00],
       [-1.54658091e+00, -4.54943419e+00],
       [-2.26808572e+00, -3.10350347e+00],
       [-1.14531648e+00, -3.03550458e+00],
       [-8.77370477e-01, -3.33957791e+00],
       [ 3.36621761e+00, -4.28030157e+00],
       [ 3.