# Vector Transformation in Gensim

In [5]:
from pprint import pprint
from gensim import corpora

In [3]:
# Toy corpus: nine one-sentence headlines, some about football and some about
# banking/finance, used throughout the rest of the notebook.
documents = [
    "Football club Arsenal defeat local rivals this weekend.",
    "Weekend football frenzy takes over London.",
    "Bank open for take over bids after losing millions.",
    "London football clubs bid to move to Wembley stadium.",
    "Arsenal bid 50 million pounds for striker Kane.",
    "Financial troubles result in loss of millions for bank.",
    "Western bank files for bankruptcy after financial losses.",
    "London football club is taken over by oil millionaire from Russia.",
    "Banking on finances not working for Russia.",
]

In [6]:
import spacy

# Load the English pipeline through the "en" shortcut name.
# NOTE(review): spaCy 3 removed shortcut links; on that version the full
# package name (e.g. "en_core_web_sm") is needed -- confirm installed version.
nlp = spacy.load("en")

# Build one list of lemmas per document, skipping stop words, punctuation
# and number-like tokens.
texts = []
for document in documents:
    doc = nlp(document)
    text = []
    for w in doc:
        # Guard clause: drop tokens that carry no topical information.
        if w.is_stop or w.is_punct or w.like_num:
            continue
        text.append(w.lemma_)
    texts.append(text)

pprint(texts)

[['football', 'club', 'arsenal', 'defeat', 'local', 'rival', 'weekend'],
 ['weekend', 'football', 'frenzy', 'take', 'london'],
 ['bank', 'open', 'bid', 'lose', 'million'],
 ['london', 'football', 'club', 'bid', 'wembley', 'stadium'],
 ['arsenal', 'bid', 'pound', 'striker', 'kane'],
 ['financial', 'trouble', 'result', 'loss', 'million', 'bank'],
 ['western', 'bank', 'file', 'bankruptcy', 'financial', 'loss'],
 ['london', 'football', 'club', 'take', 'oil', 'millionaire', 'russia'],
 ['bank', 'finance', 'work', 'russia']]


# Bag-of-words representation

In [7]:
# Build the vocabulary: every distinct token in `texts` gets an integer id.
dictionary = corpora.Dictionary(texts)
# Show the resulting token -> id mapping.
print(dictionary.token2id)

{'arsenal': 0, 'club': 1, 'defeat': 2, 'football': 3, 'local': 4, 'rival': 5, 'weekend': 6, 'frenzy': 7, 'london': 8, 'take': 9, 'bank': 10, 'bid': 11, 'lose': 12, 'million': 13, 'open': 14, 'stadium': 15, 'wembley': 16, 'kane': 17, 'pound': 18, 'striker': 19, 'financial': 20, 'loss': 21, 'result': 22, 'trouble': 23, 'bankruptcy': 24, 'file': 25, 'western': 26, 'millionaire': 27, 'oil': 28, 'russia': 29, 'finance': 30, 'work': 31}


In [19]:
# Print the doc2bow docstring for reference.
print(dictionary.doc2bow.__doc__)

# Encode every tokenised document as a bag-of-words vector, i.e. a list of
# (token_id, token_count) tuples.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples.

        Parameters
        ----------
        document : list of str
            Input document.
        allow_update : bool, optional
            Update self, by adding new tokens from `document` and updating internal corpus statistics.
        return_missing : bool, optional
            Return missing tokens (tokens present in `document` but not in self) with frequencies?

        Return
        ------
        list of (int, int)
            BoW representation of `document`.
        list of (int, int), dict of (str, int)
            If `return_missing` is True, return BoW representation of `document` + dictionary with missing
            tokens and their frequencies.

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
        >>> dct.doc2bow(["this", "is", "máma"])
        [(2

In [14]:
# Display the bag-of-words corpus: one list of (token_id, token_count) pairs per document.
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(3, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(1, 1), (3, 1), (8, 1), (11, 1), (15, 1), (16, 1)], [(0, 1), (11, 1), (17, 1), (18, 1), (19, 1)], [(10, 1), (13, 1), (20, 1), (21, 1), (22, 1), (23, 1)], [(10, 1), (20, 1), (21, 1), (24, 1), (25, 1), (26, 1)], [(1, 1), (3, 1), (8, 1), (9, 1), (27, 1), (28, 1), (29, 1)], [(10, 1), (29, 1), (30, 1), (31, 1)]]


In [20]:
# Print the doc2idx docstring for reference.
print(dictionary.doc2idx.__doc__)

# Map each document to its token ids, keeping the original token order.
# As the last expression of the cell, the resulting list is displayed.
[dictionary.doc2idx(tokens) for tokens in texts]

Convert `document` (a list of words) into a list of indexes = list of `token_id`.
        Replace all unknown words i.e, words not in the dictionary with the index as set via `unknown_word_index`.

        Parameters
        ----------
        document : list of str
            Input document
        unknown_word_index : int, optional
            Index to use for words not in the dictionary.

        Returns
        -------
        list of int
            Token ids for tokens in `document`, in the same order.

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>>
        >>> corpus = [["a", "a", "b"], ["a", "c"]]
        >>> dct = Dictionary(corpus)
        >>> dct.doc2idx(["a", "a", "c", "not_in_dictionary", "c"])
        [0, 0, 2, -1, 2]

        


[[3, 1, 0, 2, 4, 5, 6],
 [6, 3, 7, 9, 8],
 [10, 14, 11, 12, 13],
 [8, 3, 1, 11, 16, 15],
 [0, 11, 18, 19, 17],
 [20, 23, 22, 21, 13, 10],
 [26, 10, 25, 24, 20, 21],
 [8, 3, 1, 9, 28, 27, 29],
 [10, 30, 31, 29]]

# Saving the corpus

In [21]:
# Persist the bag-of-words corpus to 'top_corpus.mm' (path is relative to the
# current working directory) using gensim's MmCorpus serializer.
corpora.MmCorpus.serialize('top_corpus.mm', corpus)

# Bag of words to TF-IDF

In [24]:
from gensim import models
# Fit a TF-IDF model on the bag-of-words corpus; the fitted model is applied
# later by indexing it with a corpus (tfidf[corpus]).
tfidf = models.TfidfModel(corpus)

In [25]:
# Apply the fitted TF-IDF model to the corpus, then print each document's
# (token_id, weight) pairs on its own line.
weighted_corpus = tfidf[corpus]
for document in weighted_corpus:
    print(document)

[(0, 0.3292179861221233), (1, 0.24046829370585296), (2, 0.4809365874117059), (3, 0.1774993848325406), (4, 0.4809365874117059), (5, 0.4809365874117059), (6, 0.3292179861221233)]
[(3, 0.24212967666975266), (6, 0.4490913847888623), (7, 0.6560530929079719), (8, 0.32802654645398593), (9, 0.4490913847888623)]
[(10, 0.2184344336379748), (11, 0.29592528218102643), (12, 0.5918505643620529), (13, 0.4051424990000138), (14, 0.5918505643620529)]
[(1, 0.29431054749542984), (3, 0.21724253258131512), (8, 0.29431054749542984), (11, 0.29431054749542984), (15, 0.5886210949908597), (16, 0.5886210949908597)]
[(0, 0.354982288765831), (11, 0.25928712547209604), (17, 0.5185742509441921), (18, 0.5185742509441921), (19, 0.5185742509441921)]
[(10, 0.19610384738673725), (13, 0.3637247180792822), (20, 0.3637247180792822), (21, 0.3637247180792822), (22, 0.5313455887718271), (23, 0.5313455887718271)]
[(10, 0.18286519950508276), (20, 0.3391702611796705), (21, 0.3391702611796705), (24, 0.4954753228542582), (25, 0.4954

# Bigrams

In [36]:
# Train a phrase (bigram) detector on the tokenised documents.
bigram = models.Phrases(texts)

# Run every document through the detector.
# NOTE(review): the printed result is identical to `texts`, i.e. no token pair
# was merged; presumably this corpus is too small for the default Phrases
# thresholds -- confirm.
bigrams_texts = [bigram[tokens] for tokens in texts]

pprint(bigrams_texts)

[['football', 'club', 'arsenal', 'defeat', 'local', 'rival', 'weekend'],
 ['weekend', 'football', 'frenzy', 'take', 'london'],
 ['bank', 'open', 'bid', 'lose', 'million'],
 ['london', 'football', 'club', 'bid', 'wembley', 'stadium'],
 ['arsenal', 'bid', 'pound', 'striker', 'kane'],
 ['financial', 'trouble', 'result', 'loss', 'million', 'bank'],
 ['western', 'bank', 'file', 'bankruptcy', 'financial', 'loss'],
 ['london', 'football', 'club', 'take', 'oil', 'millionaire', 'russia'],
 ['bank', 'finance', 'work', 'russia']]




In [37]:
# Build a vocabulary over the bigram-processed documents.
bigram_dictionary = corpora.Dictionary(bigrams_texts)
# Encode the bigram-processed documents with the dictionary built from them.
# Fix: the corpus used to be encoded with the unigram `dictionary`, which left
# `bigram_dictionary` unused; any merged bigram token would be absent from that
# vocabulary and therefore missing from the bag-of-words vectors.
corpus = [bigram_dictionary.doc2bow(text) for text in bigrams_texts]

In [38]:
# Show the bag-of-words vectors built from the bigram-processed texts.
pprint(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(3, 1), (6, 1), (7, 1), (8, 1), (9, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1)],
 [(1, 1), (3, 1), (8, 1), (11, 1), (15, 1), (16, 1)],
 [(0, 1), (11, 1), (17, 1), (18, 1), (19, 1)],
 [(10, 1), (13, 1), (20, 1), (21, 1), (22, 1), (23, 1)],
 [(10, 1), (20, 1), (21, 1), (24, 1), (25, 1), (26, 1)],
 [(1, 1), (3, 1), (8, 1), (9, 1), (27, 1), (28, 1), (29, 1)],
 [(10, 1), (29, 1), (30, 1), (31, 1)]]
