# Gensim
## Key ideas
### Document
A piece of text represented as a Python string (`str`).
### Corpus 
A collection (for example, a list) of documents.
### Vectors
A mathematical (numeric) representation of the documents in a corpus.
### Models
A transformation of one vector representation (matrix) into another.

### A demo

In [27]:
import gensim
import numpy as np
from scipy import sparse

# Toy corpus for the demo: nine short raw-text documents, one string each.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Set of common function words; it is only referenced by the disabled
# tokenization line below.
stop_words = set("for a of the and to in".split())
# Disabled tokenization step: lowercase each document, split on whitespace, drop stop words.
# docs = [[word for word in doc.lower().split() if word not in stop_words] for doc in documents] 

# BoW

def create_vocab(corpus):
    """Return the sorted list of distinct tokens found in ``corpus``.

    Args:
        corpus: iterable of documents, each document an iterable of
            hashable, mutually comparable tokens.

    Returns:
        A new list holding every distinct token exactly once, in sorted order.
    """
    unique_tokens = {token for document in corpus for token in document}
    return sorted(unique_tokens)

def build_bow_matrix(corpus, vocab):
    """Build a sparse document-term count matrix.

    Args:
        corpus: sequence of documents, each an iterable of tokens.
        vocab: sequence of tokens; a token's position is its column index.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(len(corpus), len(vocab))``
        and dtype ``float32``. Entry ``(d, t)`` is the number of times
        ``vocab[t]`` occurs in ``corpus[d]``. Tokens absent from ``vocab``
        are ignored.
    """
    # Token -> column lookup.
    column_of = dict(zip(vocab, range(len(vocab))))

    # One (row, column) coordinate per in-vocabulary token occurrence.
    coordinates = [
        (row, column_of[token])
        for row, document in enumerate(corpus)
        for token in document
        if token in column_of
    ]
    rows = [row for row, _ in coordinates]
    cols = [col for _, col in coordinates]
    ones = [1] * len(coordinates)

    # Repeated coordinates are summed by the constructor, which turns the
    # per-occurrence ones into counts. float32 is used instead of float64.
    return sparse.csr_matrix(
        (ones, (rows, cols)),
        shape=(len(corpus), len(vocab)),
        dtype=np.float32,
    )


## Parenthesis about `yield`
Compared to a regular function, which returns all of its results at once and therefore keeps them all in memory, a function that uses `yield` returns a generator whose values are produced one at a time as you iterate over it.

In [15]:
from typing import Generator
def regular_fun(n: int = 1000000000) -> list:
    """Return ``[0, 1, ..., n - 1]`` as a fully materialised list.

    This is the eager counterpart of a generator: every element exists in
    memory before the function returns.

    Args:
        n: number of integers to produce. The default (one billion) keeps
            the original behaviour and is deliberately too large to call
            casually.

    Returns:
        A list of the integers from 0 up to, but not including, ``n``.
    """
    return list(range(n))

# f = regular_fun()  # dangerous: this would build a list of 1,000,000,000 integers in memory

def yield_fun(n: int = 1000000000) -> Generator:
    """Lazily yield the integers ``0, 1, ..., n - 1``.

    Values are produced one at a time as the caller iterates, so stopping
    early never builds the rest of the sequence.

    Args:
        n: number of integers to produce. The default (one billion) keeps
            the original behaviour.

    Yields:
        Each integer from 0 up to, but not including, ``n``.
    """
    yield from range(n)

# Consume the generator lazily: each value is produced only when the loop
# asks for it, so we can stop after printing 0 through 6.
f = yield_fun()
for i in f:
    print(i)
    if i >= 6:
        break

def read_large_text(path):
    """Lazily yield the lines of a UTF-8 text file, stripped of surrounding whitespace.

    The file is read one line at a time, so it never has to fit in memory.

    Args:
        path: location of the text file to read.

    Yields:
        Each line of the file with leading and trailing whitespace
        (including the newline) removed.

    Raises:
        OSError: if the file cannot be opened for reading.
    """
    # Open read-only. The previous mode 'w' truncated the file and made the
    # iteration below fail because a write-only handle cannot be read.
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            yield line.strip()


0
1
2
3
4
5
6


## Step into LDA model


### Load data

In [16]:
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url):
    """Yield the decoded text of each matching ``.txt`` member of a tar archive.

    The archive at ``url`` is opened through ``smart_open`` in binary mode and
    read with ``tarfile``. Every regular-file member whose name matches
    ``nipstxt/nips<digits>/<digits>.txt`` is read completely and decoded as
    UTF-8; bytes that cannot be decoded are replaced with U+FFFD.

    Args:
        url: location of the tar archive, passed to ``smart_open.open``.

    Yields:
        One ``str`` per matching archive member.
    """
    wanted_name = re.compile(r'nipstxt/nips\d+/\d+\.txt')
    # smart_open hands back raw bytes; tarfile reads the archive from that stream.
    with smart_open.open(url, "rb") as file, tarfile.open(fileobj=file) as tar:
        for member in tar.getmembers():
            # Skip directories and other non-file entries.
            if not member.isfile():
                continue
            if wanted_name.search(member.name) is None:
                continue
            raw_bytes = tar.extractfile(member).read()
            yield raw_bytes.decode('utf-8', errors='replace')


# Materialise the generator into a list so the documents can be indexed and
# reused by the following cells.
docs = list(extract_documents("../nips12raw_str602.tgz"))

# --- ASCII example: bytes -> str -------------------------------------------
byte_str = b"hello world"
decoded_str = byte_str.decode('utf-8')

print(byte_str, type(byte_str), sep='\n')
print(decoded_str, type(decoded_str), sep='\n')
print()

# --- Non-ASCII example: str -> bytes -> str round trip ---------------------
# Each non-ASCII character is encoded as more than one byte in UTF-8.
nonascii = "éô行"
encoded_text = nonascii.encode("utf-8")
decoded_text = encoded_text.decode("utf-8")

print(encoded_text, type(encoded_text), sep='\n')
print()
print(decoded_text, type(decoded_text), sep='\n')


b'hello world'
<class 'bytes'>
hello world
<class 'str'>

b'\xc3\xa9\xc3\xb4\xe8\xa1\x8c'
<class 'bytes'>

éô行
<class 'str'>


## data preprocessing
- Tokenization based on a regex tokenizer from `nltk`
- Lemmatization
- compute bigrams
- build BoW
- remove stopwords (optional)

In [17]:
import nltk
from nltk import RegexpTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords

stoplist = set(stopwords.words("english"))
tokenizer = RegexpTokenizer(r'\w+')

# Lowercase and tokenize in place: afterwards every entry of `docs` is a list
# of tokens, so the corpus is a list of token lists.
for idx, raw_text in enumerate(docs):
    docs[idx] = tokenizer.tokenize(raw_text.lower())

# NOTE(review): WordNetLemmatizer relies on the `wordnet` corpus, but this cell
# only downloads `stopwords` — confirm `wordnet` is installed in the environment.
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Single pass per document: drop one-character tokens, purely numeric tokens
# (words that merely contain digits are kept) and stop words, then lemmatize
# whatever survives.
docs = [
    [
        lemmatizer.lemmatize(token)
        for token in doc
        if len(token) > 1 and not token.isnumeric() and token not in stoplist
    ]
    for doc in docs
]
print(len(docs))

[nltk_data] Downloading package stopwords to /home/chen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1740


In [18]:
# Finding bigrams helps us capture entities like "New York" or terms like "machine learning".
# In the output they appear as a single token such as machine_learning (the space is replaced by an underscore).
# Longer n-grams can of course be captured as well; using a named-entity recognition method is also recommended.
# Once bigrams are found, they are added to the original tokens, so that both the bigram and its individual words are kept.

In [None]:
def bigram(docs):
    """Append detected phrase tokens to each document of ``docs`` in place.

    A gensim ``Phrases`` model is fitted on ``docs`` with ``min_count=20``.
    Each document is then passed through the model, and every resulting
    token that contains ``'_'`` is appended to that document, so the
    original tokens are kept alongside the joined ones.

    Args:
        docs: list of token lists; modified in place.

    Returns:
        None.
    """
    from gensim.models import Phrases

    phrase_model = Phrases(docs, min_count=20)
    for document in docs:
        # A token containing '_' is treated as a phrase joined by the model.
        joined_tokens = [token for token in phrase_model[document] if '_' in token]
        document.extend(joined_tokens)

def another_way(docs):
    """Append every adjacent token pair, joined as ``first_second``, to each document.

    Unlike a frequency-based phrase model, this adds all consecutive pairs
    regardless of how often they occur.

    Args:
        docs: iterable of token lists; each list is extended in place.

    Returns:
        None.
    """
    for tokens in docs:
        # Pair each token with its successor. The pairs are fully built before
        # the list is extended, so the newly added items are never paired.
        pairs = [f"{first}_{second}" for first, second in zip(tokens, tokens[1:])]
        # list.extend mutates the list and returns None, so never assign its result.
        tokens.extend(pairs)

# NOTE: another_way() extends every document in place, so re-running this cell
# appends bigrams again (including pairs built from the tokens added last time).
another_way(docs)

## Vectorization

In [None]:
# Build the sorted vocabulary and the sparse document-term count matrix
# with the hand-written helpers.
vocab = create_vocab(docs)
bow = build_bow_matrix(docs,vocab)
# Last expression of the cell: displays (number of documents, vocabulary size).
bow.shape

In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out tokens that occur in fewer than 20 documents or in more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)
# Convert every document to its bag-of-words form using the filtered dictionary.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [26]:
# Report the dictionary size after filtering and the number of documents in the corpus.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 14162
Number of documents: 1740


## Training

Shown in `run_ida.py`.