# Other Tools: Gensim

```yaml
Course:   DS 5001
Module:   08a Visualization
Topic:    Other Tools
Author:   R.C. Alvarado
Date:     23 March 2023
```

## Set Up

### Config

In [1]:
import configparser

# Machine-specific paths are read from env.ini, addressed relative to the
# notebook's working directory.
config_path = "../../../env.ini"
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means env.ini was not found.
# Fail here with the path instead of a confusing KeyError on 'data_home' below.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file: {config_path}")
data_home = config['DEFAULT']['data_home']    # base directory the corpus path is built from
output_dir = config['DEFAULT']['output_dir']  # directory newsgroups-LIB.csv is written to
local_lib = config['DEFAULT']['local_lib']    # not referenced in the visible part of this notebook

In [2]:
# Number of topics passed to the LDA model and iterated over when building
# the topic table.
num_topics = 100
# Corpus root under data_home; import_data() treats each sub-folder of this
# directory as a document label.
data_dir = f"{data_home}/newsgroups/20news-18828"

### Imports

In [3]:
import pandas as pd
import numpy as np
from gensim import corpora, models
from collections import defaultdict
import plotly_express as px
from glob import glob
import re 

## Import Data

In [4]:
def import_data(root=None):
    """Load the newsgroup files into a document table.

    Every entry directly under the corpus root is treated as a label and
    every file inside it as one document, decoded as latin-1.  The text after
    the first colon of line 1 becomes ``doc_from``, the text after the first
    colon of line 2 becomes ``doc_subj``, and all remaining lines joined by
    single spaces become ``doc_content``.  Each label is printed as it is
    processed.

    Args:
        root: Directory to scan.  Defaults to the module-level ``data_dir``,
            so existing ``import_data()`` calls behave as before.

    Returns:
        pandas.DataFrame indexed by ``(doc_label, doc_id)`` with the columns
        ``doc_from``, ``doc_subj`` and ``doc_content``.  Rows are ordered by
        sorted path so repeated runs give the same document order.

    Raises:
        ValueError: if a file name cannot be converted to an integer doc_id.
    """
    import os
    import warnings

    if root is None:
        root = data_dir

    def header_value(line):
        # Everything after the first colon ('' when the line has no colon).
        return line.partition(':')[2]

    data = []
    # sorted(): glob order is filesystem dependent; later cells address
    # documents by position, so keep the order reproducible.
    for d in sorted(glob(os.path.join(root, "*"))):
        # basename handles both "/" and the platform separator.
        label = os.path.basename(d)
        print(label)
        for f in sorted(glob(os.path.join(d, "*"))):
            fid = os.path.basename(f)
            # Context manager so the handle is closed for each of the many files.
            with open(f, 'r', encoding="latin-1") as fh:
                flines = fh.read().split("\n")
            # Files shorter than two lines get empty header fields instead of
            # raising IndexError.
            flines += [''] * (2 - len(flines))
            from_line = header_value(flines[0])
            subj_line = header_value(flines[1])
            data.append((fid, label, from_line, subj_line, ' '.join(flines[2:])))

    if not data:
        # An empty table only fails much later (e.g. when training a model),
        # so point at the likely cause right away.
        warnings.warn(f"No documents found under {root!r}")

    LIB = pd.DataFrame(data, columns=['doc_id','doc_label','doc_from', 'doc_subj', 'doc_content'])
    LIB.doc_id = LIB.doc_id.astype('int')
    LIB = LIB.set_index(['doc_label','doc_id'])
    return LIB

In [5]:
LIB = import_data()

In [6]:
LIB

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_from,doc_subj,doc_content
doc_label,doc_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [7]:
LIB.to_csv(f"{output_dir}/newsgroups-LIB.csv")

## Pre-Process the Gensim Way

### Stopwords

We create a small set of very frequent words (stopwords) to ignore. Of course, we could also grab a premade list from somewhere else, such as NLTK.

In [8]:
# Hand-picked high-frequency English words to exclude from the corpus.
stoplist = {
    'for', 'a', 'of', 'the', 'and', 'to', 'in', 'is',
    'i', 'that', 'it', 'you', 'this', 'be', 'on', 'are',
}

### Corpus

We loop through the list of docs and do some parsing and shaping on the fly. 

Again, we could do better with tools from NLTK.

Here we lowercase each document, split it on whitespace, strip non-alphanumeric characters from each word, and filter out stopwords.

In [9]:
# Runs of non-word characters and underscores are deleted from every word.
_NON_ALNUM = re.compile(r"[\W_]+")


def _tokenize(document):
    """Return the cleaned, stopword-free tokens of one document.

    The document is lowercased and split on whitespace, and each word has its
    non-alphanumeric characters removed.  Stopwords are tested *after* that
    cleaning so that forms such as "the," or "(the" are dropped as well, and
    words that consisted only of punctuation (now empty strings) are discarded
    instead of becoming a token.
    """
    cleaned = (_NON_ALNUM.sub("", word) for word in document.lower().split())
    return [token for token in cleaned if token and token not in stoplist]


# One token list per document, in the row order of LIB.
texts = [_tokenize(document) for document in LIB.doc_content.values]

### Term Frequencies

We count word frequencies in order to filter out low-frequency words.

In [10]:
# Corpus-wide occurrence count for every token; unseen tokens read as 0.
frequency = defaultdict(int)
for token in (tok for text in texts for tok in text):
    frequency[token] += 1

### Filtered Corpus

We filter by frequency, removing words that appear only once in the entire corpus.

In [11]:
def _is_repeated(token):
    """True when the token occurs more than once across all documents."""
    return frequency[token] > 1


# Same documents as `texts`, minus the tokens that occur only once overall.
filtered_corpus = [list(filter(_is_repeated, text)) for text in texts]

### Dictionary

We create a "dictionary," aka a vocabulary, which associates a term string with a numeric identifier.

In [12]:
# Build the gensim vocabulary from the filtered token lists; its token2id
# mapping (term string -> integer id) is used for the VOCAB table.
dictionary = corpora.Dictionary(filtered_corpus)

### BOW

We create the BOW corpus from the text using the dictionary.

In [13]:
# Bag-of-words form of every document, in the same order as filtered_corpus.
bow_corpus = list(map(dictionary.doc2bow, filtered_corpus))

In [14]:
# bow_corpus[0]

## Train models

### TFIDF

In [15]:
# Fit a TF-IDF model on the bag-of-words corpus. Indexing it with one BOW
# document (tfidf[doc]) yields the (term_id, weight) pairs that the TFIDF
# table is built from.
tfidf = models.TfidfModel(bow_corpus)

In [16]:
# tfidf[bow_corpus[5]]

### LDA

In [17]:
# Train an LDA topic model with `num_topics` topics on the bag-of-words corpus,
# using `dictionary` as the id -> word mapping.
# NOTE(review): the recorded run failed here with "cannot compute LDA over an
# empty collection (no terms)" -- presumably no documents were loaded from
# data_dir; verify the path in env.ini before re-running.
model = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics)

ValueError: cannot compute LDA over an empty collection (no terms)

In [None]:
# Train an HDP model on the same corpus and dictionary. `model2` is not
# referenced again in the visible part of this notebook.
model2 = models.HdpModel(bow_corpus, id2word=dictionary)

## Convert to Pandas

### VOCAB

In [None]:
# Vocabulary table: one row per term, indexed by term_id, with the term string
# and its corpus-wide count `n` taken from `frequency`.
VOCAB = (
    pd.DataFrame(list(dictionary.token2id.items()), columns=['term_str', 'term_id'])
    .assign(n=lambda df: df.term_str.map(frequency.__getitem__))
    .set_index('term_id')
    .sort_index()
)

In [None]:
VOCAB.sample(5)

### TFIDF

In [None]:
# Long-format TF-IDF table: one row per (document position, term id) pair.
tfidf_data = [
    (doc_pos, term_id, weight)
    for doc_pos, bow_doc in enumerate(bow_corpus)
    for term_id, weight in tfidf[bow_doc]
]
TFIDF = pd.DataFrame(
    tfidf_data,
    columns=['doc_id','term_id', 'tfidf'],
).set_index(['doc_id','term_id'])

In [None]:
TFIDF.tfidf.unstack(fill_value=0)

### BOW

In [None]:
# Long-format count table (BOW) and its wide document-term matrix (DTM).
bow_data = [
    (doc_pos, term_id, count)
    for doc_pos, bow_doc in enumerate(bow_corpus)
    for term_id, count in bow_doc
]
BOW = pd.DataFrame(
    bow_data,
    columns=['doc_id','term_id', 'n'],
).set_index(['doc_id','term_id'])
# Pairs absent from the long table are filled with 0 in the wide matrix.
DTM = BOW['n'].unstack(fill_value=0)

In [None]:
BOW.head()

In [None]:
DTM.head()

### LDA

#### PHI

In [None]:
# Transpose of the model's topic matrix; the row index is labelled term_id.
PHI = pd.DataFrame(model.get_topics()).transpose().rename_axis('term_id')

In [None]:
PHI

#### THETA

In [None]:
# Document-topic weights: collected in long form, then pivoted so each topic
# becomes a column; topics not reported for a document are filled with 0.
theta_data = [
    (doc_pos, topic_id, weight)
    for doc_pos, bow_doc in enumerate(bow_corpus)
    for topic_id, weight in model.get_document_topics(bow_doc)
]
THETA = (
    pd.DataFrame(theta_data, columns=['doc_id', 'topic_id', 'topic_weight'])
    .set_index(['doc_id','topic_id'])
    .unstack(fill_value=0)
)

In [None]:
THETA

#### TOPIC

In [None]:
# Collect (topic id, rank, term string) for the terms that
# model.get_topic_terms() returns for each topic, ranked in returned order.
topic_data = []
for t in range(num_topics):
    for term_rank, term in enumerate(model.get_topic_terms(t)):
        term_id = term[0]
        # Look the term up with dictionary[term_id] rather than
        # dictionary.id2token[term_id]: gensim fills id2token lazily, so the
        # raw attribute can still be empty here and raise KeyError.
        topic_data.append((t, term_rank, dictionary[term_id]))

In [None]:
# Wide topic table: one row per topic, one column per term rank.
TOPIC = (
    pd.DataFrame(topic_data, columns=['topic_id', 'term_rank', 'term_str'])
    .set_index(['topic_id','term_rank'])['term_str']
    .unstack()
)

In [None]:
TOPIC.head(20)