# Topic Modeling

## Import Modules

In [2]:
# Import Modules

import re
import numpy as np
import pandas as pd
from pprint import pprint
import random
np.random.seed(42)

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Word2Vec, LdaModel
from gensim.test.utils import common_corpus, common_dictionary
from gensim.corpora import Dictionary


# from gensim.models.wrappers import LdaMallet

# spacy for lemmatization
import spacy
# download LLM: python -m spacy download en_core_web_sm

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# import NLTK stop words
import nltk
from nltk.corpus import stopwords
from nltk import tokenize
from nltk.tokenize import word_tokenize
#nltk.download('punkt') # run once
#nltk.download('stopwords')  # run once
stop_words_eng = stopwords.words('english')
stop_words_fr = stopwords.words('french')

'''
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
'''

mallet_path = "/Users/aaron68lee/Documents/Coding-Projects/NLP-Research/Mallet"

print(stop_words_fr[0:10])


['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle']


## Import Datasets

In [1]:
'''
Document Extractor
'''

import io
import os.path
import re
import tarfile

import smart_open

docs_folder_path = "/Users/aaron68lee/Documents/Coding-Projects/NLP-Research/FRconquest_split" # corpus folder
max_docs = 1
max_chars = 500

'''
DESCRIPTION: helper function to process all documents inside a corpus folder
RETURN:
    1) documents: python list of document as one string
    2) document_names: python list of document names (list of strings)
    3) doc_dict: dictionary mapping doc names to doc contents
'''
def extract_documents(url, max_docs=1):
    doc_count = 0
    documents = []
    document_names = []
    doc_dict = {} # combines documents and document_names into dictionary data structure
    
    with os.scandir(url) as entries:
        for entry in entries:
            if doc_count < max_docs:
                if entry.is_file() and entry.name.endswith('.txt'):
                    with open(entry, "r", encoding="utf-8", errors="replace") as file:
                        curr_doc = file.read()
                        documents.append(curr_doc)
                        document_names.append(entry.name)
                        doc_dict[entry.name] = curr_doc
                doc_count += 1
    return documents, document_names, doc_dict


## Preprocess Data 

### **Remove Stopwords**

In [142]:


############# Test Dataset as CSV ################

# Test on sample document

doc_path = 'FRconquest_split/1838_Tableau-Algeria_pt1_0.txt'

'''
# Import Dataset
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

# Pre-process Data
# 1) Remove Punctuation and Stop Words

# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])
'''

######### import datasets #########

docs, doc_names, doc_dict = list(extract_documents(docs_folder_path, max_docs))
print("There are: ", len(docs), " documents in the corpus folder\n")
print("Document Names: ", doc_names)
#print("Doc Dictionary: ", doc_dict)
print("Sample abridged doc: \n\n", docs[0][:500])

# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')

# docs is already list of list of words

# Remove numbers, but not words that contain numbers.
# print(docs[0][0][0])
print(docs[0][:500])
doc_parsed = [[] for _ in range(len(docs))]
for doc in range(len(docs)):
    for j in range(len(docs[doc])):
        if not (docs[doc][j].isnumeric()):
            doc_parsed[doc].append(docs[doc][j])

print(doc_parsed[0][:500])
#docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
#print(docs[0][:500])

# Remove words that are only one character.
#docs = [[token for token in doc if len(token) > 1] for doc in docs]
print("Sample abridged doc: \n\n", docs[0][:500])


There are:  1  documents in the corpus folder

Document Names:  ['Laplagne-Barris_FRconq_s1v07_3.txt']
Sample abridged doc: 

 provisoires du Bardo par des écuries permanentes; terminer la caserne G; construire des casernes pour , hommes de divers corps, des écuries pour chevaux, des pavillons pour officiers et une prison pour détenus; ° Construire les bâtiments des accessoires de hôpital militaire; . A terminer les magasins aux vivres et construire des magasins à orge. Ces divers travaux nécessiteront une dépense environ ,, francs. SETIF. FORTIFICATIONS. TRAVAUX EXÉCUTÉS EN . S ler. Travaux ordinaires. On a terminé les
provisoires du Bardo par des écuries permanentes; terminer la caserne G; construire des casernes pour , hommes de divers corps, des écuries pour chevaux, des pavillons pour officiers et une prison pour détenus; ° Construire les bâtiments des accessoires de hôpital militaire; . A terminer les magasins aux vivres et construire des magasins à orge. Ces divers travaux néces

In [145]:
print(len(docs))
print(docs)

1
["provisoires du Bardo par des écuries permanentes; terminer la caserne G; construire des casernes pour , hommes de divers corps, des écuries pour chevaux, des pavillons pour officiers et une prison pour détenus; ° Construire les bâtiments des accessoires de hôpital militaire; . A terminer les magasins aux vivres et construire des magasins à orge. Ces divers travaux nécessiteront une dépense environ ,, francs. SETIF. FORTIFICATIONS. TRAVAUX EXÉCUTÉS EN . S ler. Travaux ordinaires. On a terminé les tours -- du réduit. intérieur de cette dernière tour a été converti en silos. On a continué le travail de enceinte de la ville; la portion comprise entre la porte du Sud, sur le front -, et le bastion , est élevée jusà la hauteur des créneaux. On a réparé une brèche de enceinte romaine, à la gorge du quartier militaire, et fait des dépenses entretien. ( ) On a construit des prolonges, des camions pour exploitation des bois coupés dans les forêts du Bou- Thaleb. On a dressé le plan directeur

### **Tokenization**

In [117]:
# Tokenization

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

#data_words = list(sent_to_words(data))
doc_words = list(sent_to_words(docs)) # returns list of [list of words]
max_words = 50

#print(data_words[:1])
print(doc_words[0][:max_words]) # prints first max_words in first doc

# 2D List Sentence Datatype:
# list of sentences, where each list element contains word elements

'''
=============================
'''
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

[]


In [74]:
# Use a Unigram Model
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [80]:
# Define functions for stopwords, bigrams, trigrams and lemmatization

def remove_stopwords(texts, stop_words):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [84]:
# Install LLMs
'''
!python -m spacy download fr_core_news_sm # French
!python -m spacy download en_core_web_sm # English
'''
'''
DOESN'T WORK
'''
!pip install fr-core-news-sm
!pip install en_core_web_sm


[31mERROR: Could not find a version that satisfies the requirement fr-core-news-sm (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for fr-core-news-sm[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement en_core_web_sm (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for en_core_web_sm[0m[31m
[0m

### **Lemmatization**

In [116]:
# Continue preprocess data: remove STOP WORDS
# Make N-grams

# Remove Stop Words
stop_words = ['.', ',']
data_words_nostops = remove_stopwords(data_words, stop_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('fr_core_news_sm')
# for French LLM: fr_core_news_sm
# for French LLM: en_core_web_sm

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

OSError: [E050] Can't find model 'fr_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

### Word to Vector

In [85]:
# import data
sentences = list(sent_to_words(data))

# train the model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# print the learned vocabulary
vocab = model.wv.index_to_key
word_vectors = [model.wv.get_vector(word) for word in vocab]

print(random.sample(vocab, k=10))

sample_word = 'try'
sample_vect = model.wv[sample_word] if sample_word in vocab else None

# get the vector representation of a word
print(sample_vect)

['benefitting', 'qtpgt', 'cete', 'technically', 'anaysis', 'orbitals', 'differ', 'hitoshi', 'strangeness', 'wppk']
[ 1.2043434   0.45090935  0.6814459   0.71514285 -0.20565455  0.04899871
 -1.2390375   1.3187934  -1.0867342  -1.1873775   0.7780586  -1.1127293
 -0.70415527 -1.469049    2.66402    -0.2349517  -1.2596971  -2.065122
 -1.2119664  -0.35730192  1.194244    0.541137    0.9512737   1.8481481
 -1.7076167   0.9615651  -0.24181955  1.0372014   0.13068132  0.3456982
  1.2665006  -1.5543257  -1.101264    0.8776683   1.2107173   2.3477097
  1.6224908   0.60920066  0.13322088 -1.2307758   0.44541365  0.43859813
  0.54882747  0.9254551  -0.14761221  0.00746863 -1.1300443  -0.00929341
  1.5262134  -1.0511706   0.254548   -1.1940298  -0.01581612  0.56519884
 -1.5968169   0.82404643  1.2795249  -1.4314567  -0.7828622   1.1949501
 -0.71451455  0.44535488 -0.48009786 -0.15477279 -0.9401595   1.1043555
 -0.13124456  0.44604433 -1.4503627   1.5139769  -0.78384197  1.2473552
 -0.04288181  0.30