In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on:
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
#   punkt / punkt_tab -> word_tokenize
# NOTE(review): 'words' is downloaded but no visible cell uses it.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# 'punkt'; requesting both keeps word_tokenize working on old and new versions.
nltk.download('punkt_tab')

In [None]:
import re
import numpy as np
import pandas as pd
import os
import string
import spacy
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
%matplotlib inline

In [None]:
# Matches any single character that is not an ASCII letter; the preprocessing
# step substitutes a space for each match before tokenizing.
regex = re.compile('[^a-zA-Z]')

In [None]:
# Instantiate spaCy's English language class.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook.
nlp = English()

In [None]:
# NLTK's English stop-word list, held as a set so the per-token membership
# test in the preprocessing step is a constant-time lookup.
stop_words = set(stopwords.words('english'))

In [None]:
# Single WordNet lemmatizer instance reused for every token in the
# preprocessing step.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Read the raw CSV export with pandas' pure-Python parser engine.
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook runs elsewhere.
csv_path = 'C:/Users/Cactuar/Downloads/15k_send_to_mh.csv'
grdf = pd.read_csv(csv_path, engine='python')

In [None]:
# Keep only the 'descrip' column; from here on `grdf` is a Series.
# The guard makes the cell safe to re-run: once `grdf` is already a Series,
# indexing it with 'descrip' again would raise a KeyError.
if isinstance(grdf, pd.DataFrame):
    grdf = grdf['descrip']

In [None]:
def tokenize(text):
    """Parses a string into a list of lower-cased alphanumeric tokens.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted (not replaced by a space, so "well-known" becomes "wellknown").
    The remainder is lower-cased and split on whitespace.

    Args:
        text (str): The string that the function will tokenize.

    Returns:
        list: Lower-cased tokens in their original order; an empty list when
            nothing survives the filtering.
    """
    # The earlier class `[^a-zA-Z ^0-9]` had two flaws: the second `^` is a
    # literal caret inside a character class, so "^" leaked into tokens, and
    # only the space character was kept, so newlines/tabs were deleted and the
    # words on either side of them were glued together.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    return cleaned.lower().split()

In [None]:
# Built once instead of once per word: deletes every ASCII punctuation character.
_strip_punctuation = str.maketrans('', '', string.punctuation)


def preprocess_description(text):
    """Turn one raw description into a list of lower-cased lemmas.

    Args:
        text (str): Raw description text.

    Returns:
        list: Lemmatized tokens in their original order, with English stop
            words removed.
    """
    # Non-letters become spaces so digits and punctuation act as token breaks.
    tokens = word_tokenize(regex.sub(" ", text))

    lemmas = []
    for token in tokens:
        # Lower-case BEFORE the stop-word test: the stop-word set is lower-case,
        # so testing the raw token let capitalised stop words ("The", "And")
        # through, only for them to be lower-cased and kept afterwards.
        word = token.lower()
        if word in stop_words:
            continue
        # Lemmatize as noun, then verb, then adjective, feeding each result
        # into the next pass.
        for pos in ("n", "v", "a"):
            word = wordnet_lemmatizer.lemmatize(word, pos=pos)
        lemmas.append(word.translate(_strip_punctuation))
    return lemmas


# One token list per description, in the same order as `grdf`.
filtered_list = [preprocess_description(val) for val in grdf]

In [None]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word
# list; it is fitted later on the documents read back from disk.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Write each preprocessed document to its own UTF-8 text file, named by its
# position in `filtered_list` (book_doc0.txt, book_doc1.txt, ...).
# NOTE(review): absolute, machine-specific path; consider a configurable
# output directory.
output_dir = "C:/Users/Cactuar/Projects/bb_txt"
os.makedirs(output_dir, exist_ok=True)

for doc_index, doc_tokens in enumerate(filtered_list):
    doc_path = f"{output_dir}/book_doc{doc_index}.txt"
    try:
        # `with` guarantees the handle is closed even if the write fails, and
        # nothing touches an undefined or stale handle when open() itself fails.
        with open(doc_path, "w", encoding="utf-8") as handle:
            # The token list is written via its Python repr
            # (e.g. "['a', 'b']"), exactly as before.
            handle.write(f"{doc_tokens}")
    except OSError:
        # Best effort: report the index of the document that could not be
        # written and carry on with the rest.
        print(f"{doc_index}")

In [None]:
def gather_data(filefolder):
    """Produces a list of document contents from a directory.

    Only regular files whose name ends in ``.txt`` are read. Files are taken
    in natural (numeric-aware) name order, so ``book_doc2.txt`` comes before
    ``book_doc10.txt`` and the result lines up with the index used when the
    documents were written. ``os.listdir`` alone gives no ordering guarantee.

    Args:
        filefolder (str): Path of a directory containing .txt files.

    Returns:
        list: The raw ``bytes`` content of each matching file (files are
            opened in binary mode and not decoded); empty when none match.
    """

    def natural_key(name):
        # re.split with a capturing group alternates text runs (even
        # positions) and digit runs (odd positions), so the key lists always
        # compare str with str and int with int.
        parts = re.split(r'([0-9]+)', name)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]

    data = []

    for article in sorted(os.listdir(filefolder), key=natural_key):

        path = os.path.join(filefolder, article)

        # Require the real ".txt" extension (the old `path[-3:] == 'txt'` test
        # also accepted names such as "footxt") and skip directories.
        if article.endswith('.txt') and os.path.isfile(path):
            with open(path, 'rb') as f:
                data.append(f.read())

    return data

In [None]:
# Read back the text files from the folder the preprocessed documents were
# written to.
# NOTE(review): absolute, machine-specific path; consider a configurable
# directory.
data = gather_data('C:/Users/Cactuar/Projects/bb_txt')

In [None]:
# Fit the TF-IDF vectorizer on the loaded documents and keep the resulting
# document-term matrix (converted to a dense frame in the next step).
sparse = tfidf.fit_transform(data)

In [None]:
# Dense document-term matrix: one row per document, one column per term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# toarray() yields a plain ndarray rather than the legacy numpy.matrix that
# todense() returns; the resulting DataFrame is the same.
dtm = pd.DataFrame(sparse.toarray(), columns=feature_names)

In [None]:
# Build the gensim dictionary (token <-> integer id mapping) from the
# tokenized documents.
id2word = corpora.Dictionary(filtered_list)

In [None]:
# Alias (not a copy) of the tokenized documents; used to build the
# bag-of-words corpus.
texts = filtered_list

In [None]:
# Convert every tokenized document into its bag-of-words representation
# using the dictionary built above.
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Hyper-parameters for the LDA run, gathered in one place so they are easy to
# read and tweak; random_state is fixed for repeatable results.
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    iterations=500,
    random_state=100,
    update_every=1,
    chunksize=200,
    passes=30,
    alpha='auto',
    per_word_topics=False,
)

# Train the topic model on the bag-of-words corpus.
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

In [None]:
# Report how well the model fits the training corpus.
print('Perplexity: ', lda_model.log_perplexity(corpus))
# NOTE: log_perplexity() returns the per-word likelihood bound, not the
# perplexity itself. Perplexity is 2 ** (-bound), so a LOWER perplexity
# corresponds to a bound that is HIGHER (closer to zero).
# Value noted in the original: -13.9373

In [None]:
# Topic coherence (c_v measure) of the trained model, computed against the
# tokenized documents and their dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=filtered_list,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
# Higher is better.
# Values noted in the original: 0.4770, 0.59

In [None]:
# Render the interactive pyLDAvis topic view inline in the notebook.
pyLDAvis.enable_notebook()
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models; confirm against the installed version.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook displays the prepared visualisation.
vis

In [None]:
# Pretty-print the topics reported by the trained model.
pprint(lda_model.print_topics())
# Apply the model to the corpus; presumably this yields the per-document
# topic distributions. Confirm against gensim. Not used by any visible cell.
doc_lda = lda_model[corpus]