In [63]:
import pandas as pd
from cleaning import clean_doc
import numpy as np

from nlp import NLPPipe
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [64]:
# Read the cleaned article corpus: a tab-separated file whose first column is the index.
articles = pd.read_csv(
    '../src/clean_text.csv',
    sep='\t',
    index_col=0,
)

In [65]:
# Preview the first rows to confirm the columns were parsed as expected.
articles.head()

Unnamed: 0,article_id,date,text
0,42343,2020-09-03,data aggregation is the process of gathering d...
1,42356,2020-09-04,there are thousand of dataset repository on th...
2,42424,2020-09-05,one key operation in preparing the datasets in...
3,42431,2020-09-05,this is a follow up of my introduction to the ...
4,42445,2020-09-06,structured query language sql is famously know...


In [90]:
# Start from NLTK's English stop words and add corpus-specific terms that should
# also be ignored by the vectorizer.
stop_words = stopwords.words('english')
words_to_add = ['file', 'python', 'code','wa', 'people','need', 'model', 'models']
# extend() adds all terms in one call and leaves no stray loop variable in the namespace.
stop_words.extend(words_to_add)

In [91]:
# TF-IDF features (using the custom stop-word list) feeding a 30-component NMF model.
vectorizer = TfidfVectorizer(stop_words=stop_words)
# NMF relies on a random number generator; pin the seed so topic numbering and
# weights are reproducible after a kernel restart.
nmf = NMF(n_components=30, random_state=42)
pipe = NLPPipe(
    cleaning_function=clean_doc,
    vectorizer=vectorizer,
    model=nmf
)

In [92]:
# Fit the pipe's vectorizer on the `text` column and keep the resulting document-term matrix.
# NOTE(review): the vectorizer is called directly, so the pipe's `cleaning_function` is not
# invoked here — presumably the CSV text is already cleaned; confirm.
dtm = pipe.vectorizer.fit_transform(articles['text'])

In [93]:
# Fit the pipe's model on the document-term matrix; the result is used below as the
# per-article topic weights (one row per article, one column per topic).
topic_results = pipe.model.fit_transform(dtm)

In [94]:
# Feature (word) names of the fitted vectorizer, as a plain list of strings.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# `get_feature_names_out()` when the installed version provides it and fall back otherwise.
_tfidf = pipe.vectorizer
if hasattr(_tfidf, 'get_feature_names_out'):
    vocab = _tfidf.get_feature_names_out().tolist()
else:
    vocab = _tfidf.get_feature_names()

In [105]:
# Persist the pipe to disk via NLPPipe.save_pipe under the given path.
pipe.save_pipe('../models/nmf_model')

In [108]:
# Round-trip check: default-construct a pipe and restore the saved state from disk.
# NOTE(review): save_pipe was given '../models/nmf_model' but this loads
# '../models/nmf_model.mdl' — presumably save_pipe appends the '.mdl' suffix; confirm in nlp.NLPPipe.
test = NLPPipe()
test.load_pipe(filename='../models/nmf_model.mdl')

In [110]:
# Number of features in the reloaded vectorizer. The fitted `vocabulary_` mapping has one
# entry per feature, so its length gives the count without calling `get_feature_names()`,
# which was removed in scikit-learn 1.2.
len(test.vectorizer.vocabulary_)

157001

In [96]:
# Print the 30 highest-weighted words for every topic of the fitted model.
# NOTE(review): display_topics is defined in a cell further down this notebook; on a
# fresh kernel (Restart & Run All) this call raises NameError unless that cell runs first.
display_topics(pipe.model, vocab, 30)


Topic  0
time, one, would, like, thing, get, work, could, way, know, think, even, make, something, day, might, really, want, problem, much, good, lot, go, question, going, take, look, see, say, many

Topic  1
image, images, pixel, convolution, color, convolutional, style, cnn, filter, augmentation, vision, face, layer, size, transfer, pooling, opencv, channel, segmentation, recognition, rgb, computer, vgg, trained, dog, label, cat, picture, folder, classification

Topic  2
learning, machine, deep, algorithm, ml, learn, course, neural, supervised, book, computer, language, min, unsupervised, intelligence, artificial, problem, read, algorithms, task, knowledge, reinforcement, field, networks, programming, concept, research, library, network, vision

Topic  3
app, api, web, command, notebook, project, install, page, create, package, google, use, jupyter, run, click, html, script, py, folder, environment, text, using, github, server, directory, request, library, flask, url, git

Topic  4


In [57]:
def display_topics(model, feature_names, no_top_words, topic_names=None, show_weights=False):
    """
    Prints the highest-weighted words of every topic of a fitted topic model.

    model: fitted model exposing `components_`, one row of word weights per topic (SVD, NMF)
    feature_names: sequence mapping a column index of `components_` to its word
    no_top_words: number of words to show per topic
    topic_names: optional topic labels indexed by topic number; a missing or empty
        entry (including a list shorter than the number of topics) falls back to
        printing the topic index
    show_weights: True to print (word, weight) pairs with weights rounded to 5
        decimals instead of a comma-separated list of words
    """
    for ix, topic in enumerate(model.components_):
        # Guard the lookup so a partially filled set of names does not raise.
        try:
            label = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            label = None

        if not label:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", label, "'")

        # argsort is ascending, so walk it backwards from the end to get the
        # `no_top_words` largest weights in descending order.
        top_ixs = topic.argsort()[:-no_top_words - 1:-1]

        if show_weights:
            print([(feature_names[i], topic[i].round(5)) for i in top_ixs])
        else:
            print(", ".join([feature_names[i] for i in top_ixs]))

In [55]:
# For every topic, the vocabulary indices ordered by ascending weight.
words = list(map(np.argsort, pipe.model.components_))

In [98]:
# Document-topic weights as a labelled table, one row per article id.
# The number of topic columns is taken from `topic_results` itself rather than a
# hard-coded 30, so this cell keeps working if the model's component count changes.
n_topics = topic_results.shape[1]
doc_topic_mat = pd.DataFrame(
    topic_results,
    columns=[f'Topic {i}' for i in range(n_topics)],
    index=articles.article_id,
)

In [99]:
# Display the document-topic weight table.
doc_topic_mat

Unnamed: 0_level_0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,...,Topic 20,Topic 21,Topic 22,Topic 23,Topic 24,Topic 25,Topic 26,Topic 27,Topic 28,Topic 29
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42343,0.000000,0.000000,0.000000,0.000000,0.000000,0.026794,0.000000,0.000000,0.000000,0.000000,...,0.037588,0.000000,0.000000,0.000000,0.000000,0.000000,0.019837,0.000000,0.000000,0.000000
42356,0.000000,0.001041,0.000000,0.017656,0.000000,0.013377,0.000000,0.034122,0.009723,0.000000,...,0.000000,0.000000,0.000486,0.000000,0.000000,0.000000,0.000000,0.015989,0.000000,0.007457
42424,0.000000,0.000000,0.000000,0.000000,0.000000,0.000127,0.000000,0.000000,0.000000,0.000000,...,0.124587,0.050166,0.000000,0.000000,0.000000,0.000000,0.000000,0.002118,0.000000,0.000000
42431,0.003051,0.000000,0.000000,0.000000,0.000000,0.013785,0.000443,0.000000,0.000000,0.000000,...,0.011021,0.000000,0.000281,0.012307,0.000000,0.000000,0.000000,0.000000,0.000000,0.010401
42445,0.002229,0.000000,0.000000,0.000000,0.000000,0.020910,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.148953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42291,0.006118,0.000000,0.000000,0.001568,0.000000,0.003258,0.003031,0.000000,0.000000,0.003085,...,0.003591,0.001500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
42303,0.008548,0.000000,0.000000,0.000000,0.000416,0.008736,0.005694,0.002180,0.000680,0.000000,...,0.000648,0.002010,0.001475,0.000000,0.001695,0.000380,0.000000,0.000000,0.000000,0.000524
42315,0.004919,0.000000,0.000000,0.000000,0.001852,0.035587,0.000000,0.000700,0.000000,0.000000,...,0.000000,0.016961,0.000000,0.000000,0.009622,0.018686,0.026147,0.006165,0.000000,0.000000
42324,0.000000,0.000000,0.000000,0.014829,0.036062,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.006374,0.007678,0.000000,0.002436,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [97]:
from collections import Counter

# Count how many articles have each topic as their strongest component.
dominant_topic = np.argmax(topic_results, axis=1)
Counter(dominant_topic)

Counter({18: 1189,
         7: 1273,
         20: 1975,
         5: 2344,
         29: 1001,
         21: 1177,
         26: 1211,
         12: 724,
         16: 864,
         27: 1744,
         13: 1396,
         2: 1542,
         14: 1024,
         22: 612,
         9: 774,
         17: 834,
         1: 1187,
         3: 2076,
         23: 995,
         4: 1997,
         10: 687,
         8: 1722,
         19: 893,
         6: 1730,
         24: 787,
         25: 741,
         0: 1357,
         28: 449,
         11: 643,
         15: 675})

In [104]:
# Write each article's dominant topic (argmax over its topic weights) to CSV,
# with article_id turned back into a regular column.
(
    pd.DataFrame(
        np.argmax(topic_results, axis=1),
        index=articles.article_id,
        columns=['topic'],
    )
    .reset_index()
    .to_csv('../src/nmf_topics.csv')
)