# Code for topic extraction based on unsupervised learning 



A topic model aims to extract themes from a large-scale, unstructured corpus. A topic is a core idea expressed in the corpus. From a statistical point of view, each topic is described by a frequency distribution over words. A sentence, a paragraph or a whole document is regarded as being generated from a probabilistic model over topics and words. 
    
In the field of topic modelling, the most commonly used algorithm is LDA (Latent Dirichlet Allocation), which was proposed by Blei et al. in 2003. Specifically, LDA is an unsupervised machine learning technique that regards each document as a mixture of latent topics, where each topic is a distribution over words. LDA uses the BoW (Bag of Words) method to transform each document into a word-frequency vector, thus converting unstructured text into numerical information that is easy to model.

In [None]:
# Import related modules 
import fool  # Word segmentation model (provides the fool.cut call used for segmentation)
import pandas as pd  # Used to load and process tabular data (the corpus is read from a CSV file)
import pyLDAvis  # Topic model visualization tool  
import pyLDAvis.sklearn  # pyLDAvis's sklearn interface, sklearn = Scikit-learn
# TfidfVectorizer transforms the document into TF-IDF matrix, 
# and CountVectorizer transforms the document into the word frequency matrix. 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# online variational Bayes Algorithm
from sklearn.decomposition import LatentDirichletAllocation


In [None]:
# Read file.csv into a DataFrame whose column is labelled 'sentence'.
df = pd.read_csv('file.csv', names=['sentence'])
df.tail()  # look at the last rows of the loaded data
df.shape  # (number of rows, number of columns)

## 1.Word segmentation

In [None]:
# Word segmentation helper
def word_cut(sentence):
    """Segment *sentence* with ``fool.cut`` and return the pieces joined by single spaces."""
    pieces = fool.cut(sentence)
    return " ".join(pieces)

# Segment every sentence and store the space-separated result in a new column.
df['sentence_cutted'] = df['sentence'].apply(word_cut)
df['sentence_cutted'].head()  # preview the first segmented rows

## 2.Remove the stop word

In [None]:
# Load the stop-word list: one stop word per line of stopwords.txt.
# NOTE(review): the file is assumed to be UTF-8 encoded; without an explicit
# encoding the result depends on the platform default. Change `encoding` if the
# file was saved differently.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    stripped_lines = (line.strip() for line in f)
    # The set removes duplicates, blank lines are skipped so that the empty
    # string never becomes a stop word, and sorting makes the order reproducible.
    stopwords = sorted({word for word in stripped_lines if word})
print(len(stopwords))

## 3.Document matrix transformation 

In [None]:
%%time
n_features = 1000  # upper bound on the vocabulary size
# Bag-of-words model: turn every segmented sentence into a vector of term counts.
tf_vectorizer = CountVectorizer(
    stop_words=stopwords,      # stop words may be passed as a list
    strip_accents='unicode',   # accent stripping: 'ascii', 'unicode' or None
    min_df=10,                 # integer cut-off: drop terms found in fewer than 10 documents (author also noted 10, 6, 9)
    max_df=0.80,               # drop terms found in more than 80% of the documents (author also noted 0.50, 0.75, 0.70)
    max_features=n_features,   # keep only the n_features most frequent terms of the corpus
)
# Learn the vocabulary and return the document-term count matrix.
tf = tf_vectorizer.fit_transform(df['sentence_cutted'])

In [None]:
%%time
n_features = 1000  # upper bound on the vocabulary size
# TF-IDF model configured with the same filtering options as the count vectorizer.
tf_idf_vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    strip_accents='unicode',
    min_df=10,
    max_df=0.80,
    max_features=n_features,
    use_idf=True,  # apply inverse-document-frequency re-weighting
)
# Learn the vocabulary and IDF weights, and return the document-term TF-IDF matrix.
tf_idf = tf_idf_vectorizer.fit_transform(df['sentence_cutted'])

## 4.Topic model training 

In [None]:
%%time
n_components = 4  # Number of topics
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',  # either 'batch' or 'online'
    learning_offset=50.0,      # only used by the online method
    max_iter=50,               # maximum number of iterations
    random_state=0,            # fixed seed so that runs are repeatable
    n_jobs=-1,                 # use every available processor
)
# Fit the topic model on the term-count matrix.
lda.fit(tf)
# Show the full parameter set the model was configured with.
lda.get_params()

## 5.View the theme 

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in *model*.

    For each row of ``model.components_`` a header line ``theme #<index>:`` is
    printed, followed by one line holding the ``n_top_words`` feature names
    with the largest weights, heaviest first and separated by spaces. A single
    blank line is printed once all topics have been listed.

    Args:
        model: object exposing ``components_`` (one weight row per topic).
        feature_names: sequence mapping a column index to its word.
        n_top_words: how many words to show per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest_first = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[idx] for idx in heaviest_first]
        print('theme #%d:' % topic_no)
        print(' '.join(top_words))
    print()

n_top_words = 20
# Mapping from column index to word. get_feature_names() was removed in
# scikit-learn 1.2, so prefer its replacement get_feature_names_out() and fall
# back to the old method on releases that do not provide it yet.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
# NOTE: the vocabulary of tf_idf_vectorizer can be retrieved the same way if
# the topics should be labelled from the TF-IDF features instead.
print_top_words(lda, tf_feature_names, n_top_words)

In [None]:
# For every topic, list its 50 heaviest words with their normalised weights and
# then print the same information as a single '"word"*weight + ...' formula.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the old method on older releases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    # Sum of all word weights of this topic, used to turn weights into shares.
    # (Named explicitly: `_` is IPython's "last result" variable.)
    topic_total = topic.sum()
    print('==========', 'theme #%d'%topic_idx, '==========')
    # Indices of the 50 largest weights, heaviest first.
    target = topic.argsort()[::-1][:50]
    terms = []
    for item in target:
        share = topic[item] / topic_total
        print(tf_feature_names[item], ',', share)
        terms.append('"%s"*%f'%(tf_feature_names[item], share))
    # Joining avoids the trailing ' + ' that repeated concatenation leaves behind.
    formula = ' + '.join(terms)
    print(formula, '\n')

## 6.Interactive topic model      

How to read the interactive topic model visualization  
Each circle on the left represents a topic.  
The size of a circle represents the proportion of the articles covered by that topic.  
When no topic is hovered over, the right-hand panel shows the 30 most important keywords extracted from the entire text.  
When hovering over a topic, the red bars on the right indicate the frequency of each keyword within that topic. 

In [None]:
%%time
# Turn on pyLDAvis display support for the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualization from the fitted LDA model, the term-count matrix
# and the vectorizer that produced it.
# NOTE(review): newer pyLDAvis releases may no longer ship the `sklearn`
# submodule under this name - confirm against the installed version.
vis = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(vis, 'vis.html')  # also save the visualization to vis.html
# Author's note: because of how the visualization library works, the saved web
# page refers to its CSS and JS by path, so those reference paths have to be
# kept alongside the saved file (presumably so the page still renders - verify).

### Edited by:

#### Yuru LI
the Communication University of China

Laboratory of Data Mining and Social Computing