In [1]:
# -*- coding:utf-8 -*-
"""
        LDA Practise
        
        使用示例：
            # 已知的推特
            known_tweets = \
            '''
            first tweet balabala ...
    
            second tweet balabala ...
        
            third tweet balabala ...
            
            ...
            '''
            
            # 待分类的推特
            test_tweets = \
            '''
            Hillary is happy for the Election result!

            Obama is not supporting Trump.
            '''
            
            # 直接获取主题
            lda = LDA(df, num_topics=5)
            lda.get_topic(test_tweets)
            # 结果示例
            [(0, 0.8397428), (0, 0.7999622)]
            # 结果解释：第一条推特有84.0%的概率属于第0个主题；第二条推特也有80%的概率属于第0个主题
"""
from gensim import corpora, models, similarities
import gensim
import numpy as np
import pandas as pd
import re
import io

In [2]:
def clean_tweet_text(text):
    text = text.replace('\n'," ") 
    text = re.sub(r"\.+", " ", text)  # 省略号 (如 watching...never 不能成为 watchingnever)
    text = re.sub(r"-", " ", text) 
    text = re.sub(r"\d+/\d+/\d+", "", text) 
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text) 
    text = re.sub(r"[\w]+@[\.\w]+", "", text) 
    text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)
    pure_text = ''
    for letter in text:
        if letter.isalpha() or letter==' ':
            pure_text += letter
    text = ' '.join(word for word in pure_text.split() if len(word)>1)
    return text

class LDA:
    stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours', 
                'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their', 
                'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once', 
                'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you', 
                'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will', 
                'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be', 
                'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself', 
                'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both', 
                'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn', 
                'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about', 
                'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn', 
                'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']

    def __init__(self, df, num_topics=1):
        """
        初始化
        df: 文件对象或文件内容字符串
        num_topics: 目标主题数量
        """
        if not isinstance(df, io.IOBase):  
            if isinstance(df, str):
                df = df.split("\n")
            else:
                return
        
        corpus, dictionary = self.get_corpus(df)

        self.lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)

    def get_topic(self, tweets):
        """
        获取主题概率列表
        """
        if not isinstance(tweets, io.IOBase):  
            if isinstance(tweets, str):
                tweets = tweets.split("\n")
            else:
                return
            
        corpus, dictionary = self.get_corpus(tweets)
        
        topics = [{t[0]: t[1] for t in self.lda.get_document_topics(bow)} for bow in corpus]
        print(topics)
        closest_topics = []
        for topic_items in topics:
            posibilities = [psb for psb in topic_items.values()]
            max_psb_index = posibilities.index(max(posibilities))
            closest_topic = (max_psb_index, topic_items[max_psb_index])
            closest_topics.append(closest_topic)
        return closest_topics
#         return list(self.lda.get_document_topics(corpus))

    def get_corpus(self, df):
        # 由输入生成文档列表（这里每一个文档就是一条推特）
        docs = [clean_tweet_text(line) for line in df if line]
        doclist = np.array(docs)
        
        # 由文档列表生成词袋
        texts = [[word for word in doc.lower().split() if word not in LDA.stoplist] for doc in doclist]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        
        return corpus, dictionary

In [3]:
# df = open("./input/HillaryTweets.txt")
df = \
"""
To all the little girls watching...never doubt that you are valuable and powerful & deserving of every chance & opportunity in the world.

I was greeted by this heartwarming display on the corner of my street today. Thank you to all of you who did this. Happy Thanksgiving. -H

Hoping everyone has a safe & Happy Thanksgiving today, & quality time with family & friends. -H

Scripture tells us: Let us not grow weary in doing good, for in due season, we shall reap, if we do not lose heart.

Let us have faith in each other. Let us not grow weary. Let us not lose heart. For there are more seasons to come and...more work to do

We have still have not shattered that highest and hardest glass ceiling. But some day, someone will

To Barack and Michelle Obama, our country owes you an enormous debt of gratitude. We thank you for your graceful, determined leadership

Our constitutional democracy demands our participation, not just every four years, but all the time

You represent the best of America, and being your candidate has been one of the greatest honors of my life

Last night I congratulated Donald Trump and offered to work with him on behalf of our country

Already voted? That's great! Now help Hillary win by signing up to make calls now

It's Election Day! Millions of Americans have cast their votes for Hillary—join them and confirm where you vote

We don’t want to shrink the vision of this country. We want to keep expanding it

We have a chance to elect a 45th president who will build on our progress, who will finish the job

I love our country, and I believe in our people, and I will never, ever quit on you. No matter what

"""

In [4]:
test_tweets = \
"""
Hillary is happy for the Election result!

Obama is not supporting Trump.
"""
LDA(df, num_topics=5).get_topic(test_tweets)

[{0: 0.040005475, 1: 0.040008996, 2: 0.83909035, 3: 0.040886905, 4: 0.040008295}, {0: 0.050005645, 1: 0.050009314, 2: 0.79996574, 3: 0.050010726, 4: 0.050008588}]


[(2, 0.83909035), (2, 0.79996574)]