In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
def clean_tweet_text(text):
    text = text.replace('\n'," ") #新行，我们是不需要的
    text = re.sub(r"\.+", " ", text)  # 省略号 (如 watching...never 不能成为 watchingnever)
    text = re.sub(r"-", " ", text) #把 "-" 的两个单词，分开。（比如：pre-processing ==> pre processing）
    text = re.sub(r"\d+/\d+/\d+", "", text) #日期，对主体模型没什么意义
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text) #时间，没意义
    text = re.sub(r"[\w]+@[\.\w]+", "", text) #邮件地址，没意义
    text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text) #网址，没意义
    pure_text = ''
    # 以防还有其他特殊字符（数字）等等，我们直接把他们loop一遍，过滤掉
    for letter in text:
        # 只留下字母和空格
        if letter.isalpha() or letter==' ':
            pure_text += letter
    # 再把那些去除特殊字符后落单的单词，直接排除。
    # 我们就只剩下有意义的单词了。
    text = ' '.join(word for word in pure_text.split() if len(word)>1)
    return text

In [3]:
# df = open("./input/HillaryTweets.txt")
df = \
"""
To all the little girls watching...never doubt that you are valuable and powerful & deserving of every chance & opportunity in the world.

I was greeted by this heartwarming display on the corner of my street today. Thank you to all of you who did this. Happy Thanksgiving. -H

Hoping everyone has a safe & Happy Thanksgiving today, & quality time with family & friends. -H

Scripture tells us: Let us not grow weary in doing good, for in due season, we shall reap, if we do not lose heart.

Let us have faith in each other. Let us not grow weary. Let us not lose heart. For there are more seasons to come and...more work to do

We have still have not shattered that highest and hardest glass ceiling. But some day, someone will

To Barack and Michelle Obama, our country owes you an enormous debt of gratitude. We thank you for your graceful, determined leadership

Our constitutional democracy demands our participation, not just every four years, but all the time

You represent the best of America, and being your candidate has been one of the greatest honors of my life

Last night I congratulated Donald Trump and offered to work with him on behalf of our country

Already voted? That's great! Now help Hillary win by signing up to make calls now

It's Election Day! Millions of Americans have cast their votes for Hillary—join them and confirm where you vote

We don’t want to shrink the vision of this country. We want to keep expanding it

We have a chance to elect a 45th president who will build on our progress, who will finish the job

I love our country, and I believe in our people, and I will never, ever quit on you. No matter what

""".split("\n")

In [4]:
docs = [clean_tweet_text(line) for line in df if line != "\n"]

In [5]:
doclist = np.array(docs)

In [6]:
from gensim import corpora, models, similarities
import gensim

In [7]:
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours', 
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their', 
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once', 
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you', 
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will', 
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be', 
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself', 
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both', 
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn', 
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about', 
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn', 
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']

In [8]:
texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in doclist]

In [9]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [10]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)

In [16]:
_texts = [["hillary", "is", "happy", "for", "the", "election", "result"], 
         ["obama", "is", "not", "supporting", "trump"]]
dictionary = corpora.Dictionary(_texts)
corpus = [dictionary.doc2bow(text) for text in _texts]
corpus
list(lda.get_document_topics(corpus))
lda.show_topics()

[(0,
  '0.073*"us" + 0.061*"let" + 0.033*"work" + 0.030*"grow" + 0.029*"lose" + 0.029*"heart" + 0.028*"weary" + 0.019*"faith" + 0.019*"come" + 0.019*"day"'),
 (1,
  '0.031*"country" + 0.031*"want" + 0.017*"thank" + 0.017*"happy" + 0.017*"today" + 0.017*"make" + 0.017*"thanksgiving" + 0.017*"already" + 0.017*"greeted" + 0.017*"matter"'),
 (2,
  '0.025*"chance" + 0.025*"time" + 0.025*"progress" + 0.025*"president" + 0.025*"four" + 0.025*"demands" + 0.025*"participation" + 0.025*"elect" + 0.025*"job" + 0.025*"years"'),
 (3,
  '0.030*"time" + 0.030*"thanksgiving" + 0.030*"quality" + 0.030*"friends" + 0.030*"everyone" + 0.030*"safe" + 0.030*"family" + 0.030*"today" + 0.030*"hoping" + 0.030*"happy"'),
 (4,
  '0.025*"country" + 0.025*"every" + 0.025*"never" + 0.025*"chance" + 0.025*"opportunity" + 0.025*"deserving" + 0.025*"world" + 0.025*"little" + 0.025*"leadership" + 0.025*"girls"')]