In [45]:
import sys,os
import numpy as np
import pyLDAvis
from biterm.btm import oBTM 
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary # helper functions

import pandas as pd
import preprocessor as p
import re
import numpy as np

import string
import nltk

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

import stop_words
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from textblob import Word
from nltk.stem import WordNetLemmatizer
import wordsegment 

wordsegment.load()
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

stop_words = stop_words.get_stop_words('en') + nltk.corpus.stopwords.words('english')
stop_words = list(set(list(ENGLISH_STOP_WORDS) + stop_words + list(string.ascii_lowercase)))
lemmatizer = WordNetLemmatizer()

wordNet_lemmatizer = lambda x: " ".join([ lemmatizer.lemmatize(word) for word in x.split()])
texblob_lemmatizer = lambda x: " ".join([Word(word).lemmatize() for word in x.split()])

def segment_harsh_tags(tags):
    return " ".join([" ".join(wordsegment.segment(t)) for t in str(tags).split(" ") ])

def remove_links(tweet):
    tweet = re.sub(r'http\S+', ' ', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', ' ', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    tweet = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', ' ', tweet) # remove retweet
    tweet = re.sub(r'(@[A-Za-z]+[A-Za-z0-9-_]+)', ' ', tweet) # remove tweeted at
    return tweet

def clean_tweets(tweet):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.encode('ascii', 'ignore').decode('ascii') # remove emojis
    tweet = tweet.lower() # lower case
    tweet = re.sub(r'['+string.punctuation+ ']+', ' ', tweet) # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet) #remove double spacing
    tweet = re.sub(r'([0-9]+)', '', tweet) # remove numbers
#     tweet = wordNet_lemmatizer(tweet)
    tweet_token_list = [word for word in tweet.split(' ')if word not in stop_words and len(word)>2] # remove stopwords
    tweet = ' '.join(tweet_token_list)
    tweet = p.clean(tweet)
    return tweet

def get_users_topics(texts,num_topics=10 ):
    vec = CountVectorizer(stop_words='english')    # vectorize texts
    X = vec.fit_transform(texts).toarray()
    vocab = np.array(vec.get_feature_names())     # get vocabulary
    biterms = vec_to_biterms(X)    # get biterms
    btm = oBTM(num_topics=num_topics, V=vocab)    # create btm

    print("Train Online BTM ..")
    for i in range(0, len(biterms), 100): # prozess chunk of 200 texts
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    summary = topic_summuary(btm.phi_wz.T, X, vocab, num_topics, verbose=False)
    return [t.argmax() for t in topics], summary


all_user_data_df = pd.read_csv("data/tweets_info.csv")
all_user_data_df['segmented_harshtags']=all_user_data_df['harsh_tags'].map(segment_harsh_tags)
all_user_data_df['segmented_harshtags']=all_user_data_df['segmented_harshtags'].map(clean_tweets)

texts_df = pd.read_csv("data/tweets_standalone.csv")
texts_df['clean_tweets'] = texts_df['tweets'].map(clean_tweets) 
texts_df['topics'] = ''
user_topics_df = pd.DataFrame(columns=["follower", "topic_id", "coherence", "top_words", "theme"])

for user in texts_df['followers_screen_name'].unique():
    print("processing for user:{} with {} tweets".format(user, len(texts_df)))

    user_query = texts_df['followers_screen_name']==user
    text = list(texts_df[user_query]['clean_tweets'])

    #actual topic modelling
    topics, summary = get_users_topics(text, num_topics=10)

    for i in range(len(summary['coherence'])):
        user_topics_df=user_topics_df.append({
                           "follower":user, 
                            "topic_id":i,
                           "coherence":summary['coherence'][i], 
                           "top_words":" ".join( summary['top_words'][i])
                          }, 
                          ignore_index=True)
    texts_df.loc[user_query, 'topics']=topics
#     texts_df[user_query]['topics'] = topics
    print("\n")
        
#     print("\n\n Texts & Topics ..")
#     for i in range(len(texts)):
#         print("{} (topic: {})".format(texts[i], topics[i].argmax()))

processing for user:MrMarketingPhD with 441 tweets
Train Online BTM ..


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:49<00:00,  1.00s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:31<00:00,  1.69it/s]




processing for user:Garrettsrock_24 with 441 tweets
Train Online BTM ..


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 20.55it/s]




processing for user:AmyFox04307552 with 441 tweets
Train Online BTM ..


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:18<00:00,  2.88it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:11<00:00,  4.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 659.66it/s]




processing for user:notyouravgwoman with 441 tweets
Train Online BTM ..


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 19.62it/s]






In [44]:
# user_topics_df.to_csv("user_topics.csv")
# texts_df.to_csv("tweets_user.csv")

topic_freq_df = texts_df\
                    .groupby(['followers_screen_name','topics'],as_index=False)\
                    .size()\
                    .reset_index()\
                    .rename(columns={0: 'topic_freq_per_user'})
top_n_topics_per_user = topic_freq_df\
                    .groupby(['followers_screen_name'])['topic_freq_per_user'].nlargest(3)\
                    .reset_index()\
                    .rename(columns={"level_1": 'topics'})

# top_n_topics_per_user
# topic_freq_df
# user_topics_df
# # texts_df
# all_user_data_df

In [35]:
txt = all_user_data_df['segmented_harshtags'][1]
topics, summary = get_users_topics([txt], num_topics=3)
summary['top_words']

Train Online BTM ..


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:03<00:00, 13.29it/s]


[array(['nsmq', 'thousand', 'xij'], dtype='<U12'),
 array(['xij', 'city', 'iba'], dtype='<U12'),
 array(['xij', 'city', 'iba'], dtype='<U12')]

In [7]:
import sys,os
import numpy as np
import pyLDAvis
from biterm.btm import oBTM 
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary # helper functions

import pandas as pd
import preprocessor as p
import re
import numpy as np

import string
import nltk

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

import stop_words
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from textblob import Word
from nltk.stem import WordNetLemmatizer
import wordsegment 

wordsegment.load()
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

stop_words = stop_words.get_stop_words('en') + nltk.corpus.stopwords.words('english')
stop_words = list(set(list(ENGLISH_STOP_WORDS) + stop_words + list(string.ascii_lowercase)))
lemmatizer = WordNetLemmatizer()

wordNet_lemmatizer = lambda x: " ".join([ lemmatizer.lemmatize(word) for word in x.split()])
texblob_lemmatizer = lambda x: " ".join([Word(word).lemmatize() for word in x.split()])

def remove_links(tweet):
    tweet = re.sub(r'http\S+', ' ', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', ' ', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    tweet = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', ' ', tweet) # remove retweet
    tweet = re.sub(r'(@[A-Za-z]+[A-Za-z0-9-_]+)', ' ', tweet) # remove tweeted at
    return tweet

def clean_tweets(tweet):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.encode('ascii', 'ignore').decode('ascii') # remove emojis
    tweet = tweet.lower() # lower case
    tweet = re.sub(r'['+string.punctuation+ ']+', ' ', tweet) # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet) #remove double spacing
    tweet = re.sub(r'([0-9]+)', '', tweet) # remove numbers
    tweet = wordNet_lemmatizer(tweet)
    tweet = texblob_lemmatizer(tweet)
    tweet_token_list = [word for word in tweet.split(' ')if word not in stop_words and len(word)>2] # remove stopwords
#     tweet_token_list = [word_rooter(word) if '#' not in word else word for word in tweet_token_list] # apply word rooter
#     if bigrams:
#         tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
#                                             for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    tweet = p.clean(tweet)
#     tweet = word_tokenize(tweet)
    return tweet


if __name__ == "__main__":

    texts = open(os.path.realpath('./data/reuters.titles.txt')).read().splitlines() # path of data file
    texts = pd.read_csv("tweets_standalone.csv")["tweets"]
    texts = [clean_tweets(txt) for txt in list(texts)]
    
    # vectorize texts
    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=10, V=vocab)

    print("\n\n Train Online BTM ..")
    for i in range(0, len(biterms), 100): # prozess chunk of 200 texts
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)

#     print("\n\n Visualize Topics ..")
#     vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
#     pyLDAvis.save_html(vis, './vis/online_btm.html')  # path to output

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    print("\n\n Texts & Topics ..")
    for i in range(len(texts)):
        print("{} (topic: {})".format(texts[i], topics[i].argmax()))



 Train Online BTM ..


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:38<00:00,  1.85s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:50<00:00,  1.19it/s]




 Visualize Topics ..


ValidationError: 
 * Not all rows (distributions) in doc_topic_dists sum to 1.