In [1]:
# Imports
import pandas as pd
import gensim
import numpy as np
import nltk
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt

import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import remove_stopwords
import pyLDAvis
import pyLDAvis.gensim

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

np.random.seed(2018) # set random seed


In [2]:
# Load data
datafolder = './data/'
tweets1 = pd.read_csv(datafolder + 'IRAhandle_tweets_1.csv')
tweets2 = pd.read_csv(datafolder + 'IRAhandle_tweets_2.csv')
tweets = pd.concat([tweets1, tweets2], axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)



In [3]:
# Function to perform processing after tweet datasets have been concatenated

def process_tweets(dataset, group_by = 'author', filter_language = 'English', extract_hashtags = True, filtersize = 3):
    """This function takes in the dataset in the format found directly from Kaggle. It filters by token length, 
    selectable language and has the option to extract hashtags into words. 
    
    The function returns the tokens as an array, with each 'sample' as a list of tokens written by a single 
    author. The function also returns the tags (hashtags) counts for each author
    
    The function takes in a parameter group_by that can be either 'hashtag' or 'author' (default). grouping by 
    hashtag retuns a dataset that has each row a concatenated number of tweets with that hashtag, while
    grouping by author concatenates all the tweets from the author"""
    import pandas as pd
    from nltk.corpus import wordnet as wn
    from collections import Counter

    pd.options.mode.chained_assignment = None  # default='warn', suppress the setting with copy warning

    # Filter for languages if true 
    if filter_language:
        # selecting content columns for subject categorization by language
        dataset = dataset[dataset.language == filter_language]
        cont = dataset.content
    else:
        cont = dataset.content
    
    content_filtered = cont.apply(lambda x: re.sub(r'http\S+', '', x)).apply(lambda x: re.sub(r"'|\"|`|:|\?|~|,|\.", '', x))\
                .apply(lambda x: remove_stopwords(x))


    # redefine content column for dataset
    dataset['content'] = content_filtered.values
    # Drop NaN values in content    
    dataset.dropna(axis=0,subset=['content'], inplace=True)
    # Concatenate tweets by user
    
    ##### GROUP BY AUTHOR ######
    if group_by == 'author':
        tweets_concatenated = dataset.groupby('author')['content'].apply(lambda x : x.sum()
                                                                    if x.dtype=='float64' else ' '.join(x))
        content = tweets_concatenated.copy()

        if extract_hashtags == True:

            def hashtagger(text):
                # Locate string of hashtag
                tags = re.findall(r'#\w+', text)
                tag_split = []
                for tag in tags:
                    no_hash = tag[1:]
                    tag_split.append(" ".join([a for a in re.split('([A-Z][a-z]+)', no_hash) if a])) # Separate out words    
                return ' '.join(tag_split) # return sentence
            def hashtag_counter(text):
                tags = re.findall(r'#\w+', text)
                tags = " ".join(tags)
                hashtag_count = Counter(tags.split())
                return hashtag_count



            # Count the hashtag frequency for each user
            hashtag_count = tweets_concatenated.apply(lambda x: hashtag_counter(x))

            # Extract words that are in hashtags
            hashtagged = tweets_concatenated.apply(lambda x: hashtagger(x))
            # Concatenate the words to the entire tweets
            hashtags_gone = hashtagged + tweets_concatenated
            # Remove hashtags since they are no longer needed and make all words lower case
            hashtags_gone = hashtags_gone.apply(lambda x: re.sub(r"#\w+", '', x)).apply(lambda x: x.lower())

            # Convert to NumPy array
            content = hashtags_gone.values

        # Get list of words that are stop words    
        en_stop = set(nltk.corpus.stopwords.words('english'))

        def get_lemma(word):
            '''Lemmatize the words into their roots'''
            lemma = wn.morphy(word)
            if lemma is None:
                return word
            else:
                return lemma


        content_tokens = [nltk.word_tokenize(x) for x in content]
        tokens = []
        for sublist in content_tokens:
            tokens.append([get_lemma(token) for token in sublist if token not in en_stop and len(token) > 3])

        return tokens, hashtag_count
    
    ##### GROUP BY HASHTAG ######
    
    if group_by == 'hashtag':
        
        # extract hashtag from each tweet and put it in a new column 
        def hashtag_counter(text):
            tags = re.findall(r'#\w+', text)
            tags = " ".join(tags)
            return tags
        
        hashtag_column = dataset['content'].apply(lambda x: hashtag_counter(x))
        df_hashtags = pd.concat([dataset['content'], hashtag_column], axis=1)
        df_hashtags.columns = ['content', 'hashtags']
        
        
        # make the series that has as the index values the hashtag and the column that has the concatenated 
        # tweets.
        tweets_concatenated = df_hashtags.groupby('hashtags')['content'].apply(lambda x : x.sum()
                                                                    if x.dtype=='float64' else ' '.join(x))
        # remove the hashtag shit
        hashtags_gone = tweets_concatenated.apply(lambda x: re.sub(r"#\w+", '', x)).apply(lambda x: x.lower())
        content = hashtags_gone.values
        # Get list of words that are stop words    
        en_stop = set(nltk.corpus.stopwords.words('english'))

        def get_lemma(word):
            '''Lemmatize the words into their roots'''
            lemma = wn.morphy(word)
            if lemma is None:
                return word
            else:
                return lemma


        content_tokens = [nltk.word_tokenize(x) for x in content]
        tokens = []
        for sublist in content_tokens:
            tokens.append([get_lemma(token) for token in sublist if token not in en_stop and len(token) > 3])
            
        return tweets_concatenated, tokens

In [4]:
data_hashtagged, tokens = process_tweets(tweets, group_by='hashtag')

In [18]:
ind_sport = data_hashtagged.index.get_loc("#sports")
ind_MAGA = data_hashtagged.index.get_loc('#MAGA')
ind_BlackLivesMatter = data_hashtagged.index.get_loc('#BlackLivesMatter')
ind_business = data_hashtagged.index.get_loc('#business')
ind_USFA = data_hashtagged.index.get_loc('#USFA')

# Concatenate the sublists into one to feed to the model
list_concat = []
list_concat.append(tokens[ind_sport])
list_concat.append(tokens[ind_MAGA])
list_concat.append(tokens[ind_BlackLivesMatter])
list_concat.append(tokens[ind_business])
list_concat.append(tokens[ind_USFA])
data_processed = list_concat.copy()

In [None]:
# Create dictionary of sum of frequencies across all author IDs
dict1 = dict(hashtags.sum())
# Export into pandas for ordering
hashtag_frequencies = pd.DataFrame.from_dict(dict1, orient='index', columns=['count'])
hashtag_frequencies.sort_values(['count'], ascending=False)

In [None]:
#sports, #MAGA, #BlackLivesMatter, #business, #USFA
# Load data
datafolder = './data/'
tweets1 = pd.read_csv(datafolder + 'IRAhandle_tweets_1.csv')
tweets2 = pd.read_csv(datafolder + 'IRAhandle_tweets_2.csv')
tweets = pd.concat([tweets1, tweets2], axis=0, join='outer', join_axes=None, ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)


In [19]:
# Create dictionary to know the frequency of words
id2word = gensim.corpora.Dictionary(data_processed)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_processed]


In [23]:
# Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=20, 
#                                            random_state=100,
#                                            update_every=1,
#                                            chunksize=100,
#                                            passes=10,
#                                            alpha='auto',
#                                            per_word_topics=True)

lda_model = gensim.models.LdaMulticore(corpus = corpus,
                                       num_topics=6,
                                       id2word=id2word,
                                       chunksize=2000,
                                       passes=10, 
                                       per_word_topics=False,
                                       workers=2)

In [24]:
print(lda_model.print_topics())

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_processed, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

[(0, '0.008*"say" + 0.006*"latest" + 0.005*"saint" + 0.005*"orleans" + 0.005*"game" + 0.004*"report" + 0.004*"warrior" + 0.004*"deal" + 0.004*"state" + 0.003*"stocks"'), (1, '0.107*"enlist" + 0.019*"patriot" + 0.015*"america" + 0.014*"join" + 0.013*"stand" + 0.009*"need" + 0.009*"people" + 0.009*"read" + 0.008*"socialist" + 0.007*"time"'), (2, '0.026*"black" + 0.012*"police" + 0.012*"people" + 0.011*"movement" + 0.009*"white" + 0.008*"life" + 0.007*"trump" + 0.006*"call" + 0.006*"jackson" + 0.006*"woman"'), (3, '0.018*"saint" + 0.013*"game" + 0.013*"warrior" + 0.012*"orleans" + 0.007*"pelican" + 0.006*"draft" + 0.006*"2016" + 0.006*"coach" + 0.006*"final" + 0.006*"player"'), (4, '0.000*"enlist" + 0.000*"retweet" + 0.000*"rt_america" + 0.000*"saint" + 0.000*"game" + 0.000*"america" + 0.000*"orleans" + 0.000*"say" + 0.000*"warrior" + 0.000*"trump"'), (5, '0.075*"retweet" + 0.075*"rt_america" + 0.016*"trump" + 0.009*"realdonaldtrump" + 0.007*"president" + 0.005*"potus" + 0.004*"america" +

In [25]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
mallet_path = '/Users/halimaschede/Documents/GitHub/Adarmonic_mean_project/data/mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=15, id2word=id2word)

# Show Topics
print(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=content_stemmed, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


In [None]:
# Check shit is working
test = bow_corpus[126]
for i in range(len(test)):
    print("Word {} (\"{}\") appears {} time.".format(test[i][0],dictionary[test[i][0]], 
test[i][1]))

In [None]:
# Create model on bag-of-words

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
print(lda_model.show_topic(0))
# for topic in range(10):
#     plt.figure()
#     plt.imshow(WordCloud().fit_words(lda_model.show_topic(topic)))
#     plt.axis("off")
#     plt.title("Topic #" + str(topic))
#     plt.show()
