In [1]:
import pandas as pd
import numpy as np

tweet_df = pd.read_pickle('../data/agg_trans_tweets.pickle.gz', compression='gzip')
tweet_df['i'] = list(range(len(tweet_df)))
tweet_df.set_index('date', inplace=True)
tweet_df

Unnamed: 0_level_0,tweet,author,id,i
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-27 21:45:50,@Alicoh1 @qwosl @kdqd3 @HotlineJosh But strang...,Platonista1,1387161089121701892,0
2021-04-27 21:45:49,RT @DavidPaisley: Not only do we export our tr...,daniellismore,1387161086479241218,1
2021-04-27 21:45:45,RT @vickiringer: NC’s bathroom bill discrimina...,CherieAnne,1387161067000840198,2
2021-04-27 21:45:40,@yelloweycat @Mushr00mSteve @BaddestJeanette A...,encyclopath,1387161048352841728,3
2021-04-27 21:45:40,@thehill This CEO is fired for belittling a gu...,rynob11,1387161046612336641,4
...,...,...,...,...
2021-04-20 08:31:11,@idkwtwhnid @GhostlyWeevil @GayestFesh @bigges...,natleah_,1384424393124761600,303843
2021-04-20 08:30:31,RT @HANNAHLUVSDSGC: north carolina wants to pa...,taoIings,1384424225113624578,303844
2021-04-20 08:30:31,RT @mcclure111: If you live in a country where...,Angstlers1,1384424224715067394,303845
2021-04-20 08:30:08,"RT @Native_Orchid: Hey, y'all, please:\n\nIn d...",ExhaustedIsaac,1384424128816443392,303846


## LDA Performance

How well does LDA perform? First we must tokenize and vectorize all of our tweets.

In [2]:
import sklearn
import gzip
import pickle
import sys
import os
sys.path.append("..")
import utils.tokens as ut

stopwords = ut.build_stopword_set()

def tokenizer(tweet):
    return ut.tokenize_tweet(tweet, stopwords)

def autopickle(path):
    """Helper utility to either load a model from file, or store a newly-trained one."""
    def decorator(func):
        if os.path.exists(path):
            with gzip.open(path, 'rb') as file:
                model = pickle.load(file)
        else:
            model = func()
            with gzip.open(path, 'wb') as file:
                pickle.dump(model, file)
        return model
    return decorator
        
@autopickle('../models/gs-tfidf.pickle.gz')
def vectorizer():
    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(tokenizer=tokenizer)
    vectorizer.fit(tweet_df['tweet'])
    return vectorizer

[nltk_data] Downloading package stopwords to /home/astrid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/astrid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Next, we will split our tweets into a few different datasets.

In [3]:
N_HYPER_TUNING = 20_000
N_TEST = 50_000

shuffled = tweet_df.sample(frac=1)
df_hyper_tuning = shuffled.iloc[:N_HYPER_TUNING]
df_training = shuffled.iloc[N_HYPER_TUNING:-N_TEST]
df_test = shuffled.iloc[:-N_TEST]
del shuffled  # save memory

print(f'Hyperparameter Tuning: {len(df_hyper_tuning)}')
print(f'Training: {len(df_training)}')
print(f'Test: {len(df_test)}')

Hyperparameter Tuning: 20000
Training: 233848
Test: 253848


Let's run a LDA on a small sample of the training data first to see what it looks like.

In [4]:
@autopickle('../models/gs-sample-lda.pickle.gz')
def sample_lda():
    lda = sklearn.decomposition.LatentDirichletAllocation(
        n_components=20,
        max_iter=15,
        n_jobs=-1,  # all processors
    )
    lda.fit(vectorizer.transform(df_training['tweet'].sample(50_000)))
    return lda

In [5]:
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(sample_lda, vectorizer.transform(df_test['tweet']), vectorizer, mds='tsne')



Although the results are not too bad, they can definitely be improved.

### Hyperparameter Tuning

We want to find better hyperparameters for our dataset. One way to do this is with grid search. First, let's see what parameters we have available to us for tuning.

In [6]:
sklearn.decomposition.LatentDirichletAllocation().get_params().keys()

  and should_run_async(code)


dict_keys(['batch_size', 'doc_topic_prior', 'evaluate_every', 'learning_decay', 'learning_method', 'learning_offset', 'max_doc_update_iter', 'max_iter', 'mean_change_tol', 'n_components', 'n_jobs', 'perp_tol', 'random_state', 'topic_word_prior', 'total_samples', 'verbose'])

Now, we will run our grid search on our small hyperparameter tuning dataset.

In [7]:
@autopickle('../models/gs-gridsearch-lda.pickle.gz')
def grid_search_lda():
    gs = sklearn.model_selection.GridSearchCV(
        estimator=sklearn.decomposition.LatentDirichletAllocation(
            n_jobs=-1,
        ),
        param_grid={
            'n_components': [10, 20, 30], 
            'doc_topic_prior': [1/40, 1/30, 1/20, 1/10],
            'topic_word_prior': [1/40, 1/30, 1/20, 1/10],
        },
        verbose=2,
    )
    vectorized_train = vectorizer.transform(df_hyper_tuning['tweet'])
    gs.fit(vectorized_train)
    return gs

grid_search_lda.best_params_

  and should_run_async(code)


{'doc_topic_prior': 0.1, 'n_components': 10, 'topic_word_prior': 0.1}

The first grid search was fairly coarse. Let's try narrowing our grid search range.

In [10]:
@autopickle('../models/gs-gridsearch2-lda.pickle.gz')
def grid_search_lda_2():
    bp = grid_search_lda.best_params_
    gs = sklearn.model_selection.GridSearchCV(
        estimator=sklearn.decomposition.LatentDirichletAllocation(
            n_jobs=-1,
            random_state=1,
        ),
        param_grid={
            'n_components': [bp['n_components'] + dn for dn in [-2, 1, 0, 1, 2]], 
            'doc_topic_prior': [1 / (1 / bp['doc_topic_prior'] + dn) for dn in [-2, 0, 2]],
            'topic_word_prior': [1 / (1 / bp['topic_word_prior'] + dn) for dn in [-2, 0, 2]],
        },
        verbose=2,
    )
    vectorized_train = vectorizer.transform(df_hyper_tuning['tweet'])
    gs.fit(vectorized_train)
    return gs

grid_search_lda_2.best_params_

  and should_run_async(code)


{'doc_topic_prior': 0.125, 'n_components': 8, 'topic_word_prior': 0.125}

Finally, we will run the best parameters on the full dataset.

In [11]:
@autopickle('../models/gs-hpt-lda.pickle.gz')
def hp_tuned_lda():
    lda = sklearn.decomposition.LatentDirichletAllocation(
        **grid_search_lda_2.best_params_,
        n_jobs=-1,
    )
    lda.fit(vectorizer.transform(df_training['tweet']))
    return lda

  and should_run_async(code)


In [13]:
pyLDAvis.sklearn.prepare(hp_tuned_lda, vectorizer.transform(df_training['tweet']), vectorizer, mds='tsne')

  and should_run_async(code)


In [None]:
hp_tuned_lda.perplexity(vectorizer.transform(df_test['tweet']))

In [None]:
hp_tuned_lda.perplexity(vectorizer.transform(df_training['tweet']))