In [1]:
import pandas as pd 
import re
import os
from pathlib import Path
import little_mallet_wrapper
import seaborn
import glob
import random
import nltk
from nltk.corpus import stopwords


# Location of the MALLET launcher that is handed to little_mallet_wrapper
# when the topic model is trained further down.
path_to_mallet = "/config/workspace/Dissertation/Dissertation/Data/mallet-2.0.8/bin/mallet"

# Show up to 100 characters of each cell when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 100)

In [2]:
# Raw tweet export; the cells below read its `date` and `tweet` columns.
data = pd.read_csv(filepath_or_buffer="3-18-21RedForEDtweetdata.csv")

### Data Preparation

In [3]:
# Parse the month/day/two-digit-year strings into real timestamps.
data['date'] = pd.to_datetime(data['date'], format='%m/%d/%y')

# Study window: 1 Jan 2018 through 31 Dec 2019.
start_date = pd.to_datetime('01/01/18', format='%m/%d/%y')
end_date = pd.to_datetime('12/31/19', format='%m/%d/%y')

# Both endpoints are inclusive. The dates carry no time component, so a
# strict `>` on the start bound would silently drop every tweet dated
# exactly 01/01/18 while tweets dated 12/31/19 were kept.
mask = (data['date'] >= start_date) & (data['date'] <= end_date)

# .copy() makes the filtered frame independent of the full one, so the
# column assignments in the following cells do not raise
# SettingWithCopyWarning.
data = data.loc[mask].copy()


In [4]:
def _scrub_tweet(text):
    """Return one tweet with mentions, links, punctuation and the campaign tag removed.

    The steps run in a fixed order: the @-mention and URL patterns need the
    punctuation that the third step strips, and the campaign-name patterns
    are lower-case so they run after lower-casing.
    """
    # @mentions, together with the single non-word character before them.
    text = re.sub(r'(^|[^@\w])@(\w{1,15})\b', '', text)
    # Anything that starts with "http" up to the next whitespace.
    text = re.sub(r"http\S+", '', text)
    # Every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    # The campaign name itself, in its one-word and spaced spellings.
    text = re.sub('redfored', '', text)
    return re.sub('red for ed', '', text)


data['tweet'] = data['tweet'].map(_scrub_tweet)

# Tweets whose cleaned text is identical are kept only once.
data = data.drop_duplicates(subset='tweet')

In [5]:
# NLTK's English stop-word list, extended with corpus-specific terms.
stop_words = stopwords.words('english')
stop_words.extend(["teacher", "teachers", "the"])

# One processed string per row of `data`, in row order.
# The str cast is applied to the column that is actually iterated; it was
# previously computed into `training_data` and then overwritten unused.
training_data = [
    little_mallet_wrapper.process_string(
        text,
        numbers='remove',
        stop_words=stop_words,
        remove_stop_words=True,
        remove_short_words=True,
    )
    for text in data['tweet'].astype(str)
]

### Train model (This takes ~4 minutes)

In [None]:
num_topics = 16

#Change to your desired output directory
output_directory_path = 'topic-model-output'

#No need to change anything below here
Path(output_directory_path).mkdir(parents=True, exist_ok=True)

# File locations inside the output directory; the model-specific ones carry
# the topic count as a suffix.
path_to_training_data = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model = f"{output_directory_path}/mallet.model.{num_topics}"
path_to_topic_keys = f"{output_directory_path}/mallet.topic_keys.{num_topics}"
path_to_topic_distributions = f"{output_directory_path}/mallet.topic_distributions.{num_topics}"

# Train, then read the per-document topic distributions back from disk.
little_mallet_wrapper.quick_train_topic_model(
    path_to_mallet,
    output_directory_path,
    num_topics,
    training_data,
)
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)

# Lookup from processed text to the cleaned tweet it was derived from.
# NOTE(review): tweets that reduce to the same processed string share one
# key, so only the last such tweet is retrievable.
tweet_dict = {
    processed: cleaned
    for processed, cleaned in zip(training_data, data['tweet'])
}

### Explore models

In [7]:
def display_top_tweets_per_topic(topic_number, number_of_documents):
    """Print a topic's key words followed by its top tweets.

    Args:
        topic_number: Index into the module-level `topics` list.
        number_of_documents: How many documents to request from
            `little_mallet_wrapper.get_top_docs`.

    Each tweet is printed with its topic probability rounded to four
    decimals. The tweet text is looked up in `tweet_dict` using the
    processed document as key.
    """
    print(f"✨Topic {topic_number}✨\n\n{topics[topic_number]}\n")

    top_docs = little_mallet_wrapper.get_top_docs(
        training_data,
        topic_distributions,
        topic_number,
        n=number_of_documents,
    )
    for score, processed_doc in top_docs:
        tweet_text = tweet_dict[processed_doc]
        print(round(score, 4), tweet_text + "\n")

from IPython.display import Markdown, display
import re

def display_bolded_topic_words_in_context(topic_number=3, number_of_documents=3, custom_words=None):
    """Render a topic's top tweets as Markdown with selected words bolded.

    Args:
        topic_number: Index into the module-level `topics` list.
        number_of_documents: How many documents to request from
            `little_mallet_wrapper.get_top_docs`.
        custom_words: Optional iterable of words to highlight. When None,
            the key words of the topic itself are highlighted.

    For each returned document the topic probability is displayed, then
    the tweet text (looked up in `tweet_dict`) with every whole-word,
    case-insensitive match of a highlighted word replaced by its bolded,
    upper-cased form.
    """
    print(f"✨Topic {topic_number}✨\n\n{topics[topic_number]}\n")

    # The word list does not depend on the document, so pick it once.
    words_to_bold = custom_words if custom_words is not None else topics[topic_number]

    for probability, document in little_mallet_wrapper.get_top_docs(training_data, topic_distributions, topic_number, n=number_of_documents):

        probability = f"✨✨✨\n\n**{probability}**"
        original_text = tweet_dict[document]
        original_text_lowered = original_text.lower()

        for word in words_to_bold:
            # Lower-case the word for the cheap pre-check so it agrees with
            # the case-insensitive substitution below.
            if word.lower() in original_text_lowered:
                # re.escape keeps regex metacharacters in a word literal.
                # The callable replacement keeps backslashes in the word
                # from being read as group references.
                original_text = re.sub(
                    rf"\b{re.escape(word)}\b",
                    lambda match, bolded=f"**{word.upper()}**": bolded,
                    original_text,
                    flags=re.I,
                )

        display(Markdown(probability))
        display(Markdown(original_text))
    return

### Display All Topics

In [None]:
# Read the key words of every topic from the trained model's output file.
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

# One heading per topic, followed by that topic's key words.
for topic_number, topic in enumerate(topics):
    heading = f"✨Topic {topic_number}✨"
    print(f"{heading}\n\n{topic}\n")

### Use these methods for exploring a single topic in greater detail

In [None]:
# Index of the topic to inspect.
topic_to_explore = 11

# Print the single top tweet, then render the top four with key words bolded.
display_top_tweets_per_topic(topic_number=topic_to_explore, number_of_documents=1)
display_bolded_topic_words_in_context(topic_to_explore, 4)


### Add time series data to topic model

In [10]:
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)

# The distributions are matched to tweets by position, so there must be
# exactly one per row of `data`.
if len(topic_distributions) != len(data):
    raise ValueError(
        f"Got {len(topic_distributions)} topic distributions for {len(data)} tweets; "
        "they cannot be matched row by row."
    )

# `data` was date-filtered and de-duplicated in earlier cells, so its index
# is no longer 0..n-1. pandas aligns on index labels when a Series is
# assigned to a column, so a Series with a default RangeIndex would attach
# distributions to the wrong tweets and leave others NaN. Reusing
# data.index pins each distribution to its row by position.
data['topic_distributions'] = pd.Series(topic_distributions, index=data.index)

# One column per topic, labelled with the topic's first four key words.
topic_distributions_df = pd.DataFrame(
    topic_distributions,
    index=data.index,
    columns=[" ".join(topic[:4]) for topic in topics],
)
data = pd.concat([data, topic_distributions_df], axis=1)

# Calendar helpers used for the monthly aggregation in the plots.
data['date'] = pd.to_datetime(data['date'])
data['year'] = pd.to_datetime(data['date'].dt.year, format='%Y')
data['year-month'] = data['date'].dt.to_period('M')
data['Date (by month)'] = [month.to_timestamp() for month in data['year-month']]

data = data.set_index('Date (by month)')

### Create a graph for each topic

In [None]:
# One line chart per topic: the mean topic probability of the tweets in
# each month of the index.
for topic_number in range(num_topics):
    topic_label = " ".join(topics[topic_number][:4])
    monthly_mean = data.groupby(data.index)[[topic_label]].mean()
    monthly_mean.plot(title='RedForEd Tweets By Topic', linewidth=3, color="red")