In [1]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import matutils, corpora, models
import gensim
import pandas as pd
pd.options.mode.chained_assignment = None
import string
import numpy as np
import pyLDAvis.gensim as gensimvis
import pyLDAvis

import time
import re
import os
import json
import subprocess
import csv

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
init_notebook_mode(connected=True)

# Tokenizers
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize import RegexpTokenizer

# Set Notebook Parameters
Here we set variables necessary for pre-processing.

In [2]:
# Tokenizer applied to every tweet. The flags ask TweetTokenizer not to
# preserve case, to strip @handles, and to reduce lengthened words.
# Alternative: tokenizer = RegexpTokenizer(r'\w+')
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# Per-language stoplists from the stop_words package.
stop_words_en = get_stop_words('en')       # English stoplist
stop_words_sp = get_stop_words('spanish')  # Spanish stoplist

# Combined stoplist: every English entry first, then every Spanish entry.
stop_words = [*stop_words_en, *stop_words_sp]


# Get Data

Twitter prohibits us from sharing the raw tweets, and Russell's raw data is not publicly available at this time (we received Russell's raw data via private email). As such, although we include for reference all of the code we used to prepare the data for building and applying our LDA model, we have commented out the lines that require the raw datasets.

The processed data that we used to build the model can be found at:

    '../../../../data/unsupervised/model_source/congress115_preprocessed.csv.gz'
    
The processed data that we used to test the model can be found at:

    '../../../../data/unsupervised/model_source/russell_preprocessed.csv.gz'

In [3]:
## data we should use to build the model


# data_file = 'RAWDATA_congress' 
# tweets = pd.read_csv(data_file, compression='gzip', dtype=str)

## data we should use to apply the model (complete set of Russell's labeled tweets)
# data_file_r = 'RAWDATA_russell'
# russell_tweets = pd.read_csv(data_file_r, compression='gzip', dtype=str)

# Prep Data for Modeling

Tokenize data using one of the following tokenizers:
* [TweetTokenizer](http://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.casual)
* [RegexpTokenizer](http://www.nltk.org/api/nltk.tokenize.html?highlight=regexp#module-nltk.tokenize.regexp)

With some help from [Libelli](https://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html).

In [3]:
def process_training_data(tweets):
    """Select the tweets used to train the LDA model.

    A row is dropped when its ``text`` starts with ``'RT'`` or ``'nan'``,
    or when ``text`` is missing (NaN). All other rows are kept.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a string ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The kept rows, with an extra integer ``original`` column (equal to 1
        for every returned row). The input frame is not modified.
    """
    text = tweets['text']

    # na=True states explicitly that missing text counts as "not original".
    # Two single-prefix calls are used instead of one tuple argument.
    is_excluded = (
        text.str.startswith('RT', na=True)
        | text.str.startswith('nan', na=True)
    )

    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain an 'original' column.
    flagged = tweets.assign(original=np.where(is_excluded, 0, 1))

    return flagged[flagged['original'] == 1]

def process_inference_data(tweets):
    """Select the rows a trained LDA model should be applied to.

    Keeps the rows whose ``RT`` value, converted to float, equals 0
    (presumably the non-retweets; confirm against the dataset's codebook).

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with an ``RT`` column whose values can be cast to float.

    Returns
    -------
    pandas.DataFrame
        The kept rows, with ``RT`` stored as float. The input frame is not
        modified.
    """
    # Cast on a copy: flags loaded as strings ('0', '0.0') compare
    # numerically, and the caller's column keeps its original dtype.
    converted = tweets.assign(RT=tweets['RT'].astype(float))

    return converted[converted['RT'] == 0]

In [4]:
def preprocess(df, stop_words, tokenizer, text_column, id_column, userid_column, output_csv):
    """Tokenize and clean every document in ``df`` and save the result.

    For each document the steps are: remove URLs, tokenize, strip
    punctuation from each token, then keep only tokens that are at least
    three characters long, purely alphabetic, and not stop words.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame holding the text and identifier columns.
    stop_words : iterable of str
        Tokens to discard.
    tokenizer : object
        Anything exposing ``tokenize(str) -> list of str``.
    text_column, id_column, userid_column : str
        Names of the columns in ``df`` holding the document text, the
        document id and the author id.
    output_csv : str
        Path of the gzip-compressed CSV to write. Its columns are
        ``id_str``, ``user_id`` and ``preprocessed_text``.

    Returns
    -------
    list of list of str
        One cleaned token list per row of ``df``, in row order.
    """
    # Loop-invariant helpers, built once rather than once per document.
    punct_table = str.maketrans('', '', string.punctuation)
    stop_set = set(stop_words)  # constant-time membership checks
    # Matches a URL anywhere in the text and stops at the next whitespace.
    # The previous pattern was anchored to the start of a line and then
    # consumed the remainder of that line, so mid-tweet URLs were kept and
    # text following a leading URL was thrown away.
    url_pattern = re.compile(r'https?://\S+')

    texts = []

    for doc in df[text_column]:
        # Missing or non-string text becomes an empty document, so the output
        # stays aligned row-for-row with df instead of raising in re.
        if not isinstance(doc, str):
            doc = ''

        # remove URLs
        doc = url_pattern.sub('', doc)

        # tokenize, then remove punctuation from each token
        stripped = (tok.translate(punct_table) for tok in tokenizer.tokenize(doc))

        # keep tokens of length 3 or more that are alphabetic and not stop words
        texts.append([
            tok for tok in stripped
            if len(tok) > 2 and tok.isalpha() and tok not in stop_set
        ])

    new_df = pd.DataFrame()
    new_df['id_str'] = df[id_column]
    new_df['user_id'] = df[userid_column]
    new_df['preprocessed_text'] = texts

    new_df.to_csv(output_csv, index=False, compression='gzip')

    return texts

In [68]:
## prepping data to build the model; commented out since we do not include
## raw data in the public repo, so the functions will not run
# original_tweets = process_training_data(tweets)
# texts = preprocess(original_tweets, stop_words, tokenizer,
#                    'full_text', 'id_str', 'user.id_str', 
#                    '../../../../data/unsupervised/model_source/congress115_preprocessed.csv.gz')

## prepping data to apply the model
# original_tweets_r = process_inference_data(russell_tweets)
# texts = preprocess(original_tweets_r, stop_words, tokenizer, 'text', 'KEYID', 'Name',
#                   '../../../../data/unsupervised/model_source/russell_preprocessed.csv.gz')

# Command Line Model Training

Navigate to the appropriate directory:

    cd ~/Desktop/icwsm-2019/external_respositories/mallet-2.0.8
 

Set up corpus upon which to train the model using the preprocessed tokens made in the preceding cell of this notebook:

    ./bin/mallet import-file --input ../../data/unsupervised/model_source/congress115_preprocessed.csv.gz --keep-sequence --output ../../data/unsupervised/model_source/congress115_train.mallet


Train the model:

    ./bin/mallet train-topics --input ../../data/unsupervised/model_source/congress115_train.mallet --inferencer-filename ../../data/unsupervised/model_output/lda_tw_50_unigram_congress115_inferencer_.mallet --num-topics 50 --output-topic-keys ../../data/unsupervised/model_output/congress115_topics_FULL.txt --optimize-interval 10 --output-model ../../data/unsupervised/model_output/lda_tw_50_unigram_congress115.model --output-state ../../data/unsupervised/model_output/congress115_topic_state_50.gz --output-topic-keys ../../data/unsupervised/model_output/congress115_keys_50.txt --output-doc-topics ../../data/unsupervised/model_output/congress115_compostion_50.txt


Open the file "congress115_keys.txt" in Excel and create three columns: "ams label", "policy codebook label", and "policy codebook book". The first column is my (ams) initial manual labeling attempt. The second column is the corresponding policy codebook label if available; if not available, I (ams) come up with a new label. The third column is the corresponding policy codebook code number if available; if not available, I (ams) come up with a new code number.


# Command Line Topic Inference

Navigate to the appropriate directory:

    cd ~/Desktop/icwsm-2019/external_respositories/mallet-2.0.8
 

Using corpus created in training of the model to ensure compatibility of new dataset (i.e. '--use-pipe-from' field), import documents into mallet format:

    ./bin/mallet import-file --input ../../data/unsupervised/model_source/russell_preprocessed.csv.gz --keep-sequence --output ../../data/unsupervised/model_source/russell_infer.mallet --use-pipe-from ../../data/unsupervised/model_source/congress115_train.mallet

Using the appropriate model inferencer (i.e. the '--inferencer' field), infer the topic distribution (i.e. the '--output-doc-topics' field) for a set of documents prepared in the preceding step (i.e. the '--input' field):

    ./bin/mallet infer-topics --input ../../data/unsupervised/model_source/russell_infer.mallet --inferencer ../../models/best/lda/lda_tw_50_unigram_congress115_inferencer.mallet --output-doc-topics ../../data/unsupervised/model_output/russell-topic-composition.txt

