## Set up

Load libraries and data.

In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pickle
import scipy.sparse

In [2]:
# function to load datasets
def load_tweet_df(dataset_name):
    """
    Read one split of the supervised tweet data into a dataframe.

    The file is expected at
    ../data/derived/tweets_supervised_<dataset_name>.csv, resolved relative
    to the current working directory.

    Parameters
    ----------
    dataset_name : str
        Split name that completes the filename (e.g. 'train', 'dev', 'test').

    Returns
    -------
    pandas.DataFrame
        The CSV contents, parsed by pandas.read_csv with its default options.
    """
    return pd.read_csv(f'../data/derived/tweets_supervised_{dataset_name}.csv')

In [3]:
# load datasets: one dataframe per split, each read by load_tweet_df
# from ../data/derived/tweets_supervised_<split>.csv
train_df = load_tweet_df('train')
dev_df   = load_tweet_df('dev')
test_df  = load_tweet_df('test')

Confirm datasets have expected size and format.

In [4]:
# print the number of rows in each split, in train / dev / test order
# (the loop variable `df` stays bound to test_df after the loop finishes)
for df in [train_df, dev_df, test_df]:
    print(f'{len(df)} records.')

6497 records.
1393 records.
1396 records.


In [5]:
# show 3 randomly chosen training rows; no random_state is passed,
# so a different set of rows is displayed on each run
train_df.sample(3)

Unnamed: 0,tweet_id,text,label
1202,598502011269554176,Twitter is more likely to act on threats to co...,0
121,599285769727938560,@Claire_Medeiros i always tweet my location.,0
1521,596593444765573120,@ashleylynch me too!,0


In [6]:
# show 3 randomly chosen dev rows; no random_state is passed,
# so a different set of rows is displayed on each run
dev_df.sample(3)

Unnamed: 0,tweet_id,text,label
343,564691340013740032,@Transition @GlennF @Spacekatgal that's the one.,0
1352,839855905902559238,RT @LIMMediaGroup: Happy #INTERNATIONALWOMENSD...,1
560,599229500849684481,@ginmarrienne can you keep me updated? Links t...,0


In [7]:
# show 3 randomly chosen test rows; no random_state is passed,
# so a different set of rows is displayed on each run
test_df.sample(3)

Unnamed: 0,tweet_id,text,label
57,596379188635930624,RT @petfish: .@freebsdgirl I feel I should als...,0
207,568136950227419137,NPR asked to speak to me about the wadhwa thin...,0
481,596949842661773313,@hypatiadotca you should see the shoes that @s...,0


In [8]:
# set random seed for singular value decomposition
# (handed to train_lsi_transformer, which passes it to TruncatedSVD as random_state)
random_seed = 466

## Tokenize text

Here, I lowercase and tokenize the text with NLTK's TweetTokenizer. In reality, this step of the analysis is done as part of vectorizing the data, but I added it here manually to explore the tokens created.

In [9]:
def tokenize_tweet_text(df, dataset_name):
    """
    Add a tokenized_text column (lowercased text tokenized by NLTK's
    TweetTokenizer) and write the result to CSV.

    The input dataframe is left untouched: the tokens are attached to a new
    dataframe, which is both saved and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'tweet_id', 'text' and 'label'. Every value
        in 'text' must be a string, because .lower() is called on it.
    dataset_name : str
        Split name used in the output filename
        ../data/derived/tweets_supervised_<dataset_name>_tokens.csv

    Returns
    -------
    pandas.DataFrame
        New dataframe with exactly the columns
        ['tweet_id', 'text', 'tokenized_text', 'label'], in that order.
    """

    # initialize tokenizer
    tokenizer = TweetTokenizer()
    # lowercase each tweet, then split it into a list of tokens
    tokens = [tokenizer.tokenize(t.lower()) for t in df['text']]
    # assign() builds a new dataframe, so the caller's dataframe is not mutated;
    # the column selection drops any other columns and fixes the column order
    tokenized_df = df.assign(tokenized_text=tokens)[['tweet_id', 'text', 'tokenized_text', 'label']]
    # write to CSV (without the index)
    filepath_out = f'../data/derived/tweets_supervised_{dataset_name}_tokens.csv'
    tokenized_df.to_csv(path_or_buf=filepath_out, index=False)

    return tokenized_df

In [10]:
# tokenize text in datasets; each name is rebound to the dataframe returned by
# tokenize_tweet_text, which also writes a *_tokens.csv file for the split
train_df = tokenize_tweet_text(train_df, 'train')
dev_df   = tokenize_tweet_text(dev_df,   'dev')
test_df  = tokenize_tweet_text(test_df,  'test')

Confirm tokens match expectation.

In [11]:
# preview dataframe: first rows of the training split, now with tokenized_text
train_df.head()

Unnamed: 0,tweet_id,text,tokenized_text,label
0,573246882501148672,RT @mercurypixel: @freebsdgirl At this point.....,"[rt, @mercurypixel, :, @freebsdgirl, at, this,...",0
1,569200611415035904,plz stop posting pics of me that i posted a fe...,"[plz, stop, posting, pics, of, me, that, i, po...",0
2,564568918321147905,Been doing things. Going to have some pretty a...,"[been, doing, things, ., going, to, have, some...",0
3,573222765957816320,"hello grafana/graphite/statsd server, let's se...","[hello, grafana, /, graphite, /, statsd, serve...",0
4,564632454179213312,@sschinke @TsundereRager previous versions of ...,"[@sschinke, @tsundererager, previous, versions...",0


## Bag of words representation

Here, I generate a bag of words representation of each tweet using sklearn's CountVectorizer on the tokenized tweet text. I do not remove stopwords, or remove words with a cutoff for high or low document frequency.

In [12]:
def train_count_vectorizer(df, vectorizer_name):
    """
    Fit sklearn's CountVectorizer on raw tweet text, tokenizing with NLTK's
    TweetTokenizer, and pickle the fitted vectorizer.

    Only the tokenizer is overridden; every other CountVectorizer option is
    left at its default.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose 'text' column holds the documents to fit on.
    vectorizer_name : str
        Name used in the output filename
        ../data/derived/models/vectorizer_<vectorizer_name>.pkl

    Returns
    -------
    CountVectorizer
        The fitted vectorizer.
    """

    # initialize vectorizer with the tweet-aware tokenizer
    vectorizer = CountVectorizer(tokenizer = TweetTokenizer().tokenize)
    # fit vectorizer to text
    vectorizer.fit(df['text'])

    # write to file; the context manager closes the handle even if pickling fails
    filepath_out = f'../data/derived/models/vectorizer_{vectorizer_name}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(vectorizer, file_out)

    return vectorizer

def count_vectorize(count_vectorizer, df, vectorizer_name, dataset_name):
    """
    Convert the 'text' column of a dataframe into count vectors with an
    already-fitted vectorizer, and save them as a sparse .npz file.

    Parameters
    ----------
    count_vectorizer : object with a transform() method
        Fitted vectorizer applied to df['text'].
    df : pandas.DataFrame
        Dataframe whose 'text' column is transformed.
    vectorizer_name, dataset_name : str
        Used in the output filename
        ../data/derived/vectors/vector<vectorizer_name>_<dataset_name>.npz

    Returns
    -------
    The matrix returned by count_vectorizer.transform.
    """

    # destination of the saved matrix
    npz_path = f'../data/derived/vectors/vector{vectorizer_name}_{dataset_name}.npz'

    # vectorize the raw text, then persist the result
    count_matrix = count_vectorizer.transform(df['text'])
    scipy.sparse.save_npz(npz_path, count_matrix)

    return count_matrix

In [13]:
# train vectorizer (fit on the training split only)
count_vectorizer = train_count_vectorizer(train_df, 'count')

# transform data into count vectors (the same fitted vectorizer is applied to every split)
train_count_vectors = count_vectorize(count_vectorizer, train_df, 'count', 'train')
dev_count_vectors   = count_vectorize(count_vectorizer, dev_df,   'count', 'dev')
test_count_vectors  = count_vectorize(count_vectorizer, test_df,  'count', 'test')

# confirm vectors have expected format and shape
for vectors in [train_count_vectors, dev_count_vectors, test_count_vectors]:
    print(f'{vectors.shape}    {type(vectors)}')

(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1396, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


## TF-IDF representation

Here, I generate a term frequency-inverse document frequency representation of each tweet using sklearn's TfidfVectorizer on the tokenized tweet text. I do not remove stopwords, or remove words with a cutoff for high or low document frequency.

In [14]:
def train_tfidf_vectorizer(df, vectorizer_name):
    """
    Fit sklearn's TfidfVectorizer on raw tweet text, tokenizing with NLTK's
    TweetTokenizer, and pickle the fitted vectorizer.

    Only the tokenizer is overridden; every other TfidfVectorizer option is
    left at its default.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose 'text' column holds the documents to fit on.
    vectorizer_name : str
        Name used in the output filename
        ../data/derived/models/vectorizer_<vectorizer_name>.pkl

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer.
    """

    # initialize vectorizer with the tweet-aware tokenizer
    vectorizer = TfidfVectorizer(tokenizer = TweetTokenizer().tokenize)
    # fit vectorizer to text
    vectorizer.fit(df['text'])

    # write to file; the context manager closes the handle even if pickling fails
    filepath_out = f'../data/derived/models/vectorizer_{vectorizer_name}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(vectorizer, file_out)

    return vectorizer

def tfidf_vectorize(tfidf_vectorizer, df, vectorizer_name, dataset_name):
    """
    Convert the 'text' column of a dataframe into TF-IDF vectors with an
    already-fitted vectorizer, and save them as a sparse .npz file.

    Parameters
    ----------
    tfidf_vectorizer : object with a transform() method
        Fitted vectorizer applied to df['text'].
    df : pandas.DataFrame
        Dataframe whose 'text' column is transformed.
    vectorizer_name, dataset_name : str
        Used in the output filename
        ../data/derived/vectors/vector<vectorizer_name>_<dataset_name>.npz

    Returns
    -------
    The matrix returned by tfidf_vectorizer.transform.
    """

    # destination of the saved matrix
    npz_path = f'../data/derived/vectors/vector{vectorizer_name}_{dataset_name}.npz'

    # vectorize the raw text, then persist the result
    tfidf_matrix = tfidf_vectorizer.transform(df['text'])
    scipy.sparse.save_npz(npz_path, tfidf_matrix)

    return tfidf_matrix

In [15]:
# train vectorizer (fit on the training split only)
tfidf_vectorizer = train_tfidf_vectorizer(train_df, 'tfidf')

# transform data into TF-IDF vectors (the same fitted vectorizer is applied to every split)
train_tfidf_vectors = tfidf_vectorize(tfidf_vectorizer, train_df, 'tfidf', 'train')
dev_tfidf_vectors   = tfidf_vectorize(tfidf_vectorizer, dev_df,   'tfidf', 'dev')
test_tfidf_vectors  = tfidf_vectorize(tfidf_vectorizer, test_df,  'tfidf', 'test')

# confirm vectors have expected format and shape
for vectors in [train_tfidf_vectors, dev_tfidf_vectors, test_tfidf_vectors]:
    print(f'{vectors.shape}    {type(vectors)}')

(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1396, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


## LSI representation

Here, I use the TF-IDF vectors as input to a truncated singular value decomposition to create a latent semantic indexing (LSI) representation. LSI is a dimensionality reduction technique that deals with synonymy - words that have similar meanings are combined into a single feature. I will fit LSI with four numbers of components (5, 10, 50, and 100), all fewer than the number of features in the original vocabulary.

In [16]:
def train_lsi_transformer(tfidf_vectors, n_components, random_seed):
    """
    Fit sklearn's TruncatedSVD on TF-IDF vectors (the LSI transformer) and
    pickle the fitted transformer.

    Parameters
    ----------
    tfidf_vectors : matrix
        TF-IDF vectors to fit the decomposition on.
    n_components : int
        Number of components to keep; also used in the output filename
        ../data/derived/models/transformer_lsi<n_components>.pkl
    random_seed : int
        Passed to TruncatedSVD as random_state.

    Returns
    -------
    TruncatedSVD
        The fitted transformer.
    """

    # initialize transformer
    transformer = TruncatedSVD(n_components = n_components, random_state = random_seed)
    # fit transformer to TF-IDF vectors
    transformer.fit(tfidf_vectors)

    # write to file; the context manager closes the handle even if pickling fails
    filepath_out = f'../data/derived/models/transformer_lsi{n_components}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(transformer, file_out)

    return transformer

In [17]:
def lsi_transform(transformer, tfidf_vectors, dataset_name):
    """
    Project TF-IDF vectors into LSI space with an already-fitted transformer
    and save the result as a comma-separated text file.

    Parameters
    ----------
    transformer : object with transform() and an n_components attribute
        Fitted transformer applied to tfidf_vectors.
    tfidf_vectors : matrix
        TF-IDF vectors to project.
    dataset_name : str
        Used, together with transformer.n_components, in the output filename
        ../data/derived/vectors/vectorlsi<n_components>_<dataset_name>.csv

    Returns
    -------
    The array returned by transformer.transform.
    """

    # reduce the TF-IDF vectors to the transformer's components
    reduced = transformer.transform(tfidf_vectors)

    # persist the reduced vectors as CSV
    csv_path = f'../data/derived/vectors/vectorlsi{transformer.n_components}_{dataset_name}.csv'
    np.savetxt(csv_path, reduced, delimiter=',')

    return reduced

In [18]:
# iterate over components hyperparameter
# (the *_lsi_vectors names are rebound on every pass, so after the loop they
# hold the result for the last n_components; each pass saves its own files)
for n_components in [5, 10, 50, 100]:
    
    # train transformer (fit on the training TF-IDF vectors only)
    lsi_transformer = train_lsi_transformer(train_tfidf_vectors, n_components, random_seed)

    # transform TF-IDF vectors into LSI vectors
    train_lsi_vectors = lsi_transform(lsi_transformer, train_tfidf_vectors, 'train')
    dev_lsi_vectors   = lsi_transform(lsi_transformer, dev_tfidf_vectors,   'dev')
    test_lsi_vectors  = lsi_transform(lsi_transformer, test_tfidf_vectors,  'test')

    # confirm vectors have expected format and shape
    for vectors in [train_lsi_vectors, dev_lsi_vectors, test_lsi_vectors]:
        print(f'{vectors.shape}    {type(vectors)}')

(6497, 5)    <class 'numpy.ndarray'>
(1393, 5)    <class 'numpy.ndarray'>
(1396, 5)    <class 'numpy.ndarray'>
(6497, 10)    <class 'numpy.ndarray'>
(1393, 10)    <class 'numpy.ndarray'>
(1396, 10)    <class 'numpy.ndarray'>
(6497, 50)    <class 'numpy.ndarray'>
(1393, 50)    <class 'numpy.ndarray'>
(1396, 50)    <class 'numpy.ndarray'>
(6497, 100)    <class 'numpy.ndarray'>
(1393, 100)    <class 'numpy.ndarray'>
(1396, 100)    <class 'numpy.ndarray'>
