## Set up

Load libraries and data.

In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import scipy.sparse

In [2]:
# read the train / dev / test splits, in that order, from the derived-data folder
train_df, dev_df, test_df = map(pd.read_csv, (
    '../data/derived/tweets_supervised_train.csv',
    '../data/derived/tweets_supervised_dev.csv',
    '../data/derived/tweets_supervised_test.csv',
))

Confirm datasets have expected size and format.

In [3]:
# report the number of rows in each split (train, dev, test)
for split_df in (train_df, dev_df, test_df):
    print(f'{split_df.shape[0]} records.')

6497 records.
1393 records.
1396 records.


In [4]:
# fixed seed so the previewed rows are the same on every Restart & Run All
train_df.sample(3, random_state=0)

Unnamed: 0,tweet_id,text,label
3807,575429403653640192,@ksorbs ðŸ˜‚ðŸ˜†ðŸ˜‚ #classic #coon... As pawpaw would ...,0
2050,568429176576872448,@Transition I had a monoprice mechanical keybo...,0
151,568302340156973056,I've had a macbook survive a 90+ mph motorcycl...,0


In [5]:
# fixed seed so the previewed rows are the same on every Restart & Run All
dev_df.sample(3, random_state=0)

Unnamed: 0,tweet_id,text,label
413,567613868298358784,@hridaybala I don't read breitbart ;),0
1306,839845268015071234,RT @iamshehnaaz: Best msg of the day:\n\nWitho...,1
1190,563510149675892736,"@jsydneym We found out quickly most of these ""...",1


In [6]:
# fixed seed so the previewed rows are the same on every Restart & Run All
test_df.sample(3, random_state=0)

Unnamed: 0,tweet_id,text,label
1394,838237650968592384,@Ddebn8R @Lester_Bikila @VABVOX \nIf a man mar...,1
173,576239908781629440,RT @jilevin: Here's where Rush Limbaugh is wro...,0
1261,534727955264847872,RT @andythewookie1: @YesYoureSexist your right...,1


## Tokenize text

Here, I lowercase and tokenize the text with NLTK's TweetTokenizer. In the actual pipeline, this step happens inside the vectorizer (see "Train count vectorizer" below); I run it separately here only to explore the tokens it creates.

In [7]:
def tokenize_tweet_text(df, tokenizer=None):
    """
    Return a new dataframe with a tokenized text column added.

    Each value of the 'text' column is lowercased and then tokenized; the
    resulting token lists are stored in a 'tokenized_text' column. The input
    dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'tweet_id', 'text' and 'label' columns.
    tokenizer : object with a ``tokenize(str)`` method, optional
        Tokenizer to apply to each lowercased text. Defaults to a new
        NLTK TweetTokenizer.

    Returns
    -------
    pandas.DataFrame
        New dataframe with columns
        ['tweet_id', 'text', 'tokenized_text', 'label'], in that order.
    """

    # initialize tokenizer
    if tokenizer is None:
        tokenizer = TweetTokenizer()
    # lowercase and tokenize tweets
    tokenized_text = [tokenizer.tokenize(t.lower()) for t in df['text']]
    # add the tokens on a copy (so the caller's dataframe is left untouched),
    # then keep and reorder the columns of interest
    return df.assign(tokenized_text=tokenized_text)[
        ['tweet_id', 'text', 'tokenized_text', 'label']
    ]

In [8]:
# add the tokenized_text column to each split (train, dev, test)
train_df, dev_df, test_df = (
    tokenize_tweet_text(split_df)
    for split_df in (train_df, dev_df, test_df)
)

Confirm tokens match expectation.

In [9]:
# show the first five rows of the tokenized training split
train_df.head(5)

Unnamed: 0,tweet_id,text,tokenized_text,label
0,573246882501148672,RT @mercurypixel: @freebsdgirl At this point.....,"[rt, @mercurypixel, :, @freebsdgirl, at, this,...",0
1,569200611415035904,plz stop posting pics of me that i posted a fe...,"[plz, stop, posting, pics, of, me, that, i, po...",0
2,564568918321147905,Been doing things. Going to have some pretty a...,"[been, doing, things, ., going, to, have, some...",0
3,573222765957816320,"hello grafana/graphite/statsd server, let's se...","[hello, grafana, /, graphite, /, statsd, serve...",0
4,564632454179213312,@sschinke @TsundereRager previous versions of ...,"[@sschinke, @tsundererager, previous, versions...",0


In [10]:
# persist each tokenized split as CSV, without the dataframe index
for tokens_df, csv_path in (
    (train_df, '../data/derived/tweets_supervised_train_tokens.csv'),
    (dev_df, '../data/derived/tweets_supervised_dev_tokens.csv'),
    (test_df, '../data/derived/tweets_supervised_test_tokens.csv'),
):
    tokens_df.to_csv(path_or_buf=csv_path, index=False)

## Train count vectorizer

Here, I generate a bag-of-words representation of each tweet using sklearn's CountVectorizer, with NLTK's TweetTokenizer as its tokenizer. I do not remove stopwords, and I do not filter out words by high or low document frequency.

In [11]:
def train_vectorizer(df):
    """
    Fit sklearn's CountVectorizer on text tokenized by NLTK's TweetTokenizer.

    Parameters
    ----------
    df : dataframe with a 'text' column holding the raw tweet strings.

    Returns
    -------
    The fitted CountVectorizer.
    """

    # initialize vectorizer with the tweet tokenizer; token_pattern is only
    # used when no tokenizer is supplied, so set it to None explicitly to
    # avoid sklearn's unused-parameter warning
    vectorizer = CountVectorizer(tokenizer=TweetTokenizer().tokenize, token_pattern=None)
    # fit vectorizer to text
    vectorizer.fit(df['text'])

    return vectorizer

In [12]:
# fit the count vectorizer on the training split only
count_vectorizer = train_vectorizer(df=train_df)

In [13]:
# save vectorizer
# use a context manager so the file is flushed and closed once the pickle is written
with open('../data/derived/count_vectorizer.pkl', 'wb') as pkl_file:
    pickle.dump(count_vectorizer, pkl_file)

## Transform data into count vectors

In [14]:
def count_vectorize(count_vectorizer, df):
    """
    Transform the 'text' column of a dataframe with an already-trained CountVectorizer

    Returns whatever the vectorizer's transform() produces for that column.
    """
    return count_vectorizer.transform(df['text'])

In [15]:
# vectorize every split (train, dev, test) with the vectorizer fitted on train
train_vectors, dev_vectors, test_vectors = (
    count_vectorize(count_vectorizer, split_df)
    for split_df in (train_df, dev_df, test_df)
)

Confirm vectors have expected format and shape.

In [16]:
# print the shape and type of each split's count matrix
for split_vectors in (train_vectors, dev_vectors, test_vectors):
    print(f'{split_vectors.shape}    {type(split_vectors)}')

(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1396, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


In [17]:
# write each split's sparse count matrix to its own .npz file
for npz_path, split_vectors in (
    ('../data/derived/tweets_supervised_train_vectors_count.npz', train_vectors),
    ('../data/derived/tweets_supervised_dev_vectors_count.npz', dev_vectors),
    ('../data/derived/tweets_supervised_test_vectors_count.npz', test_vectors),
):
    scipy.sparse.save_npz(npz_path, split_vectors)