## Set up

Load libraries and data.

In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import scipy.sparse

In [2]:
# load datasets: read the train / dev / test splits, in that order
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/derived/tweets_supervised_train.csv',
        '../data/derived/tweets_supervised_dev.csv',
        '../data/derived/tweets_supervised_test.csv',
    )
)

Confirm datasets have expected size and format.

In [3]:
# sanity check: print the number of rows in each split (train, dev, test order)
for df in [train_df, dev_df, test_df]:
    print(f'{len(df)} records.')

6497 records.
1393 records.
1396 records.


In [4]:
# preview 3 randomly sampled training rows (no random_state is set, so the rows differ between runs)
train_df.sample(3)

Unnamed: 0,tweet_id,text,label
3402,565353393955565568,"@milfgaardian N'rage pink after ~1 week, turqu...",0
1129,596029273665744896,"hah. someone believes i was ""schooled"" instead...",0
3227,573270910729261056,"We have @crashoverridenw, an invaluable resour...",0


In [5]:
# preview 3 randomly sampled dev rows (no random_state is set, so the rows differ between runs)
dev_df.sample(3)

Unnamed: 0,tweet_id,text,label
274,568667538495651841,all of these quotes are from @wadhwa on @tldr.,0
872,572328240993464320,Feminazi ðŸ˜³,1
498,563596116831125504,@LarryWest42 nah. Most of the trees here are p...,0


In [6]:
# preview 3 randomly sampled test rows (no random_state is set, so the rows differ between runs)
test_df.sample(3)

Unnamed: 0,tweet_id,text,label
864,576516317882290176,@LauraaSilveira feminazi,1
1065,569720771863564288,@mistaphill Honestly... Belzer or Ric Ocasek o...,1
209,565365278138458112,@kencf0618 asking people to read some of the s...,0


## Tokenize text

Here, I lowercase the text and tokenize it with NLTK's TweetTokenizer. In the actual pipeline, this step happens inside the vectorizer (see "Train count vectorizer" below); I repeat it here manually only to explore the tokens it creates.

In [7]:
def tokenize_tweet_text(df, dataset_name):
    """
    Return a new dataframe with a lowercased, tokenized text column, and save it as CSV.

    Each tweet's text is lowercased and then tokenized by NLTK's TweetTokenizer.
    The caller's dataframe is not modified: the token column is added to a copy,
    so re-running the calling cell never sees a half-updated input frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'tweet_id', 'text' and 'label'. Every value in
        'text' must be a string (``.lower()`` is called on each one).
    dataset_name : str
        Inserted into the output filename
        '../data/derived/tweets_supervised_{dataset_name}_tokens.csv'.

    Returns
    -------
    pandas.DataFrame
        New dataframe with exactly the columns
        'tweet_id', 'text', 'tokenized_text', 'label', in that order.
    """

    # initialize tokenizer
    tokenizer = TweetTokenizer()
    # lowercase and tokenize each tweet
    tokens = [tokenizer.tokenize(t.lower()) for t in df['text']]
    # add the tokens on a copy (assign never touches the caller's frame) and reorder columns
    tokenized_df = df.assign(tokenized_text=tokens)[['tweet_id', 'text', 'tokenized_text', 'label']]
    # write to CSV
    filepath_out = f'../data/derived/tweets_supervised_{dataset_name}_tokens.csv'
    tokenized_df.to_csv(path_or_buf=filepath_out, index=False)

    return tokenized_df

In [8]:
# tokenize text in datasets (train, dev, test order); each call also writes a tokens CSV
train_df, dev_df, test_df = (
    tokenize_tweet_text(split_df, split_name)
    for split_df, split_name in (
        (train_df, 'train'),
        (dev_df, 'dev'),
        (test_df, 'test'),
    )
)

Confirm tokens match expectation.

In [9]:
# preview dataframe: first rows of the tokenized training set, to eyeball the tokens
train_df.head()

Unnamed: 0,tweet_id,text,tokenized_text,label
0,573246882501148672,RT @mercurypixel: @freebsdgirl At this point.....,"[rt, @mercurypixel, :, @freebsdgirl, at, this,...",0
1,569200611415035904,plz stop posting pics of me that i posted a fe...,"[plz, stop, posting, pics, of, me, that, i, po...",0
2,564568918321147905,Been doing things. Going to have some pretty a...,"[been, doing, things, ., going, to, have, some...",0
3,573222765957816320,"hello grafana/graphite/statsd server, let's se...","[hello, grafana, /, graphite, /, statsd, serve...",0
4,564632454179213312,@sschinke @TsundereRager previous versions of ...,"[@sschinke, @tsundererager, previous, versions...",0


## Train count vectorizer

Here, I generate a bag-of-words representation of each tweet by fitting sklearn's CountVectorizer on the tweet text, with NLTK's TweetTokenizer as its tokenizer. I do not remove stopwords, and I do not drop words based on high or low document-frequency cutoffs.

In [10]:
def train_count_vectorizer(df, vectorizer_name):
    """
    Fit sklearn's CountVectorizer on tweet text and pickle the fitted vectorizer.

    The vectorizer tokenizes with NLTK's TweetTokenizer; every other
    CountVectorizer setting is left at its default.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column; the vectorizer is fit on it.
    vectorizer_name : str
        Inserted into the output filename
        '../data/derived/{vectorizer_name}_vectorizer.pkl'.

    Returns
    -------
    CountVectorizer
        The fitted vectorizer (the same object that was pickled).
    """

    # initialize vectorizer with TweetTokenizer as its tokenizer
    vectorizer = CountVectorizer(tokenizer = TweetTokenizer().tokenize)
    # fit vectorizer to text
    vectorizer.fit(df['text'])

    # write to file; the context manager closes the handle even if pickling fails
    filepath_out = f'../data/derived/{vectorizer_name}_vectorizer.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(vectorizer, file_out)

    return vectorizer

In [11]:
# train vectorizer on the training split only; the fitted vectorizer is also pickled to disk
count_vectorizer = train_count_vectorizer(train_df, 'count')

## Transform data into count vectors

In [12]:
def count_vectorize(count_vectorizer, df, vectorizer_name, dataset_name):
    """
    Turn the 'text' column of a dataframe into count vectors and save them.

    The already-fitted ``count_vectorizer`` transforms ``df['text']``; the
    resulting matrix is written with ``scipy.sparse.save_npz`` to
    '../data/derived/vectors/vector{vectorizer_name}_{dataset_name}.npz'
    and then returned.
    """

    # destination of the saved vectors
    npz_path = f'../data/derived/vectors/vector{vectorizer_name}_{dataset_name}.npz'

    # vectorize the text, persist the matrix, and hand it back
    count_matrix = count_vectorizer.transform(df['text'])
    scipy.sparse.save_npz(npz_path, count_matrix)
    return count_matrix

In [13]:
# transform data into count vectors (train, dev, test order), all with the same fitted vectorizer
train_vectors, dev_vectors, test_vectors = (
    count_vectorize(count_vectorizer, split_df, 'count', split_name)
    for split_df, split_name in (
        (train_df, 'train'),
        (dev_df, 'dev'),
        (test_df, 'test'),
    )
)

Confirm vectors have expected format and shape.

In [14]:
# print each matrix's shape and type, in train, dev, test order
for vectors in [train_vectors, dev_vectors, test_vectors]:
    print(f'{vectors.shape}    {type(vectors)}')

(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1396, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
