## Set up

Load libraries and data.

In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize.casual import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pickle
import scipy.sparse

In [2]:
# read the unsupervised tweet dataset into a dataframe
filepath_in = '../data/derived/tweets_unsupervised.csv'
tweet_df = pd.read_csv(filepath_in)

Note: Much of this code is repurposed from the train_supervised_data.ipynb notebook, with minor changes to file names and to account for the data not being split into train/dev/test datasets.

## Tokenize text

Here, I lowercase and tokenize the text with NLTK's TweetTokenizer. In reality, this step of the analysis is done as part of vectorizing the data, but I added it here manually to explore the tokens created.

In [3]:
def tokenize_tweet_text(df):
    """
    Create tokenized text column in dataframe with text lowercased and then tokenized by NLTK's TweetTokenizer

    The dataframe passed in is left unmodified: the tokens are added to a new dataframe, which is
    written to '../data/derived/tweets_unsupervised_tokens.csv' and returned.

    Parameters
    ----------
    df : dataframe with (at least) 'tweet_id', 'text' and 'label' columns; every 'text' entry must be a string

    Returns
    -------
    New dataframe with the columns 'tweet_id', 'text', 'tokenized_text', 'label' (in that order),
    where each 'tokenized_text' entry is the list of tokens for that tweet
    """

    # initialize tokenizer
    tokenizer = TweetTokenizer()
    # lowercase and tokenize tweets
    tokenized_text = [tokenizer.tokenize(t.lower()) for t in df['text']]
    # add tokens on a new dataframe (assign does not mutate the caller's dataframe) and reorder columns
    df = df.assign(tokenized_text=tokenized_text)[['tweet_id','text','tokenized_text','label']]
    # write to CSV
    filepath_out = '../data/derived/tweets_unsupervised_tokens.csv'
    df.to_csv(path_or_buf=filepath_out, index=False)

    return df

In [4]:
# tokenize the tweet text; tweet_df is rebound to the returned dataframe, which has a tokenized_text column
tweet_df = tokenize_tweet_text(tweet_df)

Confirm tokens match expectation.

In [5]:
# preview the first rows of the dataframe to inspect the tokenized_text column
tweet_df.head()

Unnamed: 0,tweet_id,text,tokenized_text,label
0,575742743730745344,@HPluckrose @Feminazi_Front men and women have...,"[@hpluckrose, @feminazi_front, men, and, women...",0
1,575417042150387712,@nat_com1 @Feminazi_Front We know there is a b...,"[@nat_com1, @feminazi_front, we, know, there, ...",0
2,603682460191203328,@ShaePhoenix Would looooove to see a feminazi ...,"[@shaephoenix, would, looooove, to, see, a, fe...",0
3,570994652993556481,RT @PeerWorker: @freebsdgirl You just lost $10...,"[rt, @peerworker, :, @freebsdgirl, you, just, ...",0
4,576554873468035072,"@eugenegaytard BUT BUT WOMEN CAN""T RAPE! no th...","[@eugenegaytard, but, but, women, can, "", t, r...",0


## Bag of words representation

Here, I generate a bag of words representation of each tweet using sklearn's CountVectorizer, which tokenizes the tweet text with NLTK's TweetTokenizer. I do not remove stopwords or apply a cutoff for high or low document frequency.

In [6]:
def train_count_vectorizer(df, vectorizer_name):
    """
    Train sklearn's CountVectorizer with text tokenized by NLTK's TweetTokenizer

    The fitted vectorizer is pickled to
    '../data/derived/models/unsupervised_vectorizer_{vectorizer_name}.pkl' and returned.

    Parameters
    ----------
    df : dataframe with a 'text' column holding the tweet text to fit on
    vectorizer_name : string inserted into the name of the pickle file

    Returns
    -------
    The fitted CountVectorizer
    """

    # initialize vectorizer with the tweet tokenizer; all other CountVectorizer settings are left at their defaults
    vectorizer = CountVectorizer(tokenizer = TweetTokenizer().tokenize)
    # fit vectorizer to text
    vectorizer.fit(df['text'])

    # write to file; the with-block closes the file handle once the pickle has been written
    filepath_out = f'../data/derived/models/unsupervised_vectorizer_{vectorizer_name}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(vectorizer, file_out)

    return vectorizer

def count_vectorize(count_vectorizer, df, vectorizer_name, dataset_name):
    """
    Use a trained CountVectorizer to transform text into count vectors

    The vectors are saved to
    '../data/derived/vectors/vector{vectorizer_name}_{dataset_name}.npz' and returned.

    Parameters
    ----------
    count_vectorizer : trained vectorizer whose transform() is applied to the text
    df : dataframe with a 'text' column holding the tweet text to transform
    vectorizer_name : string inserted into the name of the saved file (e.g. 'count')
    dataset_name : string inserted into the name of the saved file (e.g. 'unsupervised')

    Returns
    -------
    The sparse matrix produced by count_vectorizer.transform
    """

    # transform data into count vectors
    vectors = count_vectorizer.transform(df['text'])

    # save vectors to files, naming the file after the vectorizer and dataset
    filepath_out = f'../data/derived/vectors/vector{vectorizer_name}_{dataset_name}.npz'
    scipy.sparse.save_npz(filepath_out, vectors)

    return vectors

In [7]:
# fit the bag of words vectorizer on the tweets
count_vectorizer = train_count_vectorizer(df=tweet_df, vectorizer_name='count')

# turn the tweets into count vectors with the fitted vectorizer
count_vectors = count_vectorize(
    count_vectorizer=count_vectorizer,
    df=tweet_df,
    vectorizer_name='count',
    dataset_name='unsupervised',
)

# sanity check: shape and type of the resulting vectors
print(f'{count_vectors.shape}    {type(count_vectors)}')

(3839, 10536)    <class 'scipy.sparse.csr.csr_matrix'>


## TF-IDF representation

Here, I generate a term frequency-inverse document frequency representation of each tweet using sklearn's TfidfVectorizer, which tokenizes the tweet text with NLTK's TweetTokenizer. I do not remove stopwords or apply a cutoff for high or low document frequency.

In [8]:
def train_tfidf_vectorizer(df, vectorizer_name):
    """
    Train sklearn's TfidfVectorizer with text tokenized by NLTK's TweetTokenizer

    The fitted vectorizer is pickled to
    '../data/derived/models/unsupervised_vectorizer_{vectorizer_name}.pkl' and returned.

    Parameters
    ----------
    df : dataframe with a 'text' column holding the tweet text to fit on
    vectorizer_name : string inserted into the name of the pickle file

    Returns
    -------
    The fitted TfidfVectorizer
    """

    # initialize vectorizer with the tweet tokenizer; all other TfidfVectorizer settings are left at their defaults
    vectorizer = TfidfVectorizer(tokenizer = TweetTokenizer().tokenize)
    # fit vectorizer to text
    vectorizer.fit(df['text'])

    # write to file; the with-block closes the file handle once the pickle has been written
    filepath_out = f'../data/derived/models/unsupervised_vectorizer_{vectorizer_name}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(vectorizer, file_out)

    return vectorizer

def tfidf_vectorize(tfidf_vectorizer, df, vectorizer_name, dataset_name):
    """
    Apply an already-trained TfidfVectorizer to the 'text' column of a dataframe,
    save the resulting TF-IDF vectors as an .npz file and return them
    """

    # destination of the saved vectors, named after the vectorizer and dataset
    filepath_out = f'../data/derived/vectors/vector{vectorizer_name}_{dataset_name}.npz'

    # run the trained vectorizer over the tweet text
    tweet_text = df['text']
    vectors = tfidf_vectorizer.transform(tweet_text)

    # persist the vectors, then hand them back to the caller
    scipy.sparse.save_npz(filepath_out, vectors)
    return vectors

In [9]:
# fit the TF-IDF vectorizer on the tweets
tfidf_vectorizer = train_tfidf_vectorizer(df=tweet_df, vectorizer_name='tfidf')

# turn the tweets into TF-IDF vectors with the fitted vectorizer
tfidf_vectors = tfidf_vectorize(
    tfidf_vectorizer=tfidf_vectorizer,
    df=tweet_df,
    vectorizer_name='tfidf',
    dataset_name='unsupervised',
)

# sanity check: shape and type of the resulting vectors
print(f'{tfidf_vectors.shape}    {type(tfidf_vectors)}')

(3839, 10536)    <class 'scipy.sparse.csr.csr_matrix'>
