In [1]:
import pandas as pd
from transformers import pipeline
from modules.tweet_data import read_raw_data
from modules.spacy import spacy_twitter_model
import datetime as dt
import time

# Globally disables pandas' chained-assignment warning (SettingWithCopyWarning).
# NOTE(review): this hides the warning only; a chained assignment that writes
# to a temporary copy will still silently do nothing.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# spaCy pipeline used by the tokenizer; built by the project helper in modules.spacy.
nlp = spacy_twitter_model()

# Hugging Face sentiment-analysis pipeline placed on device 0.
# NOTE(review): no model name is given, so the library default for this task is used.
sentiment_model = pipeline(task='sentiment-analysis', device=0)

In [3]:
# Load the raw tweets via the project helper. Later cells read the `tweet_id`
# and `tweet` columns from this frame.
tweet_df = read_raw_data()

Reading data


In [4]:
def twitter_tokenizer(data,
                      model=nlp,
                      urls=True,
                      stop_words=False,
                      lowercase=True,
                      alpha_only=True,
                      hashtags=True,
                      lemma=False):
    """
    Tokenize a string with a spaCy model, filtering and normalizing the
    tokens according to the processing flags.

    Parameters:
        data: string
            String to be tokenized
        model: Spacy model
            Callable that parses ``data`` into tokens. The tokens must expose
            the custom attributes ``_.is_piclink`` and ``_.is_hashtag``;
            ideally, an output from the method spacy_twitter_model() from
            modules.spacy
        urls: bool
            If True, remove URLs and Twitter picture links
        stop_words: bool
            If True, removes stop words
        lowercase: bool
            If True, turns all tokens to lowercase
        alpha_only: bool
            If True, removes non-alphabetic tokens (see ``hashtags`` for the
            one exception)
        hashtags: bool
            Only consulted when ``alpha_only`` is True. If True, hashtag
            tokens are removed together with every other non-alphabetic
            token; if False, non-alphabetic tokens flagged as hashtags are
            kept
        lemma: bool
            If True, lemmatizes words

    Returns:
        list of str
            The surviving tokens, in their original order
    """
    parsed = model(data)
    # token collector
    tokens = []
    for token in parsed:
        # remove URLs and Twitter picture links -- only when requested.
        # (The flag is tested with `and`, not `&`: `a or b & flag` groups as
        # `a or (b & flag)`, which dropped URLs even with urls=False.)
        if urls and (token.like_url or token._.is_piclink):
            continue
        # remove stopwords
        if stop_words and token.is_stop:
            continue
        # alpha characters only. `not x & flag` groups as `not (x & flag)`,
        # which discarded every token when alpha_only=False; test the flag
        # explicitly instead.
        if alpha_only and not token.is_alpha:
            # a non-alpha token survives only if it is a hashtag and
            # hashtag removal is switched off
            if hashtags or not token._.is_hashtag:
                continue
        # lemmatize, or fall back to the raw text
        text = token.lemma_ if lemma else token.text
        # turn to lowercase
        if lowercase:
            text = text.lower()
        tokens.append(text)
    return tokens

In [5]:
# Result frame: one row per tweet, in the same row order as tweet_df.
# `sentiment` is an integer label (0 until the scoring loop overwrites it
# with +1 / -1). `score` receives the float score reported by the model, so
# it is initialised as a float to avoid an int -> float upcast on first write.
sentiment = tweet_df[['tweet_id']].copy()
sentiment['sentiment'] = 0
sentiment['score'] = 0.0

In [None]:
# Number of tweets sent to the sentiment model per call, and how often
# (in tweets) a progress line is printed.
BATCH_SIZE = 16
PROGRESS_EVERY = 10000

n_tweets = len(tweet_df)

# Positional column indices, so each batch is written with a single .iloc
# call on the frame itself. The previous `sentiment.<col>.iloc[...] = ...`
# was a chained assignment, which only works if the intermediate Series is a
# view and is a silent no-op under pandas Copy-on-Write.
sentiment_col = sentiment.columns.get_loc('sentiment')
score_col = sentiment.columns.get_loc('score')

# The model score is a float; make sure the target column can hold it without
# an implicit dtype upcast during the first write (no-op if already float).
sentiment['score'] = sentiment['score'].astype(float)

t = time.time()
for i in range(0, n_tweets, BATCH_SIZE):
    # `< BATCH_SIZE` rather than `== 0` so progress is still reported when
    # PROGRESS_EVERY is not a multiple of BATCH_SIZE.
    if i % PROGRESS_EVERY < BATCH_SIZE:
        print("{}/{}, time {:.2f}".format(i, n_tweets, (time.time() - t)/60), end="\r")
    # The model receives the cleaned tokens re-joined into a single string.
    tokenized = [' '.join(twitter_tokenizer(tweet)) for tweet in tweet_df.tweet.iloc[i:i+BATCH_SIZE]]
    sentiment_dict = sentiment_model(tokenized)
    # Label is encoded as +1 for 'POSITIVE' and -1 for anything else.
    sentiment.iloc[i:i+BATCH_SIZE, sentiment_col] = [1 if d['label'] == 'POSITIVE' else -1 for d in sentiment_dict]
    sentiment.iloc[i:i+BATCH_SIZE, score_col] = [d['score'] for d in sentiment_dict]

# Final report uses the total count (the loop variable only holds the start of
# the last batch, and is undefined when tweet_df is empty).
print("{}/{}, time {:.2f}".format(n_tweets, n_tweets, (time.time() - t)/60))

2540000/35109352, time 208.55