The goal of this notebook is to extract the various feature dimensions that will assist in network construction and community detection:

1. Tweet attributes (from previous notebooks):
  * Retweeted from...
  * Mentions
  * Links
  * Hashtags
1. Parts of Speech (POS) recognition
  * Named Entity Recognition
  * Noun-phrases
  * Pronouns
1. List and tabulate counts for:
  * Clusivity terms
  * Affinity terms
  * "Absolute" terms
1. Form probability arrays from:
  * Sentiments
  * Political typologies
1. Generate data frame for network construction

# Setup for Session

Session GPU information:

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
# Print the version of the CUDA compiler (nvcc) found on the runtime's PATH.
!nvcc --version

Session runtime information:

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, expressed in decimal gigabytes (1e9 bytes).
# Note that the message below says "available" although the value is the total.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

## Load Libraries

Install required libraries:

In [None]:
!pip install emoji
!pip install --upgrade spacy
!pip install spacy[transformers]
!python -m spacy download en_core_web_trf
!pip install transformers[sentencepiece]


In [None]:
import spacy
import en_core_web_trf
import emoji
import pandas as pd
import re
from spacy.matcher import Matcher
from IPython.utils import io
import gc
from tqdm.auto import tqdm
import torch


## Load Data

In [None]:
# Mount Google Drive at /content/drive; the data files read and written by
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Alternative input file (disabled):
#offline_tweets_df = pd.read_pickle('/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/cleaned_offline_tweets_df_large.pickle')

# NOTE: unpickling can execute arbitrary code - only load files you trust.
tweets_pickle_path = '/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/cleaned_tweets_large_Piper_sentiment.pickle'
offline_tweets_df = pd.read_pickle(tweets_pickle_path)

# Summarise columns, dtypes and non-null counts of the loaded frame.
offline_tweets_df.info()

In [None]:
# Preview the first rows of the loaded tweets.
offline_tweets_df.head()

Clean up tweets:

In [None]:
# An "@" followed by letters, digits, underscores or colons (a user mention,
# including a trailing colon such as in "@name:").
pat1_user = r'@[A-Za-z0-9_:]+'
# An http/https URL made of letters, digits, dots and slashes.
pat2_http = r'https?://[A-Za-z0-9./]+'
# Retweet marker: "RT", one whitespace character, then an @-handle.
# Written as a raw string so that "\s" reaches the regex engine verbatim
# instead of being an invalid string escape.
pat3_retweet = r'(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)'

def preprocess(text, pat1repl='@user', pat2repl='http'):
    """Mask user mentions and links in a tweet and strip retweet markers.

    The substitutions run in a fixed order: mentions first, then links, then
    retweet markers. Because mentions are already replaced when the retweet
    marker is searched for, a marker is only removed if `pat1repl` itself
    still looks like an @-handle (as the default '@user' does).

    Args:
        text: The text to clean. Values that are not `str` (for example
            `None` or a float NaN standing in for missing text) are returned
            unchanged instead of raising inside `re.sub`.
        pat1repl: Replacement for every match of `pat1_user`.
        pat2repl: Replacement for every match of `pat2_http`.

    Returns:
        The cleaned string, or `text` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    subbed = re.sub(pat1_user, pat1repl, text)
    subbed = re.sub(pat2_http, pat2repl, subbed)
    # remove retweet tags in tweet
    subbed = re.sub(pat3_retweet, '', subbed)
    return subbed

# Cleaned tweet text goes into a new column 'text4'; the user descriptions
# are cleaned in place. The last line shows the columns side by side.
offline_tweets_df['text4'] = offline_tweets_df['text3'].map(preprocess)
offline_tweets_df['user_descr'] = offline_tweets_df['user_descr'].map(preprocess)
display(offline_tweets_df[['user_descr', 'text3', 'text4']])

# Extract Part of Speech for Analysis

In [None]:
# Let spacy use GPU if possible (much faster runtimes)
# NOTE(review): this call is made before the model is loaded below; keep that
# order - spaCy's GPU helpers are presumably meant to run before loading.
spacy.prefer_gpu()

# Load language model
nlp = en_core_web_trf.load()
# Set the "flush_cache_chance" attribute on the transformer component's model.
# NOTE(review): presumably the probability with which cached GPU memory is
# released after a batch - confirm against the spacy-transformers docs.
nlp.get_pipe("transformer").model.attrs["flush_cache_chance"] = 0.4


Testing `spacy` extraction pipeline.

In [None]:
%%time

my_tweets = offline_tweets_df.sample(n=5)['text4']

with io.capture_output() as captured:
  for twt in nlp.pipe(my_tweets, batch_size=300):
    print("Noun phrases: ", [chunk.text for chunk in twt.noun_chunks])
    print("Pronouns: ", [token for token in twt if token.pos_ == "PRON"])
    print("Determiners: ", [token for token in twt if token.pos_ == "DET"])
    print("Named Entities: ", [(entity.text, entity.label_) for entity in twt.ents])

print(captured)

# Extract Clusivity, Affinity, Othering pronouns and Absolutives

Functions for extracting selected parts of speech.

In [None]:
def extract_pronouns(texts, batch: int = 16):
  """Collect the lower-cased pronoun tokens of every text.

  Args:
    texts: Sized collection of strings (e.g. a pandas Series or a list).
    batch: Number of texts passed to `nlp.pipe` per batch.

  Returns:
    A list with one entry per text, in input order; each entry is the list
    of lower-cased tokens whose part-of-speech tag is "PRON".
  """
  # Wrapping the pipe iterator lets tqdm advance and close the bar on its own;
  # len() works for plain lists as well, unlike the `.shape` attribute.
  docs = tqdm(nlp.pipe(texts, batch_size=batch), total=len(texts))
  return [[token.lower_ for token in doc if token.pos_ == "PRON"] for doc in docs]

def extract_determiners(texts, batch: int = 16):
  """Collect the lower-cased determiner tokens of every text.

  Args:
    texts: Sized collection of strings (e.g. a pandas Series or a list).
    batch: Number of texts passed to `nlp.pipe` per batch.

  Returns:
    A list with one entry per text, in input order; each entry is the list
    of lower-cased tokens whose part-of-speech tag is "DET".
  """
  # Wrapping the pipe iterator lets tqdm advance and close the bar on its own;
  # len() works for plain lists as well, unlike the `.shape` attribute.
  docs = tqdm(nlp.pipe(texts, batch_size=batch), total=len(texts))
  return [[token.lower_ for token in doc if token.pos_ == "DET"] for doc in docs]

def extract_noun_phrases(texts, batch: int = 16):
  """Collect the noun chunks of every text.

  Args:
    texts: Sized collection of strings (e.g. a pandas Series or a list).
    batch: Number of texts passed to `nlp.pipe` per batch.

  Returns:
    A list with one entry per text, in input order; each entry is the list
    of noun-chunk strings spaCy found in that text.
  """
  # Wrapping the pipe iterator lets tqdm advance and close the bar on its own;
  # len() works for plain lists as well, unlike the `.shape` attribute.
  docs = tqdm(nlp.pipe(texts, batch_size=batch), total=len(texts))
  return [[chunk.text for chunk in doc.noun_chunks] for doc in docs]

def extract_named_entities(texts, batch: int = 16):
  """Collect the named entities of every text.

  Args:
    texts: Sized collection of strings (e.g. a pandas Series or a list).
    batch: Number of texts passed to `nlp.pipe` per batch.

  Returns:
    A list with one entry per text, in input order; each entry is a list of
    `(entity_text, entity_label)` tuples.
  """
  # Wrapping the pipe iterator lets tqdm advance and close the bar on its own;
  # len() works for plain lists as well, unlike the `.shape` attribute.
  docs = tqdm(nlp.pipe(texts, batch_size=batch), total=len(texts))
  return [[(entity.text, entity.label_) for entity in doc.ents] for doc in docs]


Set up `spaCy` morphology matching rules to categorize clusivity (inclusive/exclusive), affinity (affiliated/associated), and absolute terms.

Adapted from [Finding linguistic patterns using `spaCy`](https://applied-language-technology.readthedocs.io/en/latest/notebooks/part_iii/02_pattern_matching.html)

In [None]:
# Every rule is a single-token pattern. The pronoun rules require the token
# to be tagged PRON and its morphology to contain all listed features.

# Strong inclusivity: first-person singular pronouns
incl_affil = [{'POS': 'PRON', 'MORPH': {'IS_SUPERSET': ['Number=Sing','Person=1']}}]

# Weak inclusivity: first-person plural pronouns
incl_assoc = [{'POS': 'PRON', 'MORPH': {'IS_SUPERSET': ['Number=Plur','Person=1']}}]

# Strong exclusivity: second-person pronouns (any number)
excl_affil = [{'POS': 'PRON', 'MORPH': {'IS_SUPERSET': ['Person=2']}}]

# Weak exclusivity: third-person plural pronouns
excl_assoc = [{'POS': 'PRON', 'MORPH': {'IS_SUPERSET': ['Number=Plur','Person=3']}}]

# Absolute words (i.e. "each", "every", "all", "none"), matched on the
# lower-cased token text regardless of part of speech
abs_terms = [{'LOWER':{'IN':['each','all','every','none']}}]

# Register each pattern under the rule name that later becomes the prefix of
# the '<name>_tok' / '<name>_score' feature columns.
morph_matcher = Matcher(vocab=nlp.vocab)
for rule_name, rule_pattern in [
    ('incl_affil', incl_affil),
    ('incl_assoc', incl_assoc),
    ('excl_affil', excl_affil),
    ('excl_assoc', excl_assoc),
    ('abs_terms', abs_terms),
]:
  morph_matcher.add(rule_name, patterns=[rule_pattern])


Define functions to extract feature tokens and scores from tweets:

In [None]:
def extract_features(texts, matcher_obj, batch:int=8, feature_names=None):
  """Run the matcher over every text and tabulate matched tokens and counts.

  Args:
    texts: Sized collection of strings (e.g. a pandas Series or a list).
    matcher_obj: Callable applied to each parsed document; it must yield
      `(match_id, start, end)` triples, as a spaCy `Matcher` does.
    batch: Number of texts passed to `nlp.pipe` per batch.
    feature_names: Names of the matcher rules to tabulate. Defaults to the
      five rules used in this notebook ('incl_affil', 'incl_assoc',
      'excl_affil', 'excl_assoc', 'abs_terms').

  Returns:
    A dict with two keys per feature name, '<name>_tok' and '<name>_score'.
    Each value is a list with one entry per text, in input order: the list
    of matched token texts, and the number of matches, respectively.

  Raises:
    KeyError: If the matcher reports a rule whose name is not listed in
      `feature_names`.
  """
  if feature_names is None:
    feature_names = ('incl_affil', 'incl_assoc', 'excl_affil',
                     'excl_assoc', 'abs_terms')

  print('Beginning feature extraction:')

  features = {}
  for name in feature_names:
    features[name + '_tok'] = []
    features[name + '_score'] = []

  # tqdm wraps the pipe iterator so the bar advances and closes by itself.
  for twt in tqdm(nlp.pipe(texts, batch_size=batch), total=len(texts)):
    matched = {name: [] for name in feature_names}

    for match_id, start, end in matcher_obj(twt):
      # Resolve the rule name through the document's own vocab rather than
      # the module-level `nlp`, so the lookup always uses the string store
      # the matched document belongs to.
      feat_name = twt.vocab.strings[match_id]
      matched[feat_name].append(twt[start:end].text)

    for name, tokens in matched.items():
      features[name + '_tok'].append(tokens)
      # One point per match, i.e. the number of matched tokens.
      features[name + '_score'].append(len(tokens))

  print("Feature extraction complete.")
  return features

Create data frame for extracted parts of speech.

In [None]:
# Work on a copy so that the feature columns added to tweet_features_df in
# later cells do not modify offline_tweets_df.
tweet_features_df = offline_tweets_df.copy()

tweet_features_df.head()


Extract features:

In [None]:
# Run the morphology matcher over every cleaned tweet; the result is a dict
# of per-tweet lists keyed by '<rule>_tok' and '<rule>_score'.
discourse_features = extract_features(tweet_features_df['text4'], morph_matcher)

Add discourse features to new data frame:

In [None]:
%%time
for key in discourse_features:
  tweet_features_df[key] = pd.Series(discourse_features[key])

A couple of last features:

* whether it is a reply tweet or not,
* whether it has been retweeted,
* the named entities in the tweet, and
* the noun phrases contained in the tweet

In [None]:
# Boolean flags derived from existing columns.
tweet_features_df['is_reply'] = (tweet_features_df['reply_to_id'] > 0)
tweet_features_df['been_retweeted'] = (tweet_features_df['retweet_count'] > 0)

# Named entities and noun phrases are both read off the same parsed document,
# so the (expensive) pipeline is run once over the tweets instead of once per
# feature.
named_entities = []
noun_phrases = []
for doc in tqdm(nlp.pipe(tweet_features_df['text4'], batch_size=16),
                total=len(tweet_features_df)):
  named_entities.append([(entity.text, entity.label_) for entity in doc.ents])
  noun_phrases.append([chunk.text for chunk in doc.noun_chunks])

# Lists are assigned positionally, one entry per row.
tweet_features_df['named_entities'] = named_entities
tweet_features_df['noun_phrases'] = noun_phrases

In [None]:
# Preview the frame with the newly added feature columns.
tweet_features_df.head()

# Save results to `pickle`

In [None]:
# List all column names, as a reference for the selection in the next cell.
tweet_features_df.columns

In [None]:
# Columns to carry forward into the saved feature frame.
include_cols = [
    'id', 'reply_to_id', 'retweet_count', 'favorite_count',
    'is_quote_status', 'user_id', 'user_description', 'text',
    'tweet category', 'text2', 'text3', 'user_descr',
    'is_retweet', 'retweeted_from', 'mentioned', 'hashtags',
    'links', 'text4', 'named_entities', 'noun_phrases',
    'incl_affil_tok', 'incl_affil_score', 'incl_assoc_tok',
    'incl_assoc_score', 'excl_affil_tok', 'excl_affil_score',
    'excl_assoc_tok', 'excl_assoc_score', 'abs_terms_tok',
    'abs_terms_score', 'is_reply', 'been_retweeted', 'Piper_typ',
    'tweet', 'cat', 'neg', 'pos', 'neu', 'comp', 'TBpol', 'hfs',
    'cdf_neg', 'cdf_neu', 'cdf_pos',
]

# Keep the frame's columns that appear in the list above (in the frame's own
# column order); listed names that the frame does not have are simply absent
# from the result.
keep_mask = tweet_features_df.columns.isin(include_cols)
feat_df = tweet_features_df.loc[:, keep_mask].copy()

features_pickle_path = '/content/drive/MyDrive/Piper Gradient/Not-So-Twitterpated/data/NST03_extracted_features.pickle'
feat_df.to_pickle(features_pickle_path)

In [None]:
# Preview the frame that was written to disk.
feat_df.head()