In [None]:
# Colab-specific: mount Google Drive inside the runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on the mounted Drive so that the relative paths
# used later in the notebook (e.g. 'spirs_history.zip') resolve against it.
import os
os.chdir("/content/drive/My Drive/SPIRS")

In [None]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.5 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 19.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 65.2 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.4 MB/s 
Collecting tokenizers

In [None]:
import zipfile
import pandas as pd
from csv import reader
import re
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import csv
import tensorflow

In [None]:
# Pre-compiled patterns for tweet clean-up.

# A ':name:' token: colon, shortest non-whitespace run, colon (group 1 = name).
# NOTE(review): not referenced by the cells visible here.
EMOJI_DESCRIPTION_SCRUB = re.compile(r':(\S+?):')
# '#' followed by a non-whitespace run; group 1 is the tag text without the '#'.
HASHTAG_BEFORE = re.compile(r'#(\S+)')
# '@' followed by a non-whitespace run; group 1 is the handle without the '@'.
FIND_MENTIONS = re.compile(r'@(\S+)')
# One or more whitespace-separated @handles at the very start of the string.
LEADING_NAMES = re.compile(r'^\s*((?:@\S+\s*)+)')
# One or more whitespace-separated @handles at the very end of the string.
TAIL_NAMES = re.compile(r'\s*((?:@\S+\s*)+)$')

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
import emoji

In [None]:
pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 7.5 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=9960a101c509b0dc8115d3db74853825a4550977e331b8eaea2c9473b5d228aa
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [None]:
#helper function

# Emoji / symbol character class stripped when keep_emoji=False.
# Compiled once here instead of being rebuilt on every process_tweet call.
_EMOJI_CHARS = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"  # NOTE: very wide range, covers far more than emoji
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"  # every supplementary-plane code point
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"  # zero-width joiner
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # variation selector-16
    u"\u3030"
    "]+", re.UNICODE)


def process_tweet(s, keep_emoji=True, keep_usernames=False):
  """Normalise one raw tweet into a lowercase, ASCII-only, single-spaced string.

  Steps, in order: lowercase; strip URLs; unescape a few HTML entities and
  flatten whitespace; drop stopwords (gensim `remove_stopwords`); handle emoji;
  rewrite hashtags; handle @mentions; drop non-ASCII / non-printable characters.

  Args:
    s: The tweet text. Non-string values are coerced with `str()`.
    keep_emoji: If True, emoji are passed through `emoji.demojize` (turned into
      text aliases). If False, characters in `_EMOJI_CHARS` are deleted.
    keep_usernames: If True, runs of @handles at the start and end of the tweet
      are dropped and remaining handles keep their name without the '@'.
      If False, every @handle is replaced by a space.

  Returns:
    The cleaned tweet as a `str` (possibly empty).
  """
  # Coerce BEFORE lowercasing: the original called s.lower() first, which
  # raised AttributeError on non-string input (e.g. a NaN read by pandas).
  s = str(s).lower()

  # Strip URLs. 'https\S+' is the original pattern; 'http://\S+' additionally
  # removes plain-http links, which were previously left in the text.
  s = re.sub(r'http://\S+|https\S+', r'', s)

  # Flatten literal "\n" sequences, real whitespace and <br> tags to spaces,
  # then unescape the HTML entities that occur in the data.
  s = re.sub(r'\\n', ' ', s)
  s = re.sub(r'\s', ' ', s)
  s = re.sub(r'<br>', ' ', s)
  s = re.sub(r'&amp;', '&', s)
  s = re.sub(r'&#039;', "'", s)
  s = re.sub(r'&gt;', '>', s)
  s = re.sub(r'&lt;', '<', s)
  # NOTE(review): as written this replaces "'" with "'" (a no-op); kept as-is
  # because the intended source character cannot be determined from here.
  s = re.sub(r'\'', "'", s)

  #removing stopwords
  s = remove_stopwords(s)

  # Emoji: convert to text aliases, or delete outright.
  if keep_emoji:
    s = emoji.demojize(s)
  else:
    s = _EMOJI_CHARS.sub(r'', s)

  # Hashtags are not removed: '#tag' is rewritten to 'tag!!'.
  s = re.sub(HASHTAG_BEFORE, r'\1!!', s)

  # @mentions
  if keep_usernames:
    # Collapse whitespace first so the leading/trailing patterns anchor cleanly.
    s = ' '.join(s.split())

    # Drop the runs of handles that open and close the tweet ...
    s = re.sub(LEADING_NAMES, r' ', s)
    s = re.sub(TAIL_NAMES, r' ', s)

    # ... and keep the remaining handles without their '@'.
    s = re.sub(FIND_MENTIONS, r'\1', s)
  else:
    # Remove every handle completely.
    s = re.sub(FIND_MENTIONS, r' ', s)

  # Safety net for anything '@'-like that survived the branch above.
  # FIND_MENTIONS is the same pattern the original recompiled inline here.
  s = re.sub(FIND_MENTIONS, r'@', s)
  user_regex = r".?@.+?( |$)|<@mention>"
  s = re.sub(user_regex, " @user ", s, flags=re.I)

  # Just in case -- remove any non-ASCII and unprintable characters, apart from whitespace
  s = "".join(x for x in s if (x.isspace() or (31 < ord(x) < 127)))
  s = ' '.join(s.split())

  return s

In [None]:
#sampling user tweets
# Reads the tab-separated history file (user_id, tweet_id, tweet) from the zip
# archive and groups the processed tweets per user.
#
# Result: `dictionary` maps user_id -> one string holding that user's processed
# tweets, each followed by a '\t' (so the string always ends with a tab).

dictionary = {}

# NOTE(review): old_user, sentences and n_user_tweets are not used by this cell;
# they are kept only in case other cells read them.
old_user = None
sentences = []
n_user_tweets = 0

i = 0  # number of lines read from the history file

tweets_by_user = {}

# Keep both the archive and the member open for exactly as long as we iterate,
# and close them deterministically afterwards. (`archive`, not `zip`, so the
# builtin is not shadowed.)
with zipfile.ZipFile('spirs_history.zip') as archive:
  with archive.open('spirs_history/SPIRS-non-sarcastic-history.txt', mode='r') as file:

    for line in file:
      i += 1

      fields = re.split(r'\t+', line.decode('utf-8'))

      # A well-formed record has exactly three fields. Some lines in the data
      # have two records fused together (a tweet with no line break before the
      # next user id); those are reported and skipped, as before.
      if len(fields) != 3:
        print(fields)
        continue

      user_id, tweet_id, tweet = fields
      tweets_by_user.setdefault(user_id, []).append(process_tweet(tweet))

# Build each per-user string once instead of re-concatenating on every tweet.
dictionary = {
    user_id: ''.join(tweet + '\t' for tweet in user_tweets)
    for user_id, user_tweets in tweets_by_user.items()
}

['301817957', '1325115154195996673', '@RobAdamsFL @SportyMama @MillerMitsu @DSofia21 @ernesto3311 @HardBodyCraig @mlandres12 @RomanGarciaJr @PeteTheStorm @marlid83 @CutlerRidgeLAZ @ChinoLutz @WMGarbageman @dpburnette @Ballgameboss @STEM08 @mojicapr @razincane83 @HugeHoopsFan @ColtenMetzger @ofcourseimajew @KING_G_GILMORE @bigpunisher305 @RB4420 @KappaCane @RobertPerera5 @ChicoHull @chadmch @4feldman @SmallsLaw @lockhart_jesse @cfhell31 @Raymond3633 @HeavyFaithSteve @Romancane @FatherOfKane @gatorfan960608 @TwentyER @youfit @grinders @madiadams_ @JacOnMac @Apple @Maureen52375 @MiamiHEAT @FSUFootball BBR370876756', '1334534491063390208', '@KateMcLoughney @BobbiPeach58 I always have some in\r\n']
['818041150066032641', '1290739498741952512', 'RT @CHARMINGMYG: please spread this https://t.co/qdq34bSDsj624692430', '1334671088559775744', 'RT @Eric_A_Stanley: It’s not that people don’t understand “defund the police” it’s that they want the antiblack, ablest, homo/transphobic,…\r\n']


In [None]:
#creating tweets representation
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (presumably (batch, tokens, hidden) -- the
            reduction below runs over dimension 1).
        attention_mask: Mask that is broadcast over the last dimension of the
            token embeddings; zero entries exclude a position from the mean.

    Returns:
        Tensor with dimension 1 reduced away: the masked mean per sequence.
    """
    token_embeddings = model_output[0]

    # Give the mask a trailing axis and stretch it to the embeddings' shape.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    masked_sum = (token_embeddings * mask).sum(1)
    # Lower bound on the count avoids division by zero for an all-zero mask.
    kept_count = mask.sum(1).clamp(min=1e-9)
    return masked_sum / kept_count

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

from csv import writer


device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
model = model.to(device)

out_file = 'user_embeddings_nonsar.csv'

# Number of tweets encoded per forward pass.
BATCH_SIZE = 128

# One CSV row per user: [user_id, embedding rendered as a single-line numpy string].
# NOTE: the file is opened in append mode, so re-running this cell adds the same
# users again; delete the file first for a clean run.
# newline='' is what the csv module expects of files it writes to.
with open(out_file, 'a', newline='') as f:

  # Built once; named csv_writer so the imported `writer` function is not rebound.
  csv_writer = csv.writer(f)

  for key in dictionary:

    # Every tweet in the per-user string is followed by '\t', so the final
    # element of the split is the empty remainder after the last tab -- drop it.
    tweets = dictionary[key].split('\t')[:-1]

    if not tweets:
      continue

    embedding_sum = None

    for start in range(0, len(tweets), BATCH_SIZE):

      batch = tweets[start : start + BATCH_SIZE]

      encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)

      # Compute token embeddings
      with torch.no_grad():
        model_output = model(**encoded_input)

      # Perform pooling
      sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

      # Normalize embeddings
      sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

      # Accumulate the SUM over tweets (not the batch mean). Averaging batch
      # means over-weighted a short final batch, and dividing by a batch
      # counter that was also bumped for an empty trailing batch scaled the
      # vector down for users with an exact multiple of BATCH_SIZE tweets.
      batch_sum = sentence_embeddings.sum(dim=0).cpu().numpy()
      embedding_sum = batch_sum if embedding_sum is None else embedding_sum + batch_sum

    # Mean sentence embedding over all of this user's tweets.
    user_embedding = embedding_sum / len(tweets)

    csv_writer.writerow([key, np.array_str(user_embedding, max_line_width=np.inf)])


cuda:0
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII