In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
os.chdir("/content/drive/My Drive/SPIRS")

In [5]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.5 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 50.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 60.6 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 75.6 MB/s 
Collecting tokenizers

In [6]:
import zipfile
import pandas as pd
from csv import reader
import re
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import numpy as np
import csv
import tensorflow

In [7]:
EMOJI_DESCRIPTION_SCRUB = re.compile(r':(\S+?):')
HASHTAG_BEFORE = re.compile(r'#(\S+)')
FIND_MENTIONS = re.compile(r'@(\S+)')
LEADING_NAMES = re.compile(r'^\s*((?:@\S+\s*)+)')
TAIL_NAMES = re.compile(r'\s*((?:@\S+\s*)+)$')

In [10]:
from gensim.parsing.preprocessing import remove_stopwords
import emoji

In [9]:
pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 30.6 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=d877d0bf662b96cf185ea6a277ea86d9569d1245f02248892eb74274a0a8d341
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [11]:
def process_tweet(s, keep_emoji=True, keep_usernames=False):

  s = s.lower()

  #removing urls, htmls tags, etc
  s = re.sub(r'https\S+', r'', str(s))
  s = re.sub(r'\\n', ' ', s)
  s = re.sub(r'\s', ' ', s)
  s = re.sub(r'<br>', ' ', s)
  s = re.sub(r'&amp;', '&', s)
  s = re.sub(r'&#039;', "'", s)
  s = re.sub(r'&gt;', '>', s)
  s = re.sub(r'&lt;', '<', s)
  s = re.sub(r'\'', "'", s)

  #removing stopwords
  s = remove_stopwords(s)

  #removing emojis
  if keep_emoji:
      s = emoji.demojize(s)
  else:
      emoj = re.compile("["
      u"\U0001F600-\U0001F64F"  # emoticons
      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
      u"\U0001F680-\U0001F6FF"  # transport & map symbols
      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
      u"\U00002500-\U00002BEF"  # chinese char
      u"\U00002702-\U000027B0"
      u"\U00002702-\U000027B0"
      u"\U000024C2-\U0001F251"
      u"\U0001f926-\U0001f937"
      u"\U00010000-\U0010ffff"
      u"\u2640-\u2642" 
      u"\u2600-\u2B55"
      u"\u200d"
      u"\u23cf"
      u"\u23e9"
      u"\u231a"
      u"\ufe0f"  # dingbats
      u"\u3030"
                    "]+", re.UNICODE)

      s = emoj.sub(r'',s)

 #   s = re.sub(r"\\x[0-9a-z]{2,3,4}", "", s)

  #removing hashtags
  s = re.sub(HASHTAG_BEFORE, r'\1!!', s)


  #removing usernames

  #removing just @sign
  if keep_usernames:
      s = ' '.join(s.split())

      s = re.sub(LEADING_NAMES, r' ', s)
      s = re.sub(TAIL_NAMES, r' ', s)

      s = re.sub(FIND_MENTIONS, r'\1', s)

  #removing username completely
  else:
      s = re.sub(FIND_MENTIONS, r' ', s)
    
  #removing username tags - just in case ??
  s = re.sub(re.compile(r'@(\S+)'), r'@', s)
  user_regex = r".?@.+?( |$)|<@mention>"    
  s = re.sub(user_regex," @user ", s, flags=re.I)
  
  # Just in case -- remove any non-ASCII and unprintable characters, apart from whitespace  
  s = "".join(x for x in s if (x.isspace() or (31 < ord(x) < 127)))
  s = ' '.join(s.split())

  return s

In [25]:
file = None

with zipfile.ZipFile('spirs_history.zip') as zip:
  file = zip.open('spirs_history/SPIRS-sarcastic-history.txt', mode='r')

#dictionary = {}
tweets_array = []
old_user = None
sentences = []

i = 0
n_user_tweets = 0

for line in file:

  if i > 0000:

    try:
  
      user_id, tweet_id, tweet = re.split(r'\t+', line.decode('utf-8'))

      tweets_array.append([tweet_id, process_tweet(tweet), user_id])

    except:

      print(re.split(r'\t+', line.decode('utf-8')))

  if i == 10000:
    break

  i += 1

In [20]:
tweets_array

[['1310336957818511362',
  'yup! resort protective styles protect hair love scissors :loudly_crying_face::loudly_crying_face: lockdown, tempted cut myself!',
  '51123724'],
 ['1310336261400473600',
  ':smiling_face_with_hearts::smiling_face_with_hearts:... conflicted wanting cut vs letting grow :weary_face::weary_face::weary_face::weary_face:',
  '51123724'],
 ['1310324348729913349',
  'got lick up! :rolling_on_the_floor_laughing::rolling_on_the_floor_laughing::rolling_on_the_floor_laughing::rolling_on_the_floor_laughing::rolling_on_the_floor_laughing:',
  '51123724'],
 ['1310292631721836544',
  'rt talks loneliness attached bigger person. youre silencing dismissing fe',
  '51123724'],
 ['1310291134841794564', 'rt relationship ptsd real', '51123724'],
 ['1310281047381364736',
  'rt :speaking_head: endchildfoodpoverty!!',
  '51123724'],
 ['1310277415432859648',
  'mannnn wtf!? :rolling_on_the_floor_laughing::rolling_on_the_floor_laughing::rolling_on_the_floor_laughing::rolling_on_the_fl

In [26]:
len(tweets_array)

10000

In [27]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

from csv import writer


device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
model = model.to(device)

out_file = 'user_embeddings3_sar.csv'

with open(out_file, 'a') as f:

#with open('user_embedding.csv', newline='', mode='a', encoding='utf-8') as f_object:

  # Pass this file object to csv.writer() and get a writer object
  #writer_object = writer(f_object)

  for row in tweets_array:

    #row[0] - tweet_id, row[1] - tweet, row[2] - user_id/label

    encoded_input = tokenizer(row[1], padding=True, truncation=True, return_tensors='pt').to(device)

    # Compute token embeddings
    with torch.no_grad():
      model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    embedding = torch.mean(torch.Tensor.cpu(sentence_embeddings), 0).numpy()
    
    writer = csv.writer(f)
    writer.writerow([row[0], np.array_str((1/i)*embedding, max_line_width=np.inf), row[2]])

    print('I', end='')


Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

cuda:0
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII

In [28]:
df = pd.read_csv('user_embeddings3_sar.csv', header = None)

In [29]:
df

Unnamed: 0,0,1,2
0,1331806220596834309,[-2.76897754e-05 -1.10188114e-06 1.55866383e-...,707672778867130368
1,1328817876858515457,[-2.02395913e-05 2.61095153e-07 3.13298879e-...,707672778867130368
2,1327812984312360967,[-4.10710527e-05 2.09783120e-05 -2.04861635e-...,707672778867130368
3,1326383107117899777,[-1.12013768e-05 7.31568298e-06 -4.11455840e-...,707672778867130368
4,1326382157993046017,[-2.84063026e-05 1.90368264e-05 1.51708136e-...,707672778867130368
...,...,...,...
109995,1279588299619926023,[-6.63095605e-08 3.71013311e-06 -3.54425424e-...,18181081
109996,1276217789557157888,[-2.91709694e-06 5.58827367e-07 1.32699154e-...,18181081
109997,1276194294395809792,[ 8.63899913e-07 -1.00229981e-05 -1.43146860e-...,18181081
109998,1276193567384514560,[ 1.3221979e-06 6.3757789e-06 -3.6107190e-06 ...,18181081


In [32]:
len(df.loc[0,1])

6145

In [33]:
len(df.loc[109999,1])

12289