### PREPROCESSING

In [None]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 8.8 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=bf195021e1d8d0d10ad582d2987670939c71f28ffcbce644f2ac26de9ce2b65f
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [None]:
import numpy as np
import pandas as pd
import re
import emoji
from gensim.parsing.preprocessing import remove_stopwords
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
string.punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer


def remove_na_from_column(df, column_name):
    """Return a new frame without the rows whose ``column_name`` is missing.

    The remaining rows are renumbered from 0. The frame passed in is left
    untouched.
    """
    return df.dropna(subset=[column_name]).reset_index(drop=True)

def fill_na_from_column(df, column_name):
    """Replace missing values in ``column_name`` with empty strings.

    The column is overwritten on ``df`` itself, and that same frame is returned.
    """
    filled_column = df[column_name].fillna('')
    df[column_name] = filled_column
    return df

# Text between two colons with no whitespace inside, e.g. ':smile:'; presumably targets the
# aliases emitted by emoji.demojize. NOTE(review): not referenced in the code visible here.
EMOJI_DESCRIPTION_SCRUB = re.compile(r':(\S+?):')
# '#' followed by non-whitespace; group 1 is the tag text without the '#'.
HASHTAG_BEFORE = re.compile(r'#(\S+)')
# '@' followed by non-whitespace; group 1 is the name without the '@'.
FIND_MENTIONS = re.compile(r'@(\S+)')
# One or more '@name' tokens at the very start of the text (after optional whitespace).
LEADING_NAMES = re.compile(r'^\s*((?:@\S+\s*)+)')
# One or more '@name' tokens at the very end of the text.
TAIL_NAMES = re.compile(r'\s*((?:@\S+\s*)+)$')

def preprocess_tweets(df, column_name='sar_text', keep_emoji=True):
    """Clean every tweet stored in ``column_name`` with ``process_tweet``.

    ``process_tweet`` is always invoked with ``keep_usernames=False``;
    ``keep_emoji`` is forwarded unchanged. The column is overwritten on ``df``
    itself, and that same frame is returned.
    """
    cleaned = df[column_name].transform(
        process_tweet,
        keep_emoji=keep_emoji,
        keep_usernames=False,
    )
    df[column_name] = cleaned
    return df

# Characters stripped when emoji are not kept. Built once at definition time
# rather than on every call.
_EMOJI_CHARS = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # chinese char
    u"\U00002702-\U000027B0"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # dingbats
    u"\u3030"
    "]+", re.UNICODE)


def process_tweet(s, keep_emoji=True, keep_usernames=False):
    """Normalise one tweet into lower-case, ASCII-only, single-spaced text.

    Steps, in order: lower-casing, URL / HTML-entity clean-up, stop-word
    removal (gensim ``remove_stopwords``), emoji handling, hashtag rewriting
    (``#tag`` becomes ``tag!!``), mention handling, removal of every
    non-ASCII / non-printable character, and whitespace collapsing.

    Parameters
    ----------
    s : str or any object
        The raw tweet. Non-string values are converted with ``str()`` first.
    keep_emoji : bool
        If true, emoji are passed through ``emoji.demojize`` so that their
        textual alias survives the final ASCII filter; otherwise they are
        deleted.
    keep_usernames : bool
        If true, runs of ``@name`` at the very start and end of the tweet are
        dropped and the remaining mentions keep their name without the ``@``.
        If false, every ``@name`` is removed.

    Returns
    -------
    str
        The cleaned tweet; may be the empty string.
    """
    # Coerce before lower-casing: calling .lower() first raised AttributeError
    # for non-string input even though str() was applied right afterwards.
    s = str(s).lower()

    # removing urls, html tags, etc
    # 'http\S+' covers both http:// and https:// links (previously only https).
    s = re.sub(r'http\S+', r'', s)
    s = re.sub(r'\\n', ' ', s)      # literal backslash-n sequences
    s = re.sub(r'\s', ' ', s)       # any whitespace character -> plain space
    s = re.sub(r'<br>', ' ', s)
    s = re.sub(r'&amp;', '&', s)
    s = re.sub(r'&#039;', "'", s)
    s = re.sub(r'&gt;', '>', s)
    s = re.sub(r'&lt;', '<', s)
    # Unescape a literal backslash-apostrophe. The previous pattern r'\''
    # matched a bare apostrophe and replaced it with itself (a no-op).
    s = re.sub(r"\\'", "'", s)

    # removing stopwords
    s = remove_stopwords(s)

    # emoji: either turn them into text or strip them
    if keep_emoji:
        s = emoji.demojize(s)
    else:
        s = _EMOJI_CHARS.sub(r'', s)

    # hashtags: drop the '#' and mark the tag with a trailing '!!'
    s = re.sub(HASHTAG_BEFORE, r'\1!!', s)

    # usernames
    if keep_usernames:
        # removing just the @ sign (leading / trailing runs of names are dropped)
        s = ' '.join(s.split())

        s = re.sub(LEADING_NAMES, r' ', s)
        s = re.sub(TAIL_NAMES, r' ', s)

        s = re.sub(FIND_MENTIONS, r'\1', s)
    else:
        # removing username completely
        s = re.sub(FIND_MENTIONS, r' ', s)

    # removing username tags - just in case: anything that still looks like a
    # mention is collapsed to '@' and then rewritten to the ' @user ' placeholder
    s = re.sub(FIND_MENTIONS, r'@', s)
    user_regex = r".?@.+?( |$)|<@mention>"
    s = re.sub(user_regex, " @user ", s, flags=re.I)

    # Just in case -- remove any non-ASCII and unprintable characters, apart from whitespace
    s = "".join(x for x in s if (x.isspace() or (31 < ord(x) < 127)))
    s = ' '.join(s.split())

    return s

def remove_punctiation(df, column_name='tweet'):
    """Strip punctuation from every entry of ``column_name`` using ``remove_punctuation``.

    The column is overwritten on ``df`` itself, and that same frame is returned.
    The (misspelled) function name is part of the existing interface.
    """
    stripped = df[column_name].transform(remove_punctuation)
    df[column_name] = stripped
    return df
    
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Float values (e.g. the NaN used for missing entries) are returned
    unchanged.
    """
    # isinstance instead of an exact type comparison so that float subclasses
    # such as numpy.float64 NaN are passed through too, rather than failing
    # when iterated below.
    if isinstance(text, float):
        return text

    return "".join(ch for ch in text if ch not in string.punctuation)

def remove_nltk_stopwords(df, column_name='tweet'):
    """Apply ``remove_nltk_stopwords_from_tweet`` to every entry of ``column_name``.

    The column is overwritten on ``df`` itself, and that same frame is returned.
    """
    filtered = df[column_name].transform(remove_nltk_stopwords_from_tweet)
    df[column_name] = filtered
    return df
    
    
# Filled on first use so the stop-word corpus is read once, not once per tweet.
_NLTK_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it on the first call only."""
    if 'english' not in _NLTK_STOPWORD_CACHE:
        _NLTK_STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _NLTK_STOPWORD_CACHE['english']


def remove_nltk_stopwords_from_tweet(s):
    """Tokenise ``s`` with ``word_tokenize`` and drop NLTK English stop words.

    The remaining tokens are joined back with single spaces. The comparison
    is case-sensitive, exactly as the tokens come out of the tokenizer.

    NOTE(review): this needs the NLTK 'stopwords' corpus; the code visible
    here only downloads 'punkt' -- confirm it is downloaded elsewhere.
    """
    stop_words = _english_stopwords()
    return " ".join(word for word in word_tokenize(s) if word not in stop_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### USER EMBEDDINGS

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import os
# Work from the SPIRS folder on Drive; relative paths used afterwards resolve against it.
os.chdir("/content/drive/My Drive/SPIRS")

Mounted at /content/drive


In [None]:
# Show the contents of the current working directory.
!ls

spirs_history.zip	 SPIRS-sarcastic.csv
SPIRS-non-sarcastic.csv  user_mentions2.json


# USER HISTORY

In [None]:
import csv
import os
import random
import zipfile
from csv import DictWriter

import nltk

OUTPUT_FILE = 'user_embeddings_non_sarcastic_history.csv'
INPUT_FILE = 'spirs_history/SPIRS-non-sarcastic-history.txt'
TOKENS_NO = 250      # token budget of one history: tweet tokens + [SEP] + [PAD]
RANDOM_SEED = 42     # fixed so that the shuffled tweet selection is reproducible
OUTPUT_DIR = "/content/drive/My Drive/SP"
INPUT_DIR = "/content/drive/My Drive/SPIRS"


def _build_user_history(tweets, tokens_no, rng):
    """Concatenate a random selection of ``tweets`` into one padded history string.

    The tweets are shuffled with ``rng`` and appended, each followed by
    ' [SEP] ', until the next tweet would exceed ``tokens_no`` counted tokens
    (NLTK word tokens plus one per [SEP]). The string is then filled up with
    ' [PAD]' so that every history holds exactly ``tokens_no`` counted tokens.
    Previously a history that ran out of tweets received one [PAD] more than
    a history that was cut off by the budget.
    """
    sentences = ''
    count = 0
    for tweet in rng.sample(tweets, len(tweets)):
        needed = len(nltk.word_tokenize(tweet)) + 1   # +1 because of [SEP]
        if count + needed > tokens_no:
            break
        sentences += tweet + ' [SEP] '
        count += needed

    # if the number of tokens is smaller than the budget, add padding
    sentences += ' [PAD]' * (tokens_no - count)
    return sentences


history_rng = random.Random(RANDOM_SEED)

os.chdir(INPUT_DIR)

# The input is read as tab-separated lines: column 0 is the user id, column 2
# the tweet text. Lines of one user are assumed to be contiguous.
# NOTE(review): the output is opened in append mode as before, so running this
# cell again adds the same users a second time -- delete the file first if a
# clean rebuild is wanted.
with open(os.path.join(OUTPUT_DIR, OUTPUT_FILE), 'a', newline='') as csv_file:
    dictwriter_object = DictWriter(csv_file, fieldnames=['user_id', 'history'])

    with zipfile.ZipFile('spirs_history.zip') as archive:
        with archive.open(INPUT_FILE, mode='r') as file:
            old_user_id = None
            sar_text = []

            for raw_line in file:
                line = raw_line.decode('UTF-8').split('\t')
                if len(line) < 3:
                    # blank or truncated line: no tweet text to use
                    continue

                user_id = int(line[0])
                tweet_text = process_tweet(line[2])

                if len(tweet_text) == 0:
                    continue

                if old_user_id is not None and user_id != old_user_id:
                    # A new user starts: write the finished one and begin a
                    # fresh list, so tweets never leak into the next history.
                    embedding = {
                        'user_id': old_user_id,
                        'history': _build_user_history(sar_text, TOKENS_NO, history_rng),
                    }
                    dictwriter_object.writerow(embedding)
                    sar_text = []

                # The current tweet always belongs to the current user, including
                # the very first line of the file and the first tweet of each user.
                old_user_id = user_id
                sar_text.append(tweet_text)

            # The last user has no following line to trigger the write above.
            if sar_text:
                embedding = {
                    'user_id': old_user_id,
                    'history': _build_user_history(sar_text, TOKENS_NO, history_rng),
                }
                dictwriter_object.writerow(embedding)
          

# CONCAT USER HISTORY WITH TWEETS

In [None]:
from csv import reader
import csv
import os

os.chdir("/content/drive/My Drive/SPIRS")

INPUT_FILE = 'SPIRS-non-sarcastic.csv'

df = pd.read_csv(INPUT_FILE)
# Keep only the three columns used below, as an independent frame.
df = df[['sar_user', 'sar_id', 'sar_text']].copy()
df = remove_na_from_column(df, 'sar_user')
df = remove_na_from_column(df, 'sar_id')
df = remove_na_from_column(df, 'sar_text')
df = preprocess_tweets(df, 'sar_text')

# 'sar_user' holds '|'-separated fields; the last one is converted to an
# integer user id. Done as one column assignment: the former element-wise
# df['sar_user'][i] = ... was chained assignment, which triggers
# SettingWithCopyWarning and is not guaranteed to write into df.
df['sar_user'] = df['sar_user'].str.split('|').str[-1].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
import csv
import os

os.chdir("/content/drive/My Drive/SP")

OUTPUT_FILE = 'user_embeddings_non_sarcastic_beginning.csv'

end = False   #if true add tweet to the end

# The output is still opened in append mode, but the header is written only
# when the file is new or empty, so repeated runs no longer insert extra
# header rows in the middle of the data.
# NOTE(review): repeated runs still append the data rows again -- delete the
# file first if a clean rebuild is wanted.
write_header = (not os.path.exists(OUTPUT_FILE)) or os.path.getsize(OUTPUT_FILE) == 0

# Context managers guarantee both files are flushed and closed.
with open('user_embeddings_non_sarcastic_history.csv', 'r', newline='') as file_history:
    with open(OUTPUT_FILE, 'a', newline='') as file_write:
        csv_reader = csv.reader(file_history)
        writer = csv.writer(file_write)

        if write_header:
            writer.writerow(['user_id', 'sar_id', 'sar_text'])

        # Each history row is (user_id, history).
        for row in csv_reader:
            if not row:
                # empty line in the history file
                continue

            users_tweets = df[df['sar_user'] == int(row[0])]   #find tweets of user
            sar_ids = users_tweets['sar_id'].values
            tweets = users_tweets['sar_text'].values

            for sar_id, tweet in zip(sar_ids, tweets):
                if end:
                    extended_tweet = '[CLS] ' + row[1] + ' ' + tweet
                else:
                    extended_tweet = '[CLS] ' + tweet + ' [SEP] ' + row[1]

                writer.writerow([str(row[0]), str(sar_id), extended_tweet])   #user_id, sar_id, tweet + history