# Install necessary libraries

In [None]:
!pip install tokenize_uk
!pip install advertools
!pip install emot
!pip install stanza
!pip install emosent-py

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenize_uk
  Downloading tokenize_uk-0.2.0.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tokenize_uk
  Building wheel for tokenize_uk (setup.py) ... [?25l[?25hdone
  Created wheel for tokenize_uk: filename=tokenize_uk-0.2.0-py2.py3-none-any.whl size=4588 sha256=4b443c0c317ad85d615e6b300e91e58988ccc1663be2ed6bfa473ade6d4a25cf
  Stored in directory: /root/.cache/pip/wheels/df/b5/be/5eba684a792f1b6c4707ba47d29cd55afaac03124b448da2dc
Successfully built tokenize_uk
Installing collected packages: tokenize_uk
Successfully installed tokenize_uk-0.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy3
  Downloading pymorphy3-1.2.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m1.0 MB/s[0m et

# Import necessary packages and download the Ukrainian Stanza model

In [None]:
import pandas as pd
import re
import advertools as adv
import emot as emotic
import stanza
stanza.download("uk", verbose=False)
import nltk
import tokenize_uk
from nltk.tokenize import TweetTokenizer
from emosent import get_emoji_sentiment_rank

# Perform text cleaning and part of speech tagging

In [None]:
# load the dataset, keeping only the tweet text and the sarcasm label,
# and give both columns short lowercase names
data = (
    pd.read_csv('/content/3795text_original_tag_cleaned.csv', usecols=['NO_TAGS', 'IS_SARCASTIC'])
    .rename(columns={'NO_TAGS': 'text', 'IS_SARCASTIC': 'is_sarcastic'})
)
data.head()

Unnamed: 0,is_sarcastic,text
0,1,графіки виживання тритонів і піраміди смертнос...
1,1,@lovemyself_not оні ти мене заблокуєш я люблю ...
2,1,@k1207h03 я броньована бля😎
3,1,"мене не добавляють у різні списочки по типу ""г..."
4,1,@sorixben можу стати 😇


In [None]:
# load the stopword list; the file content is split on any whitespace
with open('/content/stopwords_ua.txt', encoding='utf-8') as file:
    stops = file.read().split()

# initialize the shared helpers for tokenization, emoticon extraction and POS tagging
# NLTK tweet tokenizer, used by the cleaning step and by the counting features below
tokenizer = TweetTokenizer()

# emot_obj is used by emoticons_count; uk_nlp is the Ukrainian Stanza pipeline
# used by perform_pos_tagging and lemmatize
emot_obj = emotic.core.emot()
uk_nlp = stanza.Pipeline('uk')

In [None]:
def perform_pos_tagging(text):
  """Tag every word of the text with its part of speech.

  The text is run through the Ukrainian Stanza pipeline (`uk_nlp`) and the
  `pos` attribute of each word is collected in document order.

  Args:
    text (str): text to be tagged

  Returns:
    str: part of speech tags separated by single spaces
  """
  tags = []
  for sentence in uk_nlp(text).sentences:
    tags.extend(token.pos for token in sentence.words)
  return ' '.join(tags)

def lemmatize(cleared_text):
  """Replace every word of the text with its lemma.

  The text is run through the Ukrainian Stanza pipeline (`uk_nlp`) and the
  `lemma` attribute of each word is collected in document order.

  Args:
    cleared_text (str): already cleaned text

  Returns:
    str: lemmas separated by single spaces
  """
  lemmas = []
  for sentence in uk_nlp(cleared_text).sentences:
    lemmas.extend(token.lemma for token in sentence.words)
  return ' '.join(lemmas)

In [None]:
def normalize_elongated_words(text):
    """Collapse emphatic character elongations inside each token.

    A run of three or more identical word characters is reduced to a single
    character, e.g. 'дуууже' -> 'дуже'. Runs of exactly two characters are
    left untouched because double letters are part of normal spelling
    ('виживання', 'життя'); collapsing them corrupts the word and, in turn,
    its lemma. The three-character threshold is the same one that
    `elongated_words_count` uses to detect elongation.

    The text is tokenized with the shared tweet tokenizer and the tokens are
    re-joined with single spaces, so the original spacing is not preserved.

    Args:
      text (str): text to be preprocessed

    Returns:
      str: text with normalized elongated words
    """
    elongation = re.compile(r'(\w)\1{2,}')
    return ' '.join(elongation.sub(r'\1', token) for token in tokenizer.tokenize(text))

def basic_cleaning(text):
  """Performs basic text cleaning, specifically removes links, user mentions,
     hashtags, latin characters, numbers, punctuation, repeated whitespace and
     leading/trailing spaces; normalizes elongated words, substitutes the
     typographic apostrophe with a plain one and lowercases the text.

  Args:
    text (str): text to be preprocessed

  Returns:
    str: cleaned text
  """
  # Links, mentions and hashtags are removed BEFORE Latin letters are stripped:
  # once the Latin letters are gone, 'http', 'pic', 'bit.ly' and the handle of
  # a mention no longer exist in the text and the patterns can never match,
  # which leaves any non-Latin remainder of a URL behind.
  text = text.replace('[link]', '')  # literal placeholder; str.strip() would treat it as a character set
  text = re.sub(r'http\S+', '', text)
  # NOTE(review): the original pattern was '.pic.\S+'; assumed to target pic.twitter.com links
  text = re.sub(r'pic\.twitter\.com/\S+', '', text)
  text = re.sub(r'bit\.ly/\S+', '', text)
  text = re.sub(r'@[A-Za-z0-9_]+', '', text)
  text = re.sub(r'#\w+', '', text)
  text = re.sub(r'[A-Za-z]+', '', text)
  text = text.lower()
  text = re.sub(r'[0-9]+', '', text)
  # ',-.' inside the class is a range that covers ',', '-' and '.'
  text = re.sub(r'[\\!"#$%&\(\)*+,-./:;<=>?\[\]^_`{|}~•@—–…¯\\ツ¯‼«»☺️„“™]+', '', text)
  text = re.sub(r'\s+', ' ', text)
  text = re.sub('’', "'", text)
  text = normalize_elongated_words(text)
  text = text.strip()
  return text

In [None]:
# clean the raw tweets, then derive POS tags and lemmas from the cleaned text
data['cleaned'] = data['text'].apply(basic_cleaning)
data['pos_tags'] = data['cleaned'].apply(perform_pos_tagging)
data['lemmatized'] = data['cleaned'].apply(lemmatize)
# persist the enriched frame (index included) for the feature engineering step
data.to_csv('dataset_tagged.csv')

In [None]:
data.head()

Unnamed: 0,is_sarcastic,text,cleaned,pos_tags,lemmatized
0,1,графіки виживання тритонів і піраміди смертнос...,графіки виживаня тритонів і піраміди смертност...,NOUN NOUN NOUN CCONJ NOUN NOUN NOUN ADJ DET ADJ,графік виживаня тритон і піраміда смертність б...
1,1,@lovemyself_not оні ти мене заблокуєш я люблю ...,оні ти мене заблокуєш я люблю каєбед вони мій ...,PRON PRON PRON VERB PRON VERB NOUN PRON DET NO...,оні ти я заблокувати я любити каєбед вони мій ...
2,1,@k1207h03 я броньована бля😎,я броньована бля 😎,PRON ADJ NOUN PUNCT,я броньований бля 😎
3,1,"мене не добавляють у різні списочки по типу ""г...",мене не добавляють у різні списочки по типу гі...,PRON PART VERB ADP ADJ NOUN ADP NOUN NOUN PRON...,я не добавляти у різний списочка по тип гівнож...
4,1,@sorixben можу стати 😇,можу стати 😇,VERB VERB PUNCT,могти стати 😇


# Perform feature engineering

In [None]:
# load the cleaned, POS-tagged and lemmatized data saved by the previous step;
# the file was written together with its index, which comes back as an 'Unnamed: 0' column
data = pd.read_csv("/content/dataset_tagged.csv")

In [None]:
def character_count(text):
    """Return the length of the text in characters."""
    return len(text)

def words_count(text):
    """Return the number of tokens that look like words.

    A token counts as a word when the pattern below (Cyrillic letters,
    apostrophe variants and hyphen) matches anywhere inside it.
    """
    word_pattern = re.compile("[А-ЩЬЮЯҐЄІЇа-щьюяґєії'`’ʼ-]+")
    return sum(1 for token in tokenizer.tokenize(text) if word_pattern.search(token))

def capital_chars_count(text):
    """Return the share of uppercase characters in the text (0 when there are none)."""
    uppercase_total = len([symbol for symbol in text if symbol.isupper()])
    if not uppercase_total:
        return 0
    return uppercase_total / character_count(text)

def exclamation_mark_count(text):
  """Return the number of '!' characters divided by the number of tokens (0 when there are none)."""
  marks = text.count('!')
  if not marks:
    return 0
  return marks / len(tokenizer.tokenize(text))

def question_mark_count(text):
  """Return the number of '?' characters divided by the number of tokens (0 when there are none)."""
  marks = text.count('?')
  if not marks:
    return 0
  return marks / len(tokenizer.tokenize(text))

def elipsis_count(text):
  """Return the number of '...' occurrences divided by the number of tokens (0 when there are none)."""
  ellipses = text.count('...')
  if not ellipses:
    return 0
  return ellipses / len(tokenizer.tokenize(text))

def full_stop_count(text):
  """Return the number of '.' characters divided by the number of tokens (0 when there are none).

  Every '.' is counted, including those that are part of an ellipsis.
  """
  dots = text.count('.')
  if not dots:
    return 0
  return dots / len(tokenizer.tokenize(text))

# count ratio of quoted fragments in text
def words_in_quotes_count(text, length_in_words):
    """Return the number of quoted fragments divided by the number of words.

    A fragment is text enclosed in a pair of double quotes or a pair of single
    quotes. Matching is non-greedy so that several quoted fragments in one
    text are counted separately instead of being merged into a single span.
    A single quote that touches a word character on the outside is treated as
    an apostrophe (as in "м'ясо") and does not open or close a fragment.

    Args:
      text (str): raw text
      length_in_words (int): number of words in the text

    Returns:
      float | int: fragments per word; 0 when nothing is quoted or when
      length_in_words is 0 (the ratio is undefined for a text without words)
    """
    if not length_in_words:
        return 0
    quoted = re.findall(r"(?<!\w)'.+?'(?!\w)|\".+?\"", text)
    if quoted:
        return len(quoted) / length_in_words
    return 0
    
def sentences_count(text):
    """Return the number of sentences found by `tokenize_uk.tokenize_sents`."""
    sentences = tokenize_uk.tokenize_sents(text)
    return len(sentences)

def unique_words_count(text):
    """Return the number of distinct tokens produced by the shared tokenizer."""
    distinct_tokens = {token for token in tokenizer.tokenize(text)}
    return len(distinct_tokens)
    
# count number of stopwords in text
def stopwords_count(text):
    """Return the number of tokens that appear in the stopword list `stops`.

    Tokens are compared as produced by the tokenizer (no case folding).
    """
    # build the lookup set once per call instead of once per token
    stop_set = set(stops)
    return sum(1 for token in tokenizer.tokenize(text) if token in stop_set)

# count ratio of elongated words in text
def elongated_words_count(text, length_in_words):
    """Return the share of tokens that contain an elongated vowel.

    A token is elongated when the same Ukrainian vowel occurs three or more
    times in a row. The check is case-insensitive and covers all ten vowels
    (а, е, и, і, о, у, я, ю, є, ї) because the function runs on the raw,
    not lowercased text.

    Args:
      text (str): raw text
      length_in_words (int): number of words in the text

    Returns:
      float | int: elongated tokens per word; 0 when there are none or when
      length_in_words is 0 (the ratio is undefined for a text without words)
    """
    if not length_in_words:
        return 0
    regex = re.compile(r"([аеиіоуяюєї])\1{2}", re.IGNORECASE)
    elongated = sum(1 for word in tokenizer.tokenize(text) if regex.search(word))
    if elongated:
        return elongated / length_in_words
    return 0

# count ratio of emoji in text
def emojis_count(text, length_in_words):
    """Return the number of emoji in the text divided by the number of words.

    Emoji are extracted with `advertools.extract_emoji`.

    Args:
      text (str): raw text
      length_in_words (int): number of words in the text

    Returns:
      float | int: emoji per word; 0 when there are none or when
      length_in_words is 0 (e.g. an emoji-only tweet), which previously
      raised ZeroDivisionError
    """
    if not length_in_words:
        return 0
    emoji = len(adv.extract_emoji([text])['emoji'][0])
    if emoji:
        return emoji / length_in_words
    return 0

# count ratio of emoticons in text
def emoticons_count(text, length_in_words):
    """Return the number of emoticons in the text divided by the number of words.

    Emoticons are extracted with the shared `emot_obj` helper.

    Args:
      text (str): raw text
      length_in_words (int): number of words in the text

    Returns:
      float | int: emoticons per word; 0 when there are none or when
      length_in_words is 0, which previously raised ZeroDivisionError
    """
    if not length_in_words:
        return 0
    emoticons = len(emot_obj.emoticons(text)['value'])
    if emoticons:
        return emoticons / length_in_words
    return 0

# count ratio of intensifiers in text
def intensifiers_count(pos_tags:str, length_in_words: int):
  """Return the number of intensifier tag pairs divided by the number of words.

  A pair of adjacent POS tags counts as an intensifier when the first tag is
      ADV | ADJ | PART
  and the tag immediately after it is
      PART | VERB | ADV | ADJ

  Args:
    pos_tags (str): POS tags separated by whitespace
    length_in_words (int): number of words in the text

  Returns:
    float | int: intensifier pairs per word; 0 when there are none or when
    length_in_words is 0 (the ratio is undefined for a text without words)
  """
  if not length_in_words:
    return 0
  tags = pos_tags.split()
  # pair every tag with its successor; the last tag has none, so no bounds handling is needed
  counter = sum(
      1
      for current, following in zip(tags, tags[1:])
      if current in ('ADV', 'ADJ', 'PART') and following in ('PART', 'VERB', 'ADV', 'ADJ')
  )
  if counter:
    return counter / length_in_words
  return 0

# count ratio of adverbs in text
def adverbs_count(pos_tags: str, length_in_words: int):
  """Return the number of ADV tags divided by the number of words.

  Tags are counted as whole whitespace-separated tokens.

  Returns:
    float | int: adverbs per word; 0 when there are none or when
    length_in_words is 0, which previously raised ZeroDivisionError
  """
  if not length_in_words:
    return 0
  adverbs = pos_tags.split().count('ADV')
  return adverbs / length_in_words if adverbs else 0

# count ratio of adjectives in text
def adjectives_count(pos_tags: str, length_in_words: int):
  """Return the number of ADJ tags divided by the number of words.

  Tags are counted as whole whitespace-separated tokens.

  Returns:
    float | int: adjectives per word; 0 when there are none or when
    length_in_words is 0, which previously raised ZeroDivisionError
  """
  if not length_in_words:
    return 0
  adjectives = pos_tags.split().count('ADJ')
  return adjectives / length_in_words if adjectives else 0

# count ratio of interjections in text
def interjections_count(pos_tags: str, length_in_words: int):
  """Return the number of INTJ tags divided by the number of words.

  Tags are counted as whole whitespace-separated tokens.

  Returns:
    float | int: interjections per word; 0 when there are none or when
    length_in_words is 0, which previously raised ZeroDivisionError
  """
  if not length_in_words:
    return 0
  interjections = pos_tags.split().count('INTJ')
  return interjections / length_in_words if interjections else 0

# count ratio of particles in text
def particles_count(pos_tags: str, length_in_words: int):
  """Return the number of PART tags divided by the number of words.

  Tags are counted as whole whitespace-separated tokens.

  Returns:
    float | int: particles per word; 0 when there are none or when
    length_in_words is 0, which previously raised ZeroDivisionError
  """
  if not length_in_words:
    return 0
  particles = pos_tags.split().count('PART')
  return particles / length_in_words if particles else 0

In [None]:
# compute the word-normalized ratio features for every tweet
# features derived from the raw text
text_based = {
    'quoted words': words_in_quotes_count,
    'elongated words': elongated_words_count,
    'emojis': emojis_count,
    'emoticons': emoticons_count,
}
# features derived from the POS tag string
tag_based = {
    'intensifiers': intensifiers_count,
    'adjectives': adjectives_count,
    'adverbs': adverbs_count,
    'interjections': interjections_count,
    'particles': particles_count,
}
other_features = {name: [] for name in (*text_based, *tag_based)}

for raw_text, tags in zip(data['text'], data['pos_tags']):
  # every ratio below shares the same denominator: the word count of the raw text
  length_in_words = words_count(raw_text)
  for name, extractor in text_based.items():
    other_features[name].append(extractor(raw_text, length_in_words))
  for name, extractor in tag_based.items():
    other_features[name].append(extractor(tags, length_in_words))

In [None]:
# features computed from the raw text alone: column name -> function
per_text_features = {
    'characters count': character_count,
    'words count': words_count,
    'sentences count': sentences_count,
    'capital characters count': capital_chars_count,
    'stopwords count': stopwords_count,
    'unique words count': unique_words_count,
    'exclamation marks': exclamation_mark_count,
    'question marks': question_mark_count,
    'full stops': full_stop_count,
    'elipsis': elipsis_count,
}
for column, extractor in per_text_features.items():
    data[column] = data['text'].apply(extractor)

# derived ratios built from the counts above
data['average length of word'] = data['characters count'] / data['words count']
data['average length of sentence'] = data['words count'] / data['sentences count']
data['ratio of unique words'] = data['unique words count'] / data['words count']
data['ratio of stop words'] = data['stopwords count'] / data['words count']

In [None]:
# turn the dict of per-tweet ratio lists (other_features) into a DataFrame,
# one column per feature and one row per tweet
features = pd.DataFrame(other_features)

In [None]:
# put both feature sets side by side; axis=1 aligns the two frames on their index
final_features = pd.concat([data, features], axis=1)

In [None]:
# (rows, columns) of the combined feature table
final_features.shape

(7590, 29)

In [None]:
# per-column check for missing values in the combined feature table
final_features.isna().any()

Unnamed: 0                    False
is_sarcastic                  False
text                          False
cleaned                       False
pos_tags                      False
lemmatized                    False
characters count              False
words count                   False
sentences count               False
capital characters count      False
stopwords count               False
unique words count            False
exclamation marks             False
question marks                False
full stops                    False
elipsis                       False
average length of word        False
average length of sentence    False
ratio of unique words         False
ratio of stop words           False
quoted words                  False
elongated words               False
emojis                        False
emoticons                     False
intensifiers                  False
adjectives                    False
adverbs                       False
interjections               

In [None]:
# load 2 sentiment dictionaries and combine them

# The file is read once, without treating its first line as a header: the
# columns used to be addressed as 'Всевишній' and '1', i.e. the first
# (word, score) entry was consumed as column names and dropped from the data.
# NOTE(review): confirm that tone-dict-uk.tsv really has no header line.
tone_dict = pd.read_table('/content/tone-dict-uk.tsv', header=None,
                          usecols=[0, 1], names=['word', 'tone'])

words = [word.lower() for word in tone_dict['word'].to_list()]

sent = tone_dict['tone'].to_list()

# (lowercased word, score) pairs in file order
sent_words = list(zip(words, sent))


def Convert(tup, di):
    """Group (key, value) pairs into `di` as key -> list of values.

    `di` is modified in place and returned; values are appended in the order
    in which the pairs are encountered.
    """
    for key, value in tup:
        if key not in di:
            di[key] = []
        di[key].append(value)
    return di

# Group the (word, score) pairs into a dict of word -> list of scores.
# Note: `sent_words` is rebound here from the list of pairs to that dict,
# which is the same object as `dictionary` (Convert returns its second argument),
# so this cell should not be re-run without re-running the cell that builds the pairs.
tups = sent_words
dictionary = {}
sent_words = Convert(tups, dictionary)

# second dictionary: a ';'-separated text file whose first line is skipped;
# the first field is the word (lowercased), the second its score (kept as a string)
skrup_sent = {}

with open("sentiment_ua.txt", "r", encoding="utf-8") as f:
  entries = f.readlines()[1:]
  for entry in entries:
    fields = entry.split(';')
    skrup_sent[fields[0].lower()] = fields[1].strip()

def combined_dicts(dic1, dic2):
  """Copy into `dic2` every entry of `dic1` whose key `dic2` does not have yet.

  Entries already present in `dic2` win. `dic2` is modified in place and
  returned; `dic1` is left untouched.
  """
  for key, value in dic1.items():
    dic2.setdefault(key, value)
  return dic2

# merge both dictionaries; entries of skrup_sent take precedence and
# `combined` is the same object as skrup_sent (combined_dicts returns its second argument)
combined = combined_dicts(sent_words, skrup_sent)

# entries taken from the first dictionary are lists of scores: keep the first score only
for key in combined:
  entry = combined[key]
  if isinstance(entry, list):
    combined[key] = entry[0]

In [None]:
def sentiment(text):
    """Calculates a sentiment score for the given text.

    Every whitespace-separated token is looked up in the combined sentiment
    dictionary; tokens missing from it are tried as emoji via
    `get_emoji_sentiment_rank`. Tokens with a positive score count as
    positive, tokens with a negative score as negative, and tokens with a
    zero score or no score at all are neutral.

    Args:
      text (str): text to calculate sentiment on

    Returns:
      float: share of positive tokens minus share of negative tokens, in
      [-1, 1]; 0.0 for a text without tokens
    """
    words = text.split()
    total = len(words)
    if not total:
        # nothing to score; avoids dividing by zero below
        return 0.0
    negative = 0
    positive = 0
    for word in words:
        if word in combined:
            score = float(combined[word])
        else:
            try:
                # looked up once; the lookup fails for tokens that are not known emoji
                score = float(get_emoji_sentiment_rank(word)['sentiment_score'])
            except Exception:
                continue
        if score > 0:
            positive += 1
        elif score < 0:
            negative += 1
    pr = positive / total
    nr = negative / total
    return pr - nr

In [None]:
# for key, value in combined.items():
#   if int(value) == 2:
#     combined[key] = 1
#   elif int(value) == -2:
#     combined[key] = -1

In [None]:
def find_contradiction(text):
    """Check whether contradicting sentiments are present in the text.

    Every whitespace-separated token is looked up in the combined sentiment
    dictionary; tokens missing from it are tried as emoji via
    `get_emoji_sentiment_rank`. Scores are parsed with float(), the same way
    `sentiment` does, so non-integer scores do not fail. Tokens with a zero
    score or no score at all are neutral.

    Args:
      text (str): text to be checked

    Returns:
      int: 1 when the text has at least one positive and at least one
      negative token, otherwise 0
    """
    negative = 0
    positive = 0
    for word in text.split():
        if word in combined:
            score = float(combined[word])
        else:
            try:
                # looked up once; the lookup fails for tokens that are not known emoji
                score = float(get_emoji_sentiment_rank(word)['sentiment_score'])
            except Exception:
                continue
        if score > 0:
            positive += 1
        elif score < 0:
            negative += 1
    if positive and negative:
        return 1
    return 0

In [None]:
# `df` was not defined anywhere in this notebook, so this cell failed on a fresh kernel.
# NOTE(review): assumed to be the combined feature table (it holds 'lemmatized'
# together with all engineered features) - confirm this is the intended frame.
df = final_features.copy()
df['sentiment'] = df['lemmatized'].apply(sentiment)
df['contradiction'] = df['lemmatized'].apply(find_contradiction)

In [None]:
# save the frame with the added 'sentiment' and 'contradiction' columns (index included)
df.to_csv('sent_contra_added.csv')