## Creating feature dataframes

In [4]:
import pandas as pd
import re
import numpy as np
import jamspell
import nltk

from gensim.models import KeyedVectors
from simple_elmo import ElmoModel

from nltk import word_tokenize
from pymorphy2 import MorphAnalyzer
from collections import Counter

from math import log
from statistics import mean
from random import choices

# Download the 'punkt' nltk package (no-op when it is already up to date).
nltk.download('punkt')

# Shared morphological analyzer; the feature functions below call m.parse(...)
# to obtain lemmas (normal_form) and part-of-speech tags.
m = MorphAnalyzer()

# Spell corrector used by if_mistake(). Fail loudly when the language model
# cannot be loaded instead of continuing with an unusable corrector.
corrector = jamspell.TSpellCorrector()
if not corrector.LoadLangModel('ru_small.bin'):
    raise RuntimeError("jamspell could not load the language model 'ru_small.bin'")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Show full cell contents when displaying sentences; None means "no limit"
# (the former -1 sentinel is deprecated and rejected by newer pandas).
pd.set_option('display.max_colwidth', None)

  """Entry point for launching an IPython kernel.


In [6]:
# Read the ';'-separated training corpus, discard rows with any missing value
# (evaluated while the 'Post tag' column is still present), then drop that column.
df = (
    pd.read_csv('train_corpus.csv', sep=';')
    .dropna()
    .drop(columns=['Post tag'])
)
print(f'Sentences in total: {len(df)}')
print(f'Ironic sentences: {df["Ironic"].value_counts()[1.0]}')
df.head(5)

Sentences in total: 8956
Ironic sentences: 1001


Unnamed: 0,Sentence,Ironic
0,У человека-летучей мыши обнаружен Covid-19,1.0
1,"Съемки нового фильма «Бэтмен» приостановлены из-за того, что исполнитель главной роли Роберт Паттинсон заразился коронавирусом, передает Liter.kz со ссылкой на издание Vanity Fair.",0.0
2,"В официальном заявлении киностудии Warner Bros. не уточняется, у кого из участников съемочной группы выявили положительный результат на COVID-19.",0.0
3,"Здесь лишь сообщили, что инфицированный находится в изоляции, а съемки на время приостановлены.",0.0
4,"По данным источника издания Vanity Fair, коронавирусом заразился Роберт Паттинсон, исполняющий главную роль в фильме.",0.0


### Irony markers extraction

In [7]:
def weirdness(target: list, contrast: list) -> dict:
    """Score each lemma of the target corpus by its relative over-representation.

    For every lemma found in ``target`` the score is its relative frequency in
    the target corpus divided by its relative frequency in the pooled
    (contrast + target) token counts.

    Args:
        target: collection of sentences (strings) forming the target corpus.
        contrast: collection of sentences (strings) forming the contrast corpus.

    Returns:
        Dict mapping lemma -> weirdness coefficient (float). Empty when the
        target corpus has no alphabetic tokens.
    """
    def lemma_counts(sentences):
        # Keep purely alphabetic tokens only, then count their normal forms.
        words = [word for sent in sentences for word in word_tokenize(sent) if word.isalpha()]
        return len(words), Counter(m.parse(word)[0].normal_form for word in words)

    len_target, target_freqs = lemma_counts(target)
    len_contrast, contrast_freqs = lemma_counts(contrast)
    pooled_size = len_contrast + len_target

    coefs = {}
    for word, in_target in target_freqs.items():
        # Counter returns 0 for lemmas that never occur in the contrast corpus.
        pooled_share = (contrast_freqs[word] + in_target) / pooled_size
        coefs[word] = (in_target / len_target) / pooled_share

    return coefs

In [8]:
def loglikelihood(target: list, contrast: list) -> dict:
    """Score each lemma of the target corpus with a log-likelihood statistic.

    For a lemma with observed counts ``a`` (target) and ``b`` (contrast) the
    expected counts are proportional to the corpus sizes, and the score is
    ``2 * (a * log(a / E_target) + b * log(b / E_contrast))``.

    Args:
        target: collection of sentences (strings) forming the target corpus.
        contrast: collection of sentences (strings) forming the contrast corpus.

    Returns:
        Dict mapping lemma -> log-likelihood score (float). Empty when the
        target corpus has no alphabetic tokens.
    """
    def lemma_counts(sentences):
        # Keep purely alphabetic tokens only, then count their normal forms.
        words = [word for sent in sentences for word in word_tokenize(sent) if word.isalpha()]
        return len(words), Counter(m.parse(word)[0].normal_form for word in words)

    def observed_term(observed, expected):
        # x * log(x / e) tends to 0 as x -> 0, so a lemma that is absent from
        # one corpus contributes nothing instead of failing on log(0).
        return observed * log(observed / expected) if observed else 0.0

    len_target, target_freqs = lemma_counts(target)
    len_contrast, contrast_freqs = lemma_counts(contrast)
    pooled_size = len_target + len_contrast

    coefs = {}
    for word, in_target in target_freqs.items():
        in_contrast = contrast_freqs[word]  # 0 when the lemma is missing there
        pooled_count = in_target + in_contrast
        d_targ = len_target * pooled_count / pooled_size
        d_contr = len_contrast * pooled_count / pooled_size
        coefs[word] = 2 * (observed_term(in_target, d_targ) + observed_term(in_contrast, d_contr))

    return coefs

In [9]:
# Target corpus: sentences labelled ironic.
target = df.loc[df['Ironic'] == 1, 'Sentence'].tolist()
# NOTE(review): the contrast corpus is built from every row of df, so it also
# contains the ironic sentences collected in `target` - confirm this overlap is intended.
contrast = df['Sentence'].tolist()

In [10]:
# Lemma -> weirdness coefficient for the ironic sentences; show the ten highest.
top_weird = weirdness(target, contrast)
Counter(top_weird).most_common(10)

[('копчик', 3.9704838336291384),
 ('накалиться', 3.9704838336291384),
 ('ежели', 3.9704838336291384),
 ('пытливый', 3.9704838336291384),
 ('знаток', 3.9704838336291384),
 ('позорный', 3.9704838336291384),
 ('пригвоздить', 3.9704838336291384),
 ('альтернативноодаренный', 3.9704838336291384),
 ('школоть', 3.9704838336291384),
 ('ничуть', 3.9704838336291384)]

In [11]:
# Lemma -> log-likelihood score for the ironic sentences; show the ten highest.
top_logl = loglikelihood(target, contrast)
Counter(top_logl).most_common(10)

[('https', 26.127949595009547),
 ('человек', 17.14162764118675),
 ('страна', 15.570015806263505),
 ('демократический', 13.701574411968817),
 ('ты', 13.170662824330812),
 ('чистый', 11.934076150707632),
 ('случайность', 11.113824330289322),
 ('режим', 10.916293929684276),
 ('яков', 10.249708432234861),
 ('свобода', 9.669731794700796)]

### Feature calculation

In [None]:
# Fixed seed so the balanced sample and its shuffle are identical on every run.
RANDOM_STATE = 42

# Balance the classes: keep every ironic sentence and draw the same number of
# non-ironic ones (derived from the data rather than hard-coded).
ironies = df.loc[df['Ironic'] == 1]
non_ironies = df.loc[df['Ironic'] == 0]
non_ironies = non_ironies.sample(len(ironies), random_state=RANDOM_STATE)
norm_df = pd.concat([ironies, non_ironies], sort=False, axis=0)
# Shuffle the rows so the two classes are interleaved, then renumber the index.
norm_df = norm_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
norm_df.head(10)

In [None]:
# top_weird / top_logl are dicts (lemma -> score). Rank them by score and keep
# the lemmas themselves; iterating the dict directly would only yield its keys.
weird_markers = [word for word, _ in Counter(top_weird).most_common(10)]

logl_markers = [word for word, _ in Counter(top_logl).most_common()]

# Words looked up by intj() among the tokens of a sentence.
interjs = ['бы', 'ах', 'эх', 'ой', 'ох', 'оу', 'угу', 'ага', 'вау', 'ха', 'ха-ха', 'ух']

# One quoted fragment per match: an opening « or ", then the shortest run of
# non-quote characters, then a closing » or ". Excluding quote characters from
# the captured group keeps several quoted fragments in one sentence separate.
pat = re.compile('[«"]([^«»"]+)[»"]')

In [None]:
def quotes(sent):
    """Return 1 if a quoted fragment of ``sent`` is a single lowercase-initial word, else 0.

    Fragments are the groups captured by the module-level regex ``pat``. A
    fragment qualifies when its first character is lowercase and it contains
    no space.
    """
    for fragment in re.findall(pat, sent):
        starts_lower = fragment[0].islower()
        if starts_lower and ' ' not in fragment:
            return 1
    return 0

In [None]:
def exclam(sent):
    """Return the number of exclamation marks in ``sent``."""
    mark = '!'
    occurrences = sent.count(mark)
    return occurrences

In [None]:
def quest(sent):
    """Return the number of question marks in ``sent``."""
    mark = '?'
    occurrences = sent.count(mark)
    return occurrences

In [None]:
def mult_br(sent):
    """Return 1 if ``sent`` contains a doubled bracket ('))' or '(('), else 0."""
    doubled = ('))', '((')
    return 1 if any(pair in sent for pair in doubled) else 0

In [None]:
def intj(sent):
    """Return 1 if ``sent`` contains any word from ``interjs``, else 0.

    Tokens are compared case-insensitively, so a capitalised interjection at
    the start of a sentence matches as well as all-lowercase or all-uppercase
    spellings.
    """
    tokens = {token.lower() for token in word_tokenize(sent)}
    return int(any(word.lower() in tokens for word in interjs))

In [None]:
def weird_mrkers(sent):
    """Return 1 if a lemma of ``sent`` is listed in ``weird_markers``, else 0.

    Only purely alphabetic tokens are lemmatised (first parse of the
    module-level analyzer ``m``).
    """
    sentence_lemmas = []
    for token in word_tokenize(sent):
        if token.isalpha():
            sentence_lemmas.append(m.parse(token)[0].normal_form)
    return 1 if any(marker in sentence_lemmas for marker in weird_markers) else 0

In [None]:
def if_mistake(sent):
    """Count the tokens of ``sent`` that the spell corrector changed.

    Returns 0 when the corrected text equals the input. Otherwise the original
    and corrected token sequences are compared position by position; zip stops
    at the shorter sequence, so surplus tokens of the longer one are ignored.
    """
    fixed = corrector.FixFragment(sent)
    if sent == fixed:
        return 0
    token_pairs = zip(word_tokenize(sent), word_tokenize(fixed))
    return sum(1 for before, after in token_pairs if before != after)

In [None]:
# Vocabulary table; contrast_tone() reads its 'term' and 'tag' columns.
concat_vocab = pd.read_csv('concat_vocab.csv', sep=';', encoding='UTF-8')
# Plain list of the terms, used for membership checks.
wordlist = concat_vocab['term'].to_list()
print(len(wordlist))

In [None]:
def contrast_tone(sent, flag):
    """Detect co-occurring 'PSTV' and 'NGTV' vocabulary tags in a sentence.

    Every alphabetic token is lemmatised and mapped to the tag of the first
    matching row of ``concat_vocab`` ('-' when the lemma is not in ``wordlist``).

    Args:
        sent: sentence text.
        flag: 'pos_and_neg' - 1 if both tags occur anywhere in the sentence;
              'pos_near_neg' - 1 if the two tags occur on adjacent tokens
              (in either order).

    Returns:
        1 or 0 for the two supported flags; None for any other flag value.
    """
    def tag_for(token):
        lemma = m.parse(token)[0].normal_form
        if lemma not in wordlist:
            return '-'
        matching_rows = concat_vocab.loc[concat_vocab['term'] == lemma]
        return list(matching_rows['tag'])[0]

    tags = [tag_for(token) for token in word_tokenize(sent) if token.isalpha()]

    if flag == 'pos_and_neg':
        return 1 if ('PSTV' in tags and 'NGTV' in tags) else 0

    if flag == 'pos_near_neg':
        opposite = (('PSTV', 'NGTV'), ('NGTV', 'PSTV'))
        for neighbours in zip(tags, tags[1:]):
            if neighbours in opposite:
                return 1
        return 0

In [None]:
# Load binary word2vec-format vectors from 'model.bin'; skipgram_vec() looks
# entries up under '<lemma>_<POS>' keys.
model_sg = KeyedVectors.load_word2vec_format('model.bin', binary=True)

In [None]:
def skipgram_vec(sent):
    """Return the largest distance between the sentence centroid and one of its words.

    Nouns and verbs of ``sent`` are lemmatised and looked up in ``model_sg``
    under '<lemma>_<POS>' keys; the centroid is the mean of the vectors found.

    Returns:
        abs() of the maximum value returned by ``model_sg.distances`` between
        the centroid and the found keys (presumably cosine distances - confirm
        against the gensim docs), or 0 when no noun/verb of the sentence is in
        the model.
    """
    tokens = []
    for word in word_tokenize(sent):
        if not word.isalpha():
            continue
        result = m.parse(word)[0]
        pos = result.tag.POS
        if pos == 'NOUN' or pos == 'VERB':
            key = f'{result.normal_form}_{pos}'
            # Membership test on the model itself works on both gensim 3.x and
            # 4.x, whereas the `.vocab` attribute no longer exists in 4.x.
            if key in model_sg:
                tokens.append(key)
    if not tokens:
        return 0

    center = np.mean([model_sg[t] for t in tokens], axis=0)
    max_d = max(model_sg.distances(center, tokens))
    return abs(max_d)

In [None]:
# Create the simple_elmo model wrapper and load the model archive '199.zip';
# elmo_vec() uses it to embed lemmas.
model_elmo = ElmoModel()
model_elmo.load('199.zip')

In [None]:
def elmo_vec(sent):
    """Return 1 minus the lowest cosine similarity between the ELMo centroid and a word.

    Lemmas of the nouns, verbs and full adjectives of ``sent`` are embedded
    with ``model_elmo``; the centroid is the mean of the returned vectors.

    Returns:
        ``1 - min(similarity)`` over the returned vectors, or 0 when the
        sentence has no noun/verb/adjective.
    """
    tokens = []
    for word in word_tokenize(sent):
        if not word.isalpha():
            continue
        result = m.parse(word)[0]
        if result.tag.POS in ('NOUN', 'VERB', 'ADJF'):
            tokens.append(result.normal_form)
    if not tokens:
        return 0

    # presumably yields one vector per entry of `tokens` - TODO confirm against simple_elmo
    vecs = model_elmo.get_elmo_vector_average(tokens)
    center = np.mean(vecs, axis=0)

    # The similarities are computed once; the position of the minimum is not needed.
    min_sim = min(model_sg.cosine_similarities(center, vecs))
    return 1 - min_sim

### Main corpus (2 002 sentences)

In [None]:
# (column name, extractor, extra keyword arguments) - applied in this order so
# the feature columns are appended to norm_df in the same sequence.
train_feature_specs = [
    ('Quotes', quotes, {}),
    ('Exclamation', exclam, {}),
    ('Question', quest, {}),
    ('Multiple brackets', mult_br, {}),
    ('Interjections', intj, {}),
    ('Weird markers', weird_mrkers, {}),
    ('Mistakes', if_mistake, {}),
    ('P&N', contrast_tone, {'flag': 'pos_and_neg'}),
    ('P near N', contrast_tone, {'flag': 'pos_near_neg'}),
    ('Max vec dist', elmo_vec, {}),
    ('Max vec sg dist', skipgram_vec, {}),
]
for column, extractor, extra in train_feature_specs:
    norm_df[column] = norm_df['Sentence'].apply(extractor, **extra)

In [None]:
# Persist the training sentences with their feature columns as ';'-separated
# CSV, without writing the DataFrame index.
norm_df.to_csv('train_features.csv', index=False, sep=';')

### Test corpus (100 sentences)

In [12]:
# Read the ';'-separated test corpus and report its size and ironic share.
test_df = pd.read_csv('test_corpus.csv', sep=';')
test_label_counts = test_df["Ironic"].value_counts()
print(f'Sentences in total: {len(test_df)}')
print(f'Ironic sentences: {test_label_counts[1.0]}')
test_df.head(5)

Sentences in total: 100
Ironic sentences: 30


Unnamed: 0,Sentence,Ironic
0,Можете спать спокойно — террористы не пройдут!,1
1,"Во всём мире люди очень боятся терактов, даже произнесение слова ""террорист"" у многих вызывает панику.",0
2,Но у меня для вас прекрасные новости из Кургана!,1
3,Там наконец пришли к решению этой глобальной проблемы.,1
4,Теперь можно спать спокойно.,1


In [None]:
# (column name, extractor, extra keyword arguments) - applied in this order so
# the feature columns are appended to test_df in the same sequence.
test_feature_specs = [
    ('Quotes', quotes, {}),
    ('Exclamation', exclam, {}),
    ('Question', quest, {}),
    ('Multiple brackets', mult_br, {}),
    ('Interjections', intj, {}),
    ('Weird markers', weird_mrkers, {}),
    ('Mistakes', if_mistake, {}),
    ('P&N', contrast_tone, {'flag': 'pos_and_neg'}),
    ('P near N', contrast_tone, {'flag': 'pos_near_neg'}),
    ('Max vec dist', elmo_vec, {}),
    ('Max vec sg dist', skipgram_vec, {}),
]
for column, extractor, extra in test_feature_specs:
    test_df[column] = test_df['Sentence'].apply(extractor, **extra)

In [None]:
# Persist the test sentences with their feature columns as ';'-separated CSV,
# without writing the DataFrame index.
test_df.to_csv('test_features.csv', index=False, sep=';')