# Create a dataset from the collected data

In [1]:
import json
import os
import pickle
import re

import pandas as pd

In [2]:
# Directory holding the collected citaty.info *.txt files that are read below;
# the pickles produced by this notebook are written to the same directory.
ci_path = '../results/citaty_info/qbq'
# ci_path = '../results/citaty_info'

### Read quotes to list

In [3]:
# Load every quote from the line-delimited JSON files found in `ci_path`.
quotes = []

# Sort the listing so that the load order (and therefore which duplicate
# survives the later drop_duplicates) does not depend on the filesystem.
for file in sorted(os.listdir(ci_path)):
    if not file.endswith(".txt"):
        continue
    # The files hold Cyrillic text; read them as UTF-8 explicitly instead of
    # relying on the platform default encoding (presumably UTF-8 — confirm
    # against the collector that wrote them).
    with open(os.path.join(ci_path, file), 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines, e.g. a trailing newline at the end of a file.
            if line.strip():
                quotes.append(json.loads(line))

### Create dataset

In [4]:
# One row per loaded quote; the columns come from the keys of the JSON objects.
df = pd.DataFrame(quotes)

In [5]:
# Sanity check: (number of rows, number of columns).
df.shape

(438085, 10)

In [6]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,link,text,references,tags,rating,rating_positive,rating_negative,submitted_by,submitted_date,comments_count
0,https://citaty.info/quote/130001,— В моём кабинете всё ещё идет дождь . Уже два...,{'Цитата из фильма': 'Гарри Поттер и Дары смер...,[смешные цитаты],124,129,5,JustRevenge,05.05.11 в 20:16,5
1,https://citaty.info/quote/130002,Джульетта была моего возраста: она успела влюб...,"{'Цитата из сериала': 'Моя прекрасная няня', '...",[секс],29,38,9,дыши со мной,05.05.11 в 20:16,0
2,https://citaty.info/quote/130003,"— Боже мой, что же мне делать! Моя жена совсем...",{'Цитата из фильма': 'Гарри Поттер и Дары смер...,[смешные цитаты],172,181,9,JustRevenge,05.05.11 в 20:18,8
3,https://citaty.info/quote/130004,Мутацию в твоем случае корректнее было бы назы...,{},"[Автор неизвестен, мутация]",4,5,1,Varela,05.05.11 в 20:34,0
4,https://citaty.info/quote/130005,"Боже, ну зачем тебя я встретил? Лучше бы я шел...","{'Исполнитель': 'R.Mike', 'Песня': 'Безумная'}",[любовь],3,5,2,Lerya Nolwenn.,05.05.11 в 20:34,0


### Normalize id

In [7]:
def get_quote_id(link):
    """Return the part of ``link`` that follows its last '/' (the quote id)."""
    _, _, quote_id = link.rpartition('/')
    return quote_id

In [8]:
# Derive an int32 id from every link and keep only the first row seen per id.
link_ids = df['link'].map(get_quote_id).astype('int32')
df = df.assign(link_id=link_ids).drop_duplicates(subset=['link_id'])

In [9]:
# Index the frame by quote id; the original link column is no longer needed.
df = df.set_index('link_id').drop(columns=['link'])

In [10]:
# Order the rows by quote id.
df = df.sort_index()

### Check if data are correct

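The cell below should print nothing: it reports every pair of consecutive quote ids that are more than 100 apart, which would indicate a missing range of quotes.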

In [11]:
# Compare every id with its successor and print the pairs that are more than
# 100 apart.
ids = df.index.values
for previous_id, current_id in zip(ids[:-1], ids[1:]):
    if current_id - previous_id > 100:
        print(previous_id, current_id)

### Check for references keys

In [12]:
# Collect the keys of every `references` dict, then show the distinct ones.
keys = [key for refs in df.references for key in refs.keys()]

set(keys)

{'Автор цитаты',
 'Исполнитель',
 'Песня',
 'Самиздат',
 'Цитата из аниме',
 'Цитата из игры',
 'Цитата из книги',
 'Цитата из комикса',
 'Цитата из мультфильма',
 'Цитата из сериала',
 'Цитата из спектакля',
 'Цитата из телешоу',
 'Цитата из фильма',
 'Цитируемый персонаж'}

In [13]:
# Maps each Russian key found in the `references` dicts to the English name of
# the column that will hold its value.
new_keys = {
    'Автор цитаты': 'author',
    'Исполнитель': 'performer',
    'Песня': 'song',
    'Самиздат': 'samizdat',
    'Цитата из аниме': 'anime',
    'Цитата из игры': 'game',
    'Цитата из книги': 'book',
    'Цитата из комикса': 'comic',
    'Цитата из мультфильма': 'cartoon',
    'Цитата из сериала': 'serial',
    'Цитата из спектакля': 'play',
    'Цитата из телешоу': 'tv',
    'Цитата из фильма': 'movie',
    'Цитируемый персонаж': 'character'
}

In [14]:
# Create one column per English reference name, initially filled with None.
for column_name in new_keys.values():
    df[column_name] = None

In [15]:
# Copy every entry of a row's `references` dict into the matching column.
for ind, row in df.iterrows():
    for source_key, value in row['references'].items():
        df.at[ind, new_keys[source_key]] = value

In [16]:
# The dict column has been expanded into separate columns; drop it.
df = df.drop(columns=['references'])

### Fix rating data

In [17]:
# Coerce non-numeric ratings to NaN, replace them with 0 and store as int32.
# The result is assigned back explicitly: `df.rating.fillna(0, inplace=True)`
# is chained assignment with an inplace method, which under pandas
# Copy-on-Write only modifies a temporary Series, leaving NaN in the frame and
# making the int32 cast fail.
df['rating'] = (
    pd.to_numeric(df['rating'], errors='coerce')
    .fillna(0)
    .astype('int32')
)

In [18]:
# Coerce non-numeric positive-vote counts to NaN, replace them with 0 and store
# as int32. Assigned back explicitly because an inplace fillna on
# `df.rating_positive` is chained assignment, which under pandas Copy-on-Write
# does not write through to the frame.
df['rating_positive'] = (
    pd.to_numeric(df['rating_positive'], errors='coerce')
    .fillna(0)
    .astype('int32')
)

In [19]:
# Coerce non-numeric negative-vote counts to NaN, replace them with 0 and store
# as int32. Assigned back explicitly because an inplace fillna on
# `df.rating_negative` is chained assignment, which under pandas Copy-on-Write
# does not write through to the frame.
df['rating_negative'] = (
    pd.to_numeric(df['rating_negative'], errors='coerce')
    .fillna(0)
    .astype('int32')
)

In [20]:
# Where rating and rating_positive are both non-zero but rating_negative is
# zero, set rating_negative to rating_positive - rating.
needs_backfill = (
    (df['rating'] != 0)
    & (df['rating_positive'] != 0)
    & (df['rating_negative'] == 0)
)
df.loc[needs_backfill, 'rating_negative'] = (
    df.loc[needs_backfill, 'rating_positive'] - df.loc[needs_backfill, 'rating']
)

### Fix submitted_date

In [21]:
def parse_dtime(dtime_str):
    """Keep only digits, dots, colons and spaces of ``dtime_str``.

    Every other character is dropped; the surviving characters keep their
    original order.
    """
    allowed = re.compile(r'[\d\.\: ]')
    kept = [char for char in dtime_str if allowed.match(char)]
    return "".join(kept)

In [22]:
# Values look like "05.05.11 в 20:16"; parse_dtime strips the letters first.
# NOTE(review): the dates are presumably day.month.year (Russian convention) —
# confirm. Without dayfirst=True an ambiguous value such as "05.06.11" is read
# month-first, i.e. as 6 May instead of 5 June.
df['submitted_date'] = pd.to_datetime(
    df['submitted_date'].apply(parse_dtime),
    dayfirst=True,
)

### Convert tags to lowercase

In [23]:
def create_list_column(tags):
    """Return the lower-cased items of ``tags`` without 'автор неизвестен'."""
    lowered_tags = []
    for item in tags:
        lowered = item.lower()
        if lowered == 'автор неизвестен':
            continue
        lowered_tags.append(lowered)
    return lowered_tags

In [24]:
# Lower-case the tags of every quote.
df['tags'] = df['tags'].map(create_list_column)

### Save raw data

In [25]:
# Persist the cleaned frame before the task-specific processing below.
raw_pickle_path = os.path.join(ci_path, 'raw_ci.pickle')
with open(raw_pickle_path, 'wb') as raw_file:
    pickle.dump(df, raw_file)

## Prepare data for Sarcasm/Irony Detection task
### Delete extra brackets

In [26]:
def del_square_brackets(text):
    '''Delete every "[ ... ]" and "[ ... ]:" group from ``text``.

    A group is a pair of square brackets enclosing at least one character
    that is not itself a square bracket. An optional colon directly after the
    group and any whitespace that follows are removed as well.

    The character class no longer contains a stray "^": inside a class a
    second "^" is a literal, so groups containing "^" used to be skipped.
    '''
    return re.sub(r'\[[^\]\[]+\]\:?\s*', '', text)

In [27]:
def del_round_brackets(text):
    '''Delete "( ... )" groups at the end of ``text``.

    Only matches when the groups are preceded by sentence punctuation
    (".", "!" or "?"): that punctuation, the whitespace after it and one or
    more trailing parenthesised groups are replaced by a single ".".
    Text without such a tail is returned unchanged.

    The character class no longer contains a stray "^": inside a class a
    second "^" is a literal, so groups containing "^" used to be skipped.
    '''
    return re.sub(r'[\.\!\?]+\s*(\([^\)\(]+\)\.*\s*)+$', '.', text)

In [28]:
# Strip "[ ... ]" groups from every quote.
df['text'] = df['text'].map(del_square_brackets)

In [29]:
# Strip trailing "( ... )" groups from every quote.
df['text'] = df['text'].map(del_round_brackets)

### Mark all dialogs

In [30]:
def is_dialog(text):
    """Return 1 if ``text`` looks like a dialog, otherwise 0.

    A text counts as a dialog when it starts with one of four dash-like
    characters and contains that same character at least twice.
    """
    for dash in ('—', '–', '−', '-'):
        if text.startswith(dash) and text.count(dash) >= 2:
            return 1
    return 0

In [31]:
# Flag every quote whose text looks like a dialog.
df['is_dialog'] = df['text'].map(is_dialog)

### Check for list data

In [32]:
# Print the name of every column that holds at least one Python list value.
for column in df.columns:
    if any(type(value) is list for value in df[column]):
        print(column)

tags
author
performer
character


### Normalize columns

In [33]:
def normalize_column(tag):
    """Return ``tag`` as a list of whitespace-normalised strings.

    * list -> its items with runs of whitespace collapsed to single spaces,
      duplicates removed, first-occurrence order kept;
    * str  -> a one-element list with the same whitespace normalisation;
    * anything else (e.g. None) -> an empty list.

    Duplicates are removed with ``dict.fromkeys`` rather than ``set`` so the
    item order is deterministic: string hashing is randomised per process,
    which made the order of the resulting lists differ between runs.
    """
    if isinstance(tag, list):
        collapsed = (' '.join(item.split()) for item in tag)
        return list(dict.fromkeys(collapsed))
    if isinstance(tag, str):
        # Apply the same whitespace normalisation as for list items.
        return [' '.join(tag.split())]
    return []

In [34]:
# Bring the tags column to the common list form.
df['tags'] = df['tags'].map(normalize_column)

In [35]:
# Bring the author column to the common list form.
df['author'] = df['author'].map(normalize_column)

In [36]:
# Bring the performer column to the common list form.
df['performer'] = df['performer'].map(normalize_column)

In [37]:
# Bring the character column to the common list form.
df['character'] = df['character'].map(normalize_column)

### Resolve ambiguity for character

In [38]:
def is_character_in_text(text, characters):
    """Return True if any name in ``characters`` occurs in ``text``.

    The comparison is a case-insensitive substring check.
    """
    lowered_text = text.lower()
    return any(name.lower() in lowered_text for name in characters)

In [39]:
# A quote with several characters, at least one of whom is named in the text,
# is treated as a dialog.
for ind, row in df.iterrows():
    if len(row.character) > 1 and is_character_in_text(row.text, row.character):
        df.at[ind, 'is_dialog'] = 1

### Get data with specific tags

In [40]:
def get_tags_by_topic(tag, topic_list):
    """Return True if ``tag`` and ``topic_list`` share at least one item."""
    shared = set(tag) & set(topic_list)
    return bool(shared)

In [41]:
# Tags that mark a quote as a positive example; compared against each quote's
# tags when the target column is built.
labels = [
    'ирония', 'ироничные цитаты', 'самоирония', 
    'сарказм', 'саркастичные цитаты',
    # Additional tags
    'насмешки', 'издевательство', 'сатира', 
    'остроумие', 'черный юмор',
]

In [42]:
# True for every quote that carries at least one of the target labels.
df['target'] = df['tags'].map(
    lambda quote_tags: get_tags_by_topic(quote_tags, labels)
)

In [43]:
# Replace missing values with 0 and store the target as an integer.
df['target'] = pd.to_numeric(
    df['target'].fillna(0), errors='coerce'
).astype(int)

In [44]:
# Number of positive examples (rows with target == 1) and number of columns.
df[df.target == 1].shape

(29982, 24)

### Delete extra columns

In [45]:
# Use the performer list as the author wherever the author list is empty.
df['author'] = df.apply(
    lambda row: row.author or row.performer,
    axis=1
)

In [46]:
# The performer has been folded into the author column above.
df = df.drop(columns=['performer'])

In [47]:
def get_source(row):
    """Return the first non-empty source attribute of ``row``, or None.

    The attributes are checked in this order: anime, book, cartoon, comic,
    game, movie, play, samizdat, serial, song, tv.
    """
    source_fields = (
        'anime', 'book', 'cartoon', 'comic', 'game', 'movie',
        'play', 'samizdat', 'serial', 'song', 'tv',
    )
    for field in source_fields:
        value = getattr(row, field)
        if value:
            return value
    return None

In [48]:
# Collapse the per-medium columns into a single source column.
df['source'] = df.apply(get_source, axis=1)

In [49]:
# The per-medium columns are now represented by `source`.
medium_columns = [
    'song', 'samizdat', 'anime', 'game',
    'book', 'comic', 'cartoon', 'serial',
    'play', 'tv', 'movie',
]
df = df.drop(columns=medium_columns)

### Delete extra symbols from columns (author, character, source)

In [50]:
def del_extra_symbols(text):
    '''Delete a trailing "( ... )" remark from ``text``.

    Removes the last parenthesised group at the end of the string together
    with the whitespace around it. The closing parenthesis is optional, so an
    unclosed trailing "( ..." is removed too.

    Returns None when ``text`` is empty or None.

    The character class no longer contains a stray "^": inside a class a
    second "^" is a literal, so remarks containing "^" used to be kept.
    '''
    if not text:
        return None
    return re.sub(r'\s*\([^\)\(]+\)*\s*$', '', text)

In [51]:
# Strip trailing "( ... )" remarks from the source names.
df['source'] = df['source'].map(del_extra_symbols)

In [52]:
for ind, row in df.iterrows():
    df.at[ind, 'author'] = [del_extra_symbols(author) for author in row.author]

In [53]:
for ind, row in df.iterrows():
    df.at[ind, 'character'] = [del_extra_symbols(character) for character in row.character]

In [54]:
# The cleaned text column is saved under the name "quote" in the final dataset.
df = df.rename(columns={"text": "quote"})

## Save data

In [55]:
# Persist the final frame next to the input files.
final_pickle_path = os.path.join(ci_path, 'ci.pickle')
with open(final_pickle_path, 'wb') as final_file:
    pickle.dump(df, final_file)