In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data reading

In [2]:
# Load the semicolon-separated discussion export and store the broad
# category labels as a pandas categorical column.
dataset_path = 'data/discussion_data.csv'
df = pd.read_csv(dataset_path, sep=';')
df['CategoryBroad'] = df['CategoryBroad'].astype('category')
df

Unnamed: 0,School,Cohort,Book ID,Topic,Bookclub,User ID,Name,Message,Translation,Message Time,Is Answer,Page,Book relevance,Type,Category,CategoryBroad
0,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo se pogovarjati,Let's talk,2019-06-18 05:16:16 AM,No,4,No,S,CE,C
1,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kip je to,This is a statue,2019-06-18 05:17:29 AM,No,4,No,S,CO,C
2,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kdo je to jaz sem tara,Who is this I am Tara (girl's name),2019-06-18 05:17:59 AM,No,4,No,Q,IQ,I
3,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,kaj kip,what statue,2019-06-18 05:18:58 AM,No,4,No,S,CO,C
4,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo ven,let's go outside,2019-06-18 05:19:24 AM,No,4,No,S,CO,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3261,OŠ Koseze,Koseze,2,"Mojca je bila prepričana, da je z njo ženska i...",Knjižni Klub 8,676,karin japelj K,"Jaz mislim,da ne,ker je Pehta zlobna.","I don't think so, because Pehta is mena.",12.3.2019 5:51,No,5,Yes,A,DA,D
3262,OŠ Koseze,Koseze,2,"Mojca je bila prepričana, da je z njo ženska i...",Knjižni Klub 8,726,timskander,"Jaz mislim da ne, ker je zlobna.","I don't think so, because she is mean.",12.3.2019 5:53,No,14,Yes,A,DA,D
3263,OŠ Koseze,Koseze,2,"Mojca je bila prepričana, da je z njo ženska i...",Knjižni Klub 8,676,karin japelj K,itak,Of course.,12.3.2019 6:24,No,14,Yes,A,DA,D
3264,OŠ Koseze,Koseze,2,"Mojca je bila prepričana, da je z njo ženska i...",Knjižni Klub 8,676,karin japelj K,oops,oops,12.3.2019 6:25,No,14,No,S,CO,C


In [3]:
df.Message = df.Message.fillna('')

## Message parsing

### Tokenization

In [4]:
import re

from nltk import word_tokenize
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize.destructive import NLTKWordTokenizer

In [5]:
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)

In [6]:
# Split punctuation without space ('neki.halo' -> 'neki', '.', 'halo')
def split_punc(tokens):
    """Detach dots that glue two words together inside a single token.

    A token is split only when it starts with ``word.word`` (the same
    condition the previous implementation used to decide whether to split).
    When it does, every dot that sits directly between two word characters
    becomes its own token and all remaining characters are kept, so
    'a.b.c' -> 'a', '.', 'b', '.', 'c' instead of silently losing '.c'.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list of tokens; tokens that do not start with ``word.word``
        are passed through unchanged.
    """
    tokens_out = []
    for token in tokens:
        if re.match(r'\w+\.\w', token) is None:
            tokens_out.append(token)
            continue

        # The capturing group makes re.split keep the '.' separators, and the
        # look-arounds leave the neighbouring word characters untouched.
        tokens_out.extend(re.split(r'(?<=\w)(\.)(?=\w)', token))

    return tokens_out

# Split numbers to separate tokens ('username15halo' -> 'username', '15', 'halo')
def split_num(tokens):
    """Break every token into alternating runs of digits and non-digits.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new flat list of the digit / non-digit runs, in order. An empty
        string token produces no runs and therefore disappears from the result.
    """
    # Raw string: '\d' and '\D' are invalid escapes in a plain string literal.
    return [run for token in tokens for run in re.findall(r'\d+|\D+', token)]

In [7]:
def tokenize(message):
    """Turn one raw message string into a list of tokens.

    The message goes through the module-level ``tokenizer`` first, then
    ``split_punc`` detaches dots glued between words, and finally
    ``split_num`` separates digit runs from the surrounding text.
    """
    raw_tokens = tokenizer.tokenize(message)
    return split_num(split_punc(raw_tokens))

In [148]:
messages = df.Message
messages.iloc[40:60]

40                                                nevem
41                                           Kaj ne ves
42                              A splotches ves kdo sem
43    <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<...
44                                                 Nehi
45    llllllllllllllllllllllllllllllllllllllllllllll...
46                                                   ??
47                                         tara neni ti
48                                            Ne napacn
49                                                  vem
50                                          kaj mislite
51                                                nevem
52                                             odgovori
53                             ker je tako boljzanimivo
54                                     kaj mislis s tem
55                                               ker je
56                                       Kako to mislis
57                                            kd

In [9]:
i=67
print(tokenize(messages[i]))

['ups', '.', 'mislila', 'sem', 'kdo', 'jst', '.']


In [10]:
conversations = [tokenize(message) for message in messages]

In [147]:
conversations[40:60]

[['nev'],
 ['kaj'],
 ['splotches', 'kdo'],
 ['<', '<', '<'],
 ['neh'],
 ['lllooojjje', '<', '<', '<'],
 ['?', '?'],
 ['tar', 'nen'],
 ['napacn'],
 ['vedeti'],
 ['kaj', 'misliti'],
 ['nev'],
 ['odgovor'],
 ['boljzanimiv'],
 ['kaj', 'mislisa', 'ta'],
 [],
 ['kak', 'mislisa'],
 ['kdo'],
 ['jst'],
 ['natalio', 'vedeti']]

### Stop words removal

In [12]:
class StopWordsRemover():
    """Filters tokens against a stop-word list loaded from a text file."""

    def __init__(self, stop_words_txt, encoding='utf-8'):
        """Load the stop words, one per line (surrounding whitespace is stripped).

        Args:
            stop_words_txt: path of the stop-word file.
            encoding: text encoding of the file. Given explicitly so that
                non-ASCII words are read the same way on every platform
                instead of depending on the locale default.
        """
        with open(stop_words_txt, 'r', encoding=encoding) as file:
            self.stopwords = {line.strip() for line in file}

    def remove_stopwords(self, tokens):
        """Return a new list with every token found in the stop-word set dropped."""
        filtered = [token for token in tokens if token not in self.stopwords]
        return filtered

In [13]:
stopwords_remover = StopWordsRemover('data/stopwords-sl-custom.txt')

In [14]:
tokens = conversations[21]
tokens, stopwords_remover.remove_stopwords(tokens)

(['zakaj', 'to', 'pises'], ['zakaj', 'pises'])

In [15]:
conversations = [stopwords_remover.remove_stopwords(tokens) for tokens in conversations]

In [16]:
conversations[1010]

['poklicati', 'telefonu']

### Lemmatization

In [17]:
import lemmagen.lemmatizer
from lemmagen.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

In [18]:
conversations = [[lemmatizer.lemmatize(token) for token in message] for message in conversations]

In [19]:
conversations[:100]

[['iti', 'pogovarjati'],
 ['kip'],
 ['kdo', 'tar'],
 ['kaj', 'kip'],
 ['iti', 'vena'],
 ['kam'],
 ['park'],
 ['zadusiti', 'drug'],
 ['park', 'kodeljev'],
 ['kdo', 'pis', 'moči'],
 ['gregr'],
 ['gregor'],
 ['behinda'],
 ['neh'],
 ['bols', 'poves', 'kdo'],
 ['tar'],
 ['ooonnnee'],
 ['ok'],
 ['zzzaaakkkaaajjje'],
 ['reklo', 'pogledati', 'ta', 'zbezati'],
 ['vvvtttfff'],
 ['zakaj', 'pises'],
 ['kdo', 'mumada', '24'],
 ['ubiti'],
 ['magoca', 'postati', 'prevec', 'nasilen', '?', '?'],
 ['ja', 'nasilen'],
 ['napa', 'is', 'kaj'],
 ['kaj'],
 ['upati', 'nov', 'naslednji', 'kniga', 'stup', 'cav'],
 ['hus', 'uijkh'],
 [],
 ['gregor'],
 ['ohč',
  '3',
  '.',
  'hizti',
  '.',
  'zzz',
  'wehrw',
  'ebzztdftzdztdtziztdti',
  '45',
  'wweddfdfffdfbfbdrez',
  '5',
  'z',
  'rt',
  'fg',
  'rtrthfhgf',
  'fthfhfthfthfthfthfthfhfghfthzfrtje',
  'gz---aw',
  'wawdwawsdawd',
  'trrreo',
  'daws',
  'aadw',
  'a',
  'wdawsdasdssawww',
  'fff',
  'w',
  'awdadawdwdawdawdadjadvjawgdagdhasxbasjkhdkawbkjasgdaw

### Diacritics removal

In [20]:
import re

class RoofRemoval():
    """Replaces the lowercase letters ž, č, š, ć and đ with plain ASCII text."""

    def __init__(self):
        # Source character -> ASCII replacement (note that 'đ' expands to two letters).
        self.replacements = dict([('ž', 'z'),('č', 'c'),('š', 's'),('ć', 'c'),('đ', 'dz')])
        # Escaping belongs on the pattern side: the keys are matched literally
        # and the matched text can then be used as the dictionary key directly.
        self.pattern = re.compile("|".join(re.escape(key) for key in self.replacements))

    def remove(self, token):
        """Return ``token`` with every character listed in ``replacements`` substituted."""
        return self.pattern.sub(lambda match: self.replacements[match.group(0)], token)

In [21]:
roof_removal = RoofRemoval()

In [22]:
roof_removal.remove('test šumnikov. da vidimo če pobere čšž?')

'test sumnikov. da vidimo ce pobere csz?'

In [23]:
conversations = [[roof_removal.remove(token) for token in message] for message in conversations]

### Error correction

In [107]:
from nltk.metrics.distance import edit_distance
import difflib

class SpellingCorrection():
    """Replaces tokens by their closest dictionary word using difflib similarity."""

    def __init__(self, dict_txt, roof_removal=None, encoding='utf-8'):
        """Load the dictionary, one word per line.

        Args:
            dict_txt: path of the dictionary file.
            roof_removal: optional object with a ``remove(str)`` method; when
                given, it is applied to every dictionary word while loading.
            encoding: text encoding of the dictionary file. Given explicitly
                so the result does not depend on the platform default.
        """
        with open(dict_txt, 'r', encoding=encoding) as file:
            dictionary = (line.strip() for line in file)

            if roof_removal is not None:
                dictionary = (roof_removal.remove(w) for w in dictionary)

            self.dictionary = list(dictionary)

        # Set view of the dictionary for constant-time exact-match checks.
        self._known_words = set(self.dictionary)
        self.roof_removal = roof_removal

    def find_close(self, token, n=1, cutoff=0.9):
        """Return up to ``n`` ``(word, similarity)`` pairs with similarity >= ``cutoff``.

        Candidates come from ``difflib.get_close_matches``; the similarity is
        ``difflib.SequenceMatcher(None, token, word).ratio()``.
        """
        close_matches = difflib.get_close_matches(token, self.dictionary, n=n, cutoff=cutoff)
        return [(w, difflib.SequenceMatcher(None, token, w).ratio()) for w in close_matches]

    def replace_if_close(self, token, thresh=0.9, verbose=True):
        """Return the best dictionary match for ``token``, or ``token`` if none is close enough.

        Args:
            token: the token to correct.
            thresh: minimum similarity a dictionary word needs to replace the token.
            verbose: when true, print every replacement that actually changes the token.
        """
        # A token that is already a dictionary word is its own best match
        # (similarity 1.0), so the scan over the whole dictionary can be skipped.
        if token in self._known_words:
            return token

        res = self.find_close(token, n=1, cutoff=thresh)
        if len(res) < 1:
            return token

        word, sim = res[0]
        if verbose and sim < 1:
            print(f'{token} -> {word} ({sim})')

        return word

In [108]:
spelling_correction = SpellingCorrection('data/dict-sl.txt', roof_removal=roof_removal)

In [109]:
spelling_correction.replace_if_close('avtobrus')

avtobrus -> avtobus (0.9333333333333333)


'avtobus'

In [29]:
from tqdm.auto import tqdm

In [30]:
conversations_corr = [[spelling_correction.replace_if_close(token) for token in tokens] for tokens in tqdm(conversations)]

HBox(children=(FloatProgress(value=0.0, max=3266.0), HTML(value='')))

gregr -> gregor (0.9090909090909091)
pises -> prises (0.9090909090909091)
kniga -> knjiga (0.9090909090909091)
utilo -> cutilo (0.9090909090909091)
napacn -> napacen (0.9230769230769231)
jurje -> jurjev (0.9090909090909091)
jurje -> jurjev (0.9090909090909091)
napisan -> nadpisan (0.9333333333333333)
plaziti -> vplaziti (0.9333333333333333)
jurje -> jurjev (0.9090909090909091)
jurje -> jurjev (0.9090909090909091)
odgovpor -> odgovor (0.9333333333333333)
obcutka -> obcutkar (0.9333333333333333)
pocutiti -> ocutiti (0.9333333333333333)
adrian -> arian (0.9090909090909091)
natas -> naftas (0.9090909090909091)
primeti -> primreti (0.9333333333333333)
primeti -> primreti (0.9333333333333333)
zacniti -> zlacniti (0.9333333333333333)
zacniti -> zlacniti (0.9333333333333333)
volitve -> volitven (0.9333333333333333)
nasljeden -> nasleden (0.9411764705882353)
nasljednji -> naslednji (0.9473684210526315)
navciti -> naviti (0.9230769230769231)
skocit -> skociti (0.9230769230769231)
nobeden -> obed

obcuek -> obcutek (0.9230769230769231)
preprican -> prepricevan (0.9)
kokoma -> skokoma (0.9230769230769231)
kokosje -> okosje (0.9230769230769231)
barvaza -> baraza (0.9230769230769231)
friend -> frend (0.9090909090909091)
tuneti -> tutneti (0.9230769230769231)
gospodicen -> gospodicna (0.9)
igotov -> gotov (0.9090909090909091)
odgoveor -> odgovor (0.9333333333333333)
brcen -> ibrcen (0.9090909090909091)
obcutka -> obcutkar (0.9333333333333333)
brava -> brlava (0.9090909090909091)
adeti -> zadeti (0.9090909090909091)
pokvarjen -> pokvarjenka (0.9)
smucat -> smucati (0.9230769230769231)
neomen -> nepomen (0.9230769230769231)
izracunati -> izracunjati (0.9523809523809523)
preiskusiti -> presusiti (0.9)
astronavta -> astronavtka (0.9523809523809523)
predcednik -> predsednik (0.9)
programera -> programerka (0.9523809523809523)
tiina -> tisina (0.9090909090909091)
poskusiti -> posusiti (0.9411764705882353)
pripet -> priplet (0.9230769230769231)
potruditi -> potrditi (0.9411764705882353)
ym

### Token grouping

In [52]:
import re

In [41]:
import string

class GibberishDetector():
    """Character-bigram Markov model that tells real words from random typing.

    Words are reduced to the alphabet a-z plus three special states: '*' for
    any other character, '^' for the word start and '$' for the word end.
    No lower-casing is done here, so upper-case letters also become '*'.
    """

    def __init__(self, roof_removal):
        """Gibberish detector trained on a dictionary of real words (using Markov chains).

        Args:
            roof_removal: object with a ``remove(str)`` method, applied to
                every word before the out-of-alphabet replacement.
        """
        self.invalid_regex = re.compile('[^a-z]')

        self.states = string.ascii_lowercase + '*^$'
        self.state_index = {char: i for i, char in enumerate(self.states)}

        # Log transition probabilities [from_state, to_state]; all zeros until train().
        self.probs = np.zeros((len(self.states), len(self.states)))
        self.threshold = 0.0

        self.roof_removal = roof_removal

    def normalize(self, string):
        """Replace invalid characters (non-alphabet) with * and wrap the word in ^ ... $."""
        # Remove roofs
        string = self.roof_removal.remove(string)

        # Replace out-of-dict chars
        string = self.invalid_regex.sub('*', string)

        # Surround with ^ and $
        return '^' + string + '$'

    def ngram(self, string, n):
        """Yield all n-grams of the normalized form of ``string``.

        Normalization happens here, so callers must pass the raw word:
        normalizing twice would turn the '^' / '$' markers into '*'.
        """
        filtered = self.normalize(string)
        for start in range(0, len(filtered) - n + 1):
            yield filtered[start:start + n]

    def _file_probabilities(self, path, encoding):
        """Return an array with ``word_probability`` of every line of the file at ``path``."""
        with open(path, 'r', encoding=encoding) as file:
            return np.array([self.word_probability(line.strip()) for line in file])

    def train(self, dictionary_txt, good_txt, bad_txt, encoding='utf-8'):
        """Fit the transition matrix and pick the gibberish threshold.

        Args:
            dictionary_txt: file with one real word per line; provides the bigram counts.
            good_txt: file with one real word per line, used to calibrate the threshold.
            bad_txt: file with one gibberish word per line, used to calibrate the threshold.
            encoding: text encoding of the three files.

        The threshold is the midpoint between the lowest score of a good word
        and the highest score of a bad word. The fraction of calibration words
        on the correct side of it is printed.
        """
        counts = np.zeros((len(self.states), len(self.states)))

        with open(dictionary_txt, 'r', encoding=encoding) as file:
            for line in file:
                # Raw word goes in: ngram() does the normalization itself.
                for c1, c2 in self.ngram(line.strip(), 2):
                    counts[self.state_index[c1], self.state_index[c2]] += 1

        # Add small probability even to missing transitions: every cell is
        # raised to at least max(1, 1% of the row total spread over all states).
        laplace_vector = np.maximum((counts.sum(axis=1)*0.01/len(counts)), 1)[:, np.newaxis]
        counts = np.maximum(counts, laplace_vector)

        # Compute log probabilities
        sums = counts.sum(axis=1)[:, np.newaxis]
        self.probs = np.log(counts / sums)

        # Compute best threshold
        good_probs = self._file_probabilities(good_txt, encoding)
        bad_probs = self._file_probabilities(bad_txt, encoding)

        min_g = np.min(good_probs)
        max_b = np.max(bad_probs)

        self.threshold = (min_g + max_b) * 0.5

        # Test threshold
        print(f'Correct good: {np.mean(good_probs > self.threshold)}')
        print(f'Correct bad: {np.mean(bad_probs <= self.threshold)}')

    def word_probability(self, word):
        """Return the geometric mean of the bigram transition probabilities of ``word``."""
        log_prob = 0.0
        count = 0
        # Raw word goes in: ngram() does the normalization itself. The
        # normalized form always contains '^' and '$', so count is at least 1.
        for c1, c2 in self.ngram(word, 2):
            log_prob += self.probs[self.state_index[c1], self.state_index[c2]]
            count += 1

        return np.exp(log_prob / count)

    def is_gibberish(self, word):
        """Return True when the word's score does not exceed the trained threshold."""
        return self.word_probability(word) <= self.threshold

In [42]:
gibberish_detector = GibberishDetector(roof_removal)
gibberish_detector.train('data/dict-sl.txt', 'data/gibberish_good.txt', 'data/gibberish_bad.txt')

Correct good: 1.0
Correct bad: 1.0


In [43]:
gibberish_detector.is_gibberish('kagfjhnjzguyd'), gibberish_detector.is_gibberish('otorinolaringolog')

(True, False)

In [53]:
# Matches any character that is neither a word character nor one of ? ! . , -
other_regex = re.compile(r'[^\w?!.,-]')

def group_tokens(token):
    """Collapse a token into a placeholder class, or return it unchanged.

    Checks run in this order and the first hit wins:
    all-digit tokens become '<number>'; tokens longer than 4 characters that
    the module-level ``gibberish_detector`` flags become '<gibberish>';
    tokens containing a character matched by ``other_regex`` become '<other>'.
    Every replacement is printed.
    """
    if token.isdigit():
        print(f'<number> <- {token}')
        return '<number>'

    if len(token) > 4 and gibberish_detector.is_gibberish(token):
        print(f'<gibberish> <- {token}')
        return '<gibberish>'

    if other_regex.search(token) is not None:
        print(f'<other> <- {token}')
        return '<other>'

    return token

In [54]:
# Map every token of every message to itself or to a placeholder class via group_tokens.
# NOTE(review): the input here is `conversations`, not `conversations_corr`, so the
# spelling corrections computed earlier are not used by any later step in the visible
# part of this notebook — confirm that skipping them is intentional.
conversations_gr = [[group_tokens(token) for token in message] for message in conversations]

<gibberish> <- ooonnnee
<gibberish> <- zzzaaakkkaaajjje
<gibberish> <- vvvtttfff
<number> <- 24
<gibberish> <- uijkh
<number> <- 3
<gibberish> <- wehrw
<gibberish> <- ebzztdftzdztdtziztdti
<number> <- 45
<gibberish> <- wweddfdfffdfbfbdrez
<number> <- 5
<gibberish> <- rtrthfhgf
<gibberish> <- fthfhfthfthfthfthfthfhfghfthzfrtje
<gibberish> <- gz---aw
<gibberish> <- wawdwawsdawd
<gibberish> <- trrreo
<gibberish> <- wdawsdasdssawww
<gibberish> <- awdadawdwdawdawdadjadvjawgdagdhasxbasjkhdkawbkjasgdawghdkaghdkajkgawkgd
<gibberish> <- kagfjhgsfvawgdadjzgfinjzjgujlizuukgguyd
<number> <- 76
<gibberish> <- uzfrhtthdhjdftdtttzezgzuutttzzzrdtdzka
<gibberish> <- ktfgca
<gibberish> <- iluehz
<gibberish> <- hsehfiwhf
<gibberish> <- usrhfsiufhweufhwekhsehfkshuefksuf
<gibberish> <- kufhskuhzezhshkeufszehwehzefhs
<gibberish> <- fhwefefsefsefsfesfeww
<gibberish> <- sssohgkgghuocg
<gibberish> <- sekhsoiefheiohfioehfehfrehkflwefhlief
<gibberish> <- erilceilzwhreibaeizcsfhsezfzscjsgweuwtrewfasegwegerrrsjhfe

<gibberish> <- hahahahhahahahahahhahahahhahahahahhahahahahahahahahahahahhahahahahahahahahahahahahahahhaahahahahhahahahahahahahahjahajajahahahahahahahahhahahahahahahahahahahhahaahhahahahahaahhahahahahahahahahahahahahhahah
<other> <- (
<other> <- )
<gibberish> <- khukuguigujliuh
<number> <- 8
<gibberish> <- ighocpjio
<number> <- 8
<gibberish> <- itgpa
<number> <- 8
<number> <- 8
<number> <- 87674
<number> <- 6
<gibberish> <- lovcfkghptfhkptlhpctfkhcptkhcsdrflpkjcpdtlg
<gibberish> <- peswnjjbggclmcvcti
<gibberish> <- cnjtg
<gibberish> <- kcgti
<gibberish> <- lcfkgfr
<gibberish> <- kgfkg
<other> <- :)
<gibberish> <- cogpegpokcllegplegcrspogrega
<gibberish> <- vcdgfkje
<gibberish> <- fdcpogr
<number> <- 4
<number> <- 5
<gibberish> <- gcprtz
<gibberish> <- urzcnqp
<number> <- 9
<gibberish> <- tmhpv
<other> <- :)
<other> <- :)
<other> <- :)
<other> <- :)
<number> <- 5
<gibberish> <- grhfbdgd
<other> <- :)
<gibberish> <- gkvfffchcghh
<other> <- :
<gibberish> <- sedbfzub
<gibberish> <- hkljg
<g

In [55]:
conversations_gr[:100]

[['iti', 'pogovarjati'],
 ['kip'],
 ['kdo', 'tar'],
 ['kaj', 'kip'],
 ['iti', 'vena'],
 ['kam'],
 ['park'],
 ['zadusiti', 'drug'],
 ['park', 'kodeljev'],
 ['kdo', 'pis', 'moci'],
 ['gregr'],
 ['gregor'],
 ['behinda'],
 ['neh'],
 ['bols', 'poves', 'kdo'],
 ['tar'],
 ['<gibberish>'],
 ['ok'],
 ['<gibberish>'],
 ['reklo', 'pogledati', 'ta', 'zbezati'],
 ['<gibberish>'],
 ['zakaj', 'pises'],
 ['kdo', 'mumada', '<number>'],
 ['ubiti'],
 ['magoca', 'postati', 'prevec', 'nasilen', '?', '?'],
 ['ja', 'nasilen'],
 ['napa', 'is', 'kaj'],
 ['kaj'],
 ['upati', 'nov', 'naslednji', 'kniga', 'stup', 'cav'],
 ['hus', '<gibberish>'],
 [],
 ['gregor'],
 ['ohc',
  '<number>',
  '.',
  'hizti',
  '.',
  'zzz',
  '<gibberish>',
  '<gibberish>',
  '<number>',
  '<gibberish>',
  '<number>',
  'z',
  'rt',
  'fg',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  'daws',
  'aadw',
  'a',
  '<gibberish>',
  'fff',
  'w',
  '<gibberish>',
  '<gibberish>',
  'jgdt',
  ',',
  

### Create dictionary

In [157]:
from collections import Counter

class TokenDictionary():
    """Fixed-size vocabulary with bag-of-words and tf-idf vectorization.

    The vocabulary holds the ``dict_size - 1`` most frequent tokens of the
    corpus plus the catch-all token '<OOD>' for everything else.
    """

    def __init__(self, documents, dict_size=512):
        """Build the vocabulary and the per-token idf weights.

        Args:
            documents: sequence of documents, each a sequence of tokens.
                It is iterated twice and ``len()`` is taken of it.
            dict_size: length of every bag-of-words vector.
        """
        self.dict_size = dict_size

        # Find most common tokens and construct a dict from them
        cnt = Counter(token for document in documents for token in document)
        token_dictionary = [token for token, _ in cnt.most_common(dict_size-1)]

        # Out-of-dict token always takes the slot right after the kept tokens
        token_dictionary.append('<OOD>')
        self.token_map = {token: i for i, token in enumerate(token_dictionary)}

        # Compute idf for each word
        num_documents = len(documents)
        if num_documents == 0:
            # Nothing to derive document frequencies from (np.stack would raise).
            self.idf = np.zeros(dict_size)
            return

        bow = np.stack([self.bag_of_words(document) for document in documents])
        word_occurences = np.sum(bow > 0, axis=0)

        # Slots that occur in no document (unused vocabulary positions, or
        # '<OOD>' when nothing was out of dictionary) are counted as one
        # occurrence: this avoids a division by zero, which would give an
        # infinite idf and NaN tf-idf values.
        self.idf = np.log(num_documents / np.maximum(word_occurences, 1))

    def get_token(self, token):
        """ Get in-dict token for a given token. """
        return token if token in self.token_map else '<OOD>'

    def bag_of_words(self, tokens, tf_idf=False):
        """ Convert a list of tokens to a bag-of-words representation.

        Args:
            tokens: list of tokens of one document.
            tf_idf: when false, return raw counts; when true, return
                ``log(1 + count / len(tokens)) * idf`` per vocabulary slot.

        Returns:
            Array of length ``dict_size``; all zeros for an empty token list.
        """
        bow = np.zeros(self.dict_size)
        if len(tokens) == 0:
            return bow

        for token in tokens:
            bow[self.token_map[self.get_token(token)]] += 1

        if tf_idf:
            tf = np.log(1 + bow / len(tokens))
            bow = tf * self.idf

        return bow

In [158]:
# Each grouped message is one document; the outer list is copied, the messages are shared.
documents = list(conversations_gr)
token_dict = TokenDictionary(documents)

In [159]:
documents[:100]

[['iti', 'pogovarjati'],
 ['kip'],
 ['kdo', 'tar'],
 ['kaj', 'kip'],
 ['iti', 'vena'],
 ['kam'],
 ['park'],
 ['zadusiti', 'drug'],
 ['park', 'kodeljev'],
 ['kdo', 'pis', 'moci'],
 ['gregr'],
 ['gregor'],
 ['behinda'],
 ['neh'],
 ['bols', 'poves', 'kdo'],
 ['tar'],
 ['<gibberish>'],
 ['ok'],
 ['<gibberish>'],
 ['reklo', 'pogledati', 'ta', 'zbezati'],
 ['<gibberish>'],
 ['zakaj', 'pises'],
 ['kdo', 'mumada', '<number>'],
 ['ubiti'],
 ['magoca', 'postati', 'prevec', 'nasilen', '?', '?'],
 ['ja', 'nasilen'],
 ['napa', 'is', 'kaj'],
 ['kaj'],
 ['upati', 'nov', 'naslednji', 'kniga', 'stup', 'cav'],
 ['hus', '<gibberish>'],
 [],
 ['gregor'],
 ['ohc',
  '<number>',
  '.',
  'hizti',
  '.',
  'zzz',
  '<gibberish>',
  '<gibberish>',
  '<number>',
  '<gibberish>',
  '<number>',
  'z',
  'rt',
  'fg',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  'daws',
  'aadw',
  'a',
  '<gibberish>',
  'fff',
  'w',
  '<gibberish>',
  '<gibberish>',
  'jgdt',
  ',',
  

In [160]:
conversations_in_dict = [[token_dict.get_token(token) for token in message] for message in conversations_gr]

In [161]:
conversations_in_dict[:100]

[['iti', 'pogovarjati'],
 ['kip'],
 ['kdo', 'tar'],
 ['kaj', 'kip'],
 ['iti', 'vena'],
 ['kam'],
 ['park'],
 ['zadusiti', 'drug'],
 ['park', '<OOD>'],
 ['kdo', 'pis', 'moci'],
 ['<OOD>'],
 ['gregor'],
 ['<OOD>'],
 ['neh'],
 ['<OOD>', '<OOD>', 'kdo'],
 ['tar'],
 ['<gibberish>'],
 ['ok'],
 ['<gibberish>'],
 ['reklo', 'pogledati', 'ta', 'zbezati'],
 ['<gibberish>'],
 ['zakaj', 'pises'],
 ['kdo', 'mumada', '<number>'],
 ['ubiti'],
 ['<OOD>', 'postati', 'prevec', 'nasilen', '?', '?'],
 ['ja', 'nasilen'],
 ['<OOD>', 'is', 'kaj'],
 ['kaj'],
 ['upati', 'nov', '<OOD>', '<OOD>', '<OOD>', 'cav'],
 ['<OOD>', '<gibberish>'],
 [],
 ['gregor'],
 ['<OOD>',
  '<number>',
  '.',
  '<OOD>',
  '.',
  '<OOD>',
  '<gibberish>',
  '<gibberish>',
  '<number>',
  '<gibberish>',
  '<number>',
  '<OOD>',
  'rt',
  '<OOD>',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  '<gibberish>',
  '<OOD>',
  '<OOD>',
  'a',
  '<gibberish>',
  '<OOD>',
  'w',
  '<gibberish>',
  '<gibberish>',
  '<OOD>'

### Convert messages to a bag-of-words

In [162]:
conv_bow = np.stack([token_dict.bag_of_words(message) for message in conversations_gr])
conv_bow

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [163]:
conv_bow = np.stack([token_dict.bag_of_words(message, tf_idf=True) for message in conversations_gr])
conv_bow.sum(axis=1)

array([3.95775314, 5.12802351, 3.43892895, ..., 0.66019099, 0.66019099,
       2.14000525])

In [169]:
conv_bow[1000]

array([0.46279805, 0.39755665, 0.        , 0.51761962, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [164]:
conv_bow.max()

5.128023513834108