In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Read the data

In [2]:
from preprocessing import read_dataset

In [3]:
# Read data: load the discussion CSV through the project's read_dataset helper
dataset_path = 'data/discussion_data.csv'
df = read_dataset(dataset_path)
# Preview the first rows to check which columns are available
df.head()

Unnamed: 0,School,Cohort,Book ID,Topic,Bookclub,User ID,Name,Message,Translation,Message Time,Is Answer,Page,Book relevance,Type,Category,CategoryBroad
0,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo se pogovarjati,Let's talk,2019-06-18 05:16:16 AM,No,4,No,S,CE,C
1,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kip je to,This is a statue,2019-06-18 05:17:29 AM,No,4,No,S,CO,C
2,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kdo je to jaz sem tara,Who is this I am Tara (girl's name),2019-06-18 05:17:59 AM,No,4,No,Q,IQ,I
3,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,kaj kip,what statue,2019-06-18 05:18:58 AM,No,4,No,S,CO,C
4,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo ven,let's go outside,2019-06-18 05:19:24 AM,No,4,No,S,CO,C


## Preprocessing

In [4]:
from preprocessing import Tokenization, StopWordsRemover, Lemmatization, RoofRemoval, SpellingCorrection
from preprocessing import GibberishDetector, TokenGrouping, TokenDictionary

In [5]:
# Instantiate the preprocessing steps used in the cells below
tokenizer = Tokenization()
stop_words_remover = StopWordsRemover('data/stopwords-sl-custom.txt')
lemmatizer = Lemmatization()

# The same RoofRemoval instance is handed to both the spelling corrector and the gibberish detector
# NOTE(review): spelling_correction is only referenced by the disabled spelling-correction cell below
roof_removal = RoofRemoval()
spelling_correction = SpellingCorrection('data/dict-sl.txt', roof_removal)

gibberish_detector = GibberishDetector(roof_removal)
# Train gibberish_detector (it receives the same dictionary file as the spelling corrector;
# presumably the other two files hold good and bad examples - TODO confirm)
gibberish_detector.train('data/dict-sl.txt', 'data/gibberish_good.txt', 'data/gibberish_bad.txt')

# Token grouping is constructed from the trained gibberish detector
token_grouping = TokenGrouping(gibberish_detector)

Correct good: 1.0
Correct bad: 1.0


In [6]:
# Tokenization: run the tokenizer over every raw message of the dataset
messages = [tokenizer.tokenize(raw_text) for raw_text in df.Message]

In [7]:
# Remove stop words from each message's token list
messages = list(map(stop_words_remover.remove_stopwords, messages))

In [8]:
# Lemmatization: apply the lemmatizer to every token, message by message
messages = [list(map(lemmatizer.lemmatize, tokens)) for tokens in messages]

In [9]:
# Roof removal, applied to every token of every message
messages = [list(map(roof_removal.remove, tokens)) for tokens in messages]

In [10]:
# Spelling correction (currently disabled)
# NOTE(review): the disabled line below refers to `conversations` and `tqdm`; at this point of the
# notebook `conversations` has not been defined yet and no `tqdm` import is visible, so it cannot
# be re-enabled as written.
# conversations = [[spelling_correction.replace_if_close(token) for token in tokens] for tokens in tqdm(conversations)]

In [11]:
# Token grouping: pass every token of every message through the token grouper
messages = [list(map(token_grouping.group_tokens, tokens)) for tokens in messages]

In [12]:
# Create BoW dictionary from the preprocessed token lists; it is used in the next cell to
# build the bag-of-words representations
token_dict = TokenDictionary(messages)

In [13]:
# Get tf-idf weighted BoW representations: one array per message, stacked into a single array
# NOTE(review): `bow` is not referenced by any later cell shown here; the CRF features below are
# built from the raw `Message` column instead.
bow = np.stack([token_dict.bag_of_words(message, tf_idf=True) for message in messages])

## Prepare conversations

In [44]:
# Treat every distinct (School, Topic, Bookclub) combination as one conversation
conversations = df.groupby(by=['School', 'Topic', 'Bookclub'])

In [45]:
# Get a list of conversations: for each group, one list of
# (message text, broad category label) tuples in the group's row order
conversation_list = []
for conversation in conversations.groups:
    conversation_df = conversations.get_group(conversation)

    # Pair the two columns directly instead of walking the frame row by row
    conversation_el = list(zip(conversation_df['Message'], conversation_df['CategoryBroad']))
    conversation_list.append(conversation_el)

In [46]:
# Number of conversations collected from the groups above
len(conversation_list)

105

In [47]:
# Inspect one conversation: a list of (message, CategoryBroad label) tuples
conversation_list[3]

[('Skoka', 'D'),
 ('Jaz sem že brala ta zgodba', 'C'),
 ('Jst tut', 'C'),
 ('piši pravilno prosim!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
  'C'),
 ('Kako lepo zgodbo!!!!!!!! Ta mi je tudi zelo všeč.', 'C'),
 ('Zadnja uprašanja so nemogoča!', 'C'),
 ('Pa lahko bi se naučil peti pesmice in pripovedovati zgodbice...', 'D'),
 ('Poleg tega pa sem tudi jaz že bral to zgodbo.', 'C'),
 ('Ali kdo v tem razredu sploh zna rešiti zadnje vprašanje? Kar koli napišem je narobe!',
  'D'),
 ('Ja jaz tudi...', 'C'),
 ('Ali res alojzija 19! kako!?', 'D')]

## CRF training

In [48]:
import random
import pycrfsuite

In [53]:
# 1. Define feature functions
def get_features(messages, idx):
    """Build the list of CRF feature strings for one message.

    Args:
        messages: sequence of message strings belonging to one conversation.
        idx: position in ``messages`` of the message to describe.

    Returns:
        A list of feature strings. At the moment it holds a single
        ``length=<n>`` entry, where ``<n>`` is the number of
        whitespace-separated words in the message.
    """
    n_words = len(messages[idx].split())

    # TODO: features
    return [f'length={n_words}']

def conversation2features(conversation):
    """Convert a conversation into one feature list per message.

    Args:
        conversation: sequence of ``(message, label)`` pairs.

    Returns:
        A list with one ``get_features`` result for every message,
        in conversation order.
    """
    texts = []
    for text, _label in conversation:
        texts.append(text)

    # get_features receives the whole message list plus the position to describe
    return [get_features(texts, position) for position in range(len(conversation))]
    
def conversation2labels(conversation):
    """Extract the label sequence of a conversation.

    Args:
        conversation: sequence of ``(message, label)`` pairs.

    Returns:
        A list with the label of every message, in conversation order.
    """
    tags = []
    for _message, tag in conversation:
        tags.append(tag)
    return tags

In [54]:
# 3. Prepare data for training (and testing if needed)
# A fixed seed makes the split reproducible across kernel restarts.
SPLIT_SEED = 42
TEST_FRACTION = 0.2

# Shuffle a copy rather than `conversation_list` itself, so re-running this
# cell always produces the same split instead of reshuffling an already
# shuffled list.
shuffled_conversations = random.Random(SPLIT_SEED).sample(
    conversation_list, len(conversation_list)
)

n_test_samples = int(TEST_FRACTION * len(shuffled_conversations))
# Slice from the front: with negative indices, n_test_samples == 0 would turn
# into `[:-0]` (empty training set) and `[-0:]` (everything in the test set).
n_train_samples = len(shuffled_conversations) - n_test_samples
train_data = shuffled_conversations[:n_train_samples]
test_data = shuffled_conversations[n_train_samples:]

In [55]:
# Turn every conversation into parallel feature and label sequences,
# separately for the training and the test split
X_train = list(map(conversation2features, train_data))
y_train = list(map(conversation2labels, train_data))

X_test = list(map(conversation2features, test_data))
y_test = list(map(conversation2labels, test_data))

In [56]:
# Inspect the feature lists of the first training conversation (one list per message)
X_train[0]

[['length=8'],
 ['length=3'],
 ['length=5'],
 ['length=4'],
 ['length=2'],
 ['length=1'],
 ['length=7'],
 ['length=4'],
 ['length=1'],
 ['length=2'],
 ['length=1'],
 ['length=3']]

In [58]:
%%time
# 4. Prepare trainer: register every training conversation as one
# (feature sequence, label sequence) pair
trainer = pycrfsuite.Trainer(verbose=False)

# X_train and y_train are parallel lists, so zip pairs each conversation's
# features with its labels
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 6.67 ms, sys: 253 µs, total: 6.93 ms
Wall time: 7.63 ms


In [59]:
# Training hyper-parameters handed to the trainer before training starts
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # cap on training iterations (the logged run below ends at iteration 50)

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [60]:
%%time
# 5. Train a classifier (the built model is stored in the file "discussions.crfsuite",
# which the prediction section opens again)
trainer.train('discussions.crfsuite')

CPU times: user 65.2 ms, sys: 3.37 ms, total: 68.5 ms
Wall time: 74.3 ms


In [61]:
# Summary of the final training iteration as recorded by the trainer's log parser
trainer.logparser.last_iteration

{'num': 50,
 'scores': {},
 'loss': 2948.72976,
 'feature_norm': 9.767975,
 'error_norm': 5.972897,
 'active_features': 88,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.001}

## Prediction

In [62]:
# Load the model file written by the training step into a tagger
tagger = pycrfsuite.Tagger()
# open() is the last expression of the cell, so its return value is echoed as the cell output
tagger.open('discussions.crfsuite')

<contextlib.closing at 0x11ceb0690>

In [71]:
# NOTE(review): the example comes from train_data, i.e. a conversation the model was trained on;
# test_data / X_test / y_test are not used in any cell shown here.
example_convo = train_data[0]
# Print the conversation as "<annotated label>: <message>" lines
for message, cls in example_convo:
    print(f'{cls}: {message}')

# Predicted vs. annotated label sequence, space-separated so the two lines align
print("Predicted:", ' '.join(tagger.tag(conversation2features(example_convo))))
print("Correct:  ", ' '.join(conversation2labels(example_convo)))

D: da si ni šel v nedeljo ogledat policaja
D: Pobegnil bi policaju
I: kdo si ti mi movej
D: a ni policaj neumen
C: kaj delaš
I: lara
M: Tukaj vaša učiteljica, prosim odgovorite na vprašanje.
C: sram naj te modi
C: lara
C: nisi prijazna
O: kijuhz
C: ja res je
Predicted: D D D D D D D D C C C C
Correct:   D D I D C I M C C C O C
