In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import random
import functools
from datetime import timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pycrfsuite

from preprocessing import read_dataset
from preprocessing import Tokenization, StopWordsRemover, Lemmatization, RoofRemoval, SpellingCorrection
from preprocessing import GibberishDetector, TokenGrouping, TokenDictionary, SentimentAnalysis

from baseline import evaluate_solution
from csv_parser import split_train_test

import features as F

## Read the data

In [2]:
# Read data
dataset_path = 'data/discussion_data.csv'
df = read_dataset(dataset_path)
df.head()

Unnamed: 0,School,Cohort,Book ID,Topic,Bookclub,User ID,Name,Message,Translation,Message Time,Is Answer,Page,Book relevance,Type,Category,CategoryBroad
0,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo se pogovarjati,Let's talk,2019-06-18 05:16:16 AM,No,4,No,S,CE,C
1,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kip je to,This is a statue,2019-06-18 05:17:29 AM,No,4,No,S,CO,C
2,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kdo je to jaz sem tara,Who is this I am Tara (girl's name),2019-06-18 05:17:59 AM,No,4,No,Q,IQ,I
3,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,kaj kip,what statue,2019-06-18 05:18:58 AM,No,4,No,S,CO,C
4,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo ven,let's go outside,2019-06-18 05:19:24 AM,No,4,No,S,CO,C


In [3]:
df['Message Time'] = pd.to_datetime(df['Message Time'])

In [4]:
df['Message Time'][0] - timedelta(minutes=5)

Timestamp('2019-06-18 05:11:16')

In [5]:
df.School.unique()

array(['OŠ Ketteja in Murna', 'OŠ Franca Rozmana Staneta',
       'OŠ Nove Fužine', 'OŠ Alojzija Šuštarja', 'OŠ Vižmarje - Brod',
       'OŠ Vide Pregarc', 'OŠ Valentina Vodnika', 'OŠ Koseze'],
      dtype=object)

In [6]:
df.CategoryBroad.value_counts()/df.CategoryBroad.count()

C    0.414268
D    0.323944
I    0.123086
O    0.081751
M    0.047152
S    0.009798
Name: CategoryBroad, dtype: float64

In [7]:
list(df.CategoryBroad.unique())

['C', 'I', 'D', 'O', 'M', 'S']

## Preprocessing

In [8]:
# Preprocessing components used by the cells below.
tokenizer = Tokenization()
stop_words_remover = StopWordsRemover('data/stopwords-sl-custom.txt')
lemmatizer = Lemmatization()

# One RoofRemoval instance is shared: it is passed to the spelling corrector and the
# gibberish detector here, and to SentimentAnalysis further down.
roof_removal = RoofRemoval()
spelling_correction = SpellingCorrection('data/dict-sl.txt', roof_removal)

gibberish_detector = GibberishDetector(roof_removal)
# Train the gibberish detector on the three data files passed below.
# NOTE(review): presumably this call prints the "Correct good"/"Correct bad" lines
# shown in the cell output - confirm in preprocessing.py.
gibberish_detector.train('data/dict-sl.txt', 'data/gibberish_good.txt', 'data/gibberish_bad.txt')

# TokenGrouping receives the (already trained) gibberish detector.
token_grouping = TokenGrouping(gibberish_detector)

Correct good: 1.0
Correct bad: 1.0


In [9]:
# Tokenization
messages = df.Message
messages = [tokenizer.tokenize(message) for message in messages]

In [10]:
# Remove stop words
messages = [stop_words_remover.remove_stopwords(tokens) for tokens in messages]
# Keep a handle on the stop-word-filtered tokens: the following cells rebind `messages`
# to new lists (lemmatization, roof removal, grouping), while `messages_sent` is what
# features_fn later hands to F.sentiment.
messages_sent = messages

In [11]:
# Lemmatization
messages = [[lemmatizer.lemmatize(token) for token in message] for message in messages]

In [12]:
# Roof removal
messages = [[roof_removal.remove(token) for token in message] for message in messages]

In [13]:
# Spelling correction
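# NOTE: this step is currently disabled. To enable it, uncomment the line below
# (names updated to match the `messages` pipeline used in the surrounding cells):
# messages = [[spelling_correction.replace_if_close(token) for token in message] for message in messages]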

In [14]:
# Token grouping
messages = [[token_grouping.group_tokens(token) for token in message] for message in messages]

In [18]:
# Create BoW dictionary
token_dict = TokenDictionary(messages)

In [19]:
# Get tf-idf weighted BoW representations
bow = np.stack([token_dict.bag_of_words(message) for message in messages])
bow_tfidf = np.stack([token_dict.bag_of_words(message, tf_idf=True) for message in messages])

## CRF features construction

In [20]:
sa = SentimentAnalysis('data/negative_words_Slolex.txt', 'data/positive_words_Slolex.txt', roof_removal)

## Split train, test

In [21]:
dfs_split = split_train_test(df)
train_dfs, test_dfs = dfs_split[0]

['OŠ Ketteja in Murna' 'OŠ Franca Rozmana Staneta' 'OŠ Nove Fužine'
 'OŠ Alojzija Šuštarja' 'OŠ Vižmarje - Brod' 'OŠ Vide Pregarc'
 'OŠ Valentina Vodnika' 'OŠ Koseze']


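The class distributions of the train and test splits are very different: for example, `O` makes up about 6% of the training messages but about 19% of the test messages, and `S` does not occur in the test split at all.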

In [22]:
train_dfs_all = pd.concat(train_dfs)
train_dfs_all.CategoryBroad.value_counts()/train_dfs_all.CategoryBroad.count()

C    0.417851
D    0.345355
I    0.111111
O    0.060838
M    0.053188
S    0.011658
Name: CategoryBroad, dtype: float64

In [23]:
test_dfs_all = pd.concat(test_dfs)
test_dfs_all.CategoryBroad.value_counts()/test_dfs_all.CategoryBroad.count()

C    0.395393
D    0.211132
O    0.191939
I    0.186180
M    0.015355
S    0.000000
Name: CategoryBroad, dtype: float64

In [24]:
# Majority class from train set
majority_class = train_dfs_all.CategoryBroad.value_counts().index[0]
majority_class

'C'

**Prepare conversations**

In [22]:
# Turn every frame of the split into one conversation: the list of its index labels.
# (The loop variable is deliberately not called `df`, to avoid confusion with the full dataset.)
conversation_list_train = [list(conversation_df.index) for conversation_df in train_dfs]
conversation_list_test = [list(conversation_df.index) for conversation_df in test_dfs]

In [23]:
len(conversation_list_train), len(conversation_list_test)

(85, 16)

In [24]:
conversation_list_train[0]

[1683, 1684, 1685, 1686, 1687, 1688]

## CRF training

In [36]:
# features_fn = get_features_length
# features_fn = functools.partial(get_features_bow, bow_values=bow)
# features_fn = functools.partial(get_features_bow_length, bow_values=bow_tfidf)
def features_fn(message_id):
    """Build the CRF feature dict for a single message.

    Every ``F.*`` extractor below is called with ``message_id`` and must return a
    ``(values, names)`` pair. The pairs are merged, in the order listed, into one
    ``{name: value}`` dict; if two extractors emit the same name, the later one wins.

    Reads the notebook-level objects ``bow_tfidf``, ``messages_sent``, ``sa`` and ``df``.
    """
    extracted = (
        F.message_bow(message_id, bow_values=bow_tfidf),
        F.sentiment(message_id, messages_sent, sa, normalize=True),
        F.recent_activity(message_id, df),
        F.length(message_id, df),
        F.wordcount(message_id, df),
    )

    merged = {}
    for values, keys in extracted:
        for value, key in zip(values, keys):
            merged[key] = value
    return merged

labels_fn = functools.partial(F.get_label, df=df)

X_train = [F.conversation2features(s, features_fn) for s in conversation_list_train]
y_train = [F.conversation2labels(s, labels_fn) for s in conversation_list_train]

X_test = [F.conversation2features(s, features_fn) for s in conversation_list_test]
y_test = [F.conversation2labels(s, labels_fn) for s in conversation_list_test]

In [37]:
X_train[1]

[{'bow_000': 0.0,
  'bow_001': 0.0,
  'bow_002': 0.0,
  'bow_003': 0.0,
  'bow_004': 0.0,
  'bow_005': 0.0,
  'bow_006': 0.0,
  'bow_007': 0.0,
  'bow_008': 0.0,
  'bow_009': 0.0,
  'bow_010': 0.0,
  'bow_011': 0.0,
  'bow_012': 0.0,
  'bow_013': 0.0,
  'bow_014': 0.0,
  'bow_015': 0.0,
  'bow_016': 0.0,
  'bow_017': 0.0,
  'bow_018': 0.0,
  'bow_019': 0.0,
  'bow_020': 0.0,
  'bow_021': 0.0,
  'bow_022': 0.0,
  'bow_023': 0.0,
  'bow_024': 0.0,
  'bow_025': 0.0,
  'bow_026': 0.0,
  'bow_027': 0.0,
  'bow_028': 0.0,
  'bow_029': 0.0,
  'bow_030': 0.0,
  'bow_031': 0.0,
  'bow_032': 0.0,
  'bow_033': 0.0,
  'bow_034': 0.0,
  'bow_035': 0.0,
  'bow_036': 0.0,
  'bow_037': 0.0,
  'bow_038': 0.0,
  'bow_039': 0.0,
  'bow_040': 0.0,
  'bow_041': 0.0,
  'bow_042': 0.0,
  'bow_043': 0.0,
  'bow_044': 0.0,
  'bow_045': 0.0,
  'bow_046': 0.0,
  'bow_047': 0.0,
  'bow_048': 0.0,
  'bow_049': 0.0,
  'bow_050': 0.0,
  'bow_051': 0.0,
  'bow_052': 0.0,
  'bow_053': 0.0,
  'bow_054': 0.0,
  'bow_055

In [38]:
X_test[0]

[{'bow_000': 0.0,
  'bow_001': 0.0,
  'bow_002': 0.0,
  'bow_003': 0.0,
  'bow_004': 0.0,
  'bow_005': 0.0,
  'bow_006': 0.0,
  'bow_007': 0.0,
  'bow_008': 0.0,
  'bow_009': 0.0,
  'bow_010': 0.0,
  'bow_011': 0.0,
  'bow_012': 0.0,
  'bow_013': 0.0,
  'bow_014': 0.0,
  'bow_015': 0.0,
  'bow_016': 0.0,
  'bow_017': 0.0,
  'bow_018': 0.0,
  'bow_019': 0.0,
  'bow_020': 0.0,
  'bow_021': 0.0,
  'bow_022': 0.0,
  'bow_023': 0.0,
  'bow_024': 0.0,
  'bow_025': 0.0,
  'bow_026': 0.0,
  'bow_027': 0.0,
  'bow_028': 0.0,
  'bow_029': 0.0,
  'bow_030': 0.0,
  'bow_031': 0.0,
  'bow_032': 0.0,
  'bow_033': 0.0,
  'bow_034': 0.0,
  'bow_035': 0.0,
  'bow_036': 0.0,
  'bow_037': 0.0,
  'bow_038': 0.0,
  'bow_039': 0.0,
  'bow_040': 0.0,
  'bow_041': 0.0,
  'bow_042': 0.0,
  'bow_043': 0.0,
  'bow_044': 0.0,
  'bow_045': 0.0,
  'bow_046': 0.0,
  'bow_047': 0.0,
  'bow_048': 0.0,
  'bow_049': 0.0,
  'bow_050': 0.0,
  'bow_051': 0.0,
  'bow_052': 0.0,
  'bow_053': 0.0,
  'bow_054': 0.0,
  'bow_055

In [39]:
%%time
# 4. Prepare trainer
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 505 ms, sys: 13.3 ms, total: 519 ms
Wall time: 541 ms


In [40]:
trainer.get_params()

{'feature.minfreq': 0.0,
 'feature.possible_states': False,
 'feature.possible_transitions': False,
 'c1': 0.0,
 'c2': 1.0,
 'max_iterations': 2147483647,
 'num_memories': 6,
 'epsilon': 1e-05,
 'period': 10,
 'delta': 1e-05,
 'linesearch': 'MoreThuente',
 'max_linesearch': 20}

In [41]:
trainer.set_params({
    # include transitions and states that are possible, but not observed
    'feature.possible_states': True,
    'feature.possible_transitions': True
})

In [42]:
%%time
# Train the classifier. The fitted model is written to 'models/bow_tfidf.crfsuite',
# the same path the Tagger opens in the Prediction section.
# exist_ok=True makes directory creation idempotent and avoids the check-then-create race.
os.makedirs('models', exist_ok=True)

trainer.train('models/bow_tfidf.crfsuite')

CPU times: user 45.9 s, sys: 100 ms, total: 46 s
Wall time: 46.2 s


In [43]:
trainer.logparser.last_iteration

{'num': 1140,
 'scores': {},
 'loss': 1577.549289,
 'feature_norm': 15.804286,
 'error_norm': 19.215968,
 'active_features': 2942,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.037}

## Prediction

In [44]:
tagger = pycrfsuite.Tagger()
tagger.open('models/bow_tfidf.crfsuite')

<contextlib.closing at 0x13112c518>

In [45]:
example_convo = conversation_list_test[10]
for i in example_convo:
    msg = df.loc[i]
    print(f'{msg.CategoryBroad}: {msg.Message}')

print("Predicted:", ' '.join(tagger.tag(F.conversation2features(example_convo, features_fn))))
print("Correct:  ", ' '.join(F.conversation2labels(example_convo, labels_fn)))

D: bi ga udaril in bi zbezal
I: Gdo si ti
I: adri
I: kdo si ti
I: NataĹˇa
I: adrian
I: NataĹˇa
C: mah
C: Kaj mah
C: ham
C: Napisu si mah ne ham
C: ham
C: Ne mah
C: nope pusti me pri miru
I: Adrianđź¤Şđź¤Şđź¤Şđź¤Şđź¤Ş
C: nehi
C: kaj pa odgovori
I: Ajda đź¤Şđź¤Şđź¤Şđź¤Ş
C: moje ime govori
C: nehi
C: notr sem
C: jaz maj
C: kdo
D: lahko bi mu stopil na nogo in stekel stran
D: lahko bi ga zadusil
I: Gdo si MumaD7 in muma D26
D: lahko biga ubil
D: lahko bi ga prenteltal in bi rekel poglej letalo in ko bi se obrnil bi stekel stran
C: ne ne
C: dej nam mir
I: natasa
I: kdo
I: mumad19
D: ko bi mu zbezal bi se lahko skril
D: Lahko bi mu stopil na nogo in ko bi cvilil bi ga primel in ga odpeljal v zapor
D: ko bi sel v pekarno in se skril za vrata in ko bi sel noter bi potem stekel ven
D: Lahko bi mu stopil mu stopil na noge in ko bi cvilil bi ga primel in ga odpeliu v zapor v najtrdnejĹˇo sobo ki jo imajo.
Predicted: D I I I C C C C C I I I C C C C C C C C C C I D D D D D C C C I I D D D M
Correct

**Test accuracy**

In [46]:
preds = [tag for convo in X_test for tag in tagger.tag(convo)]
labels = [tag for convo in y_test for tag in convo]
tags = list(df.CategoryBroad.unique())

evaluate_solution(preds, labels, tags, majority_class=majority_class)

['C', 'I', 'D', 'O', 'M', 'S']
[[163   7  30   5   1   0]
 [ 41  47   7   2   0   0]
 [ 14   1  92   0   3   0]
 [ 38   2   5  43   0  12]
 [  2   0   0   0   6   0]
 [  0   0   0   0   0   0]]
Our CA: 0.6737044145873321
Majority CA: 0.39539347408829173


array([[163,   7,  30,   5,   1,   0],
       [ 41,  47,   7,   2,   0,   0],
       [ 14,   1,  92,   0,   3,   0],
       [ 38,   2,   5,  43,   0,  12],
       [  2,   0,   0,   0,   6,   0],
       [  0,   0,   0,   0,   0,   0]])

**Train accuracy**

In [47]:
preds = [tag for convo in X_train for tag in tagger.tag(convo)]
labels = [tag for convo in y_train for tag in convo]
tags = list(df.CategoryBroad.unique())

evaluate_solution(preds, labels, tags, majority_class=majority_class)

['C', 'I', 'D', 'O', 'M', 'S']
[[1012   23   80   31    0    1]
 [  82  212    9    1    1    0]
 [ 109    4  822    5    8    0]
 [  53    1   11   99    3    0]
 [  14    3   38    1   90    0]
 [  12    0    7    2    1   10]]
Our CA: 0.8178506375227687
Majority CA: 0.4178506375227687


array([[1012,   23,   80,   31,    0,    1],
       [  82,  212,    9,    1,    1,    0],
       [ 109,    4,  822,    5,    8,    0],
       [  53,    1,   11,   99,    3,    0],
       [  14,    3,   38,    1,   90,    0],
       [  12,    0,    7,    2,    1,   10]])

## Cross validation

In [25]:
from tqdm import tqdm

In [26]:
cross_val_dfs = split_train_test(df)

['OŠ Ketteja in Murna' 'OŠ Franca Rozmana Staneta' 'OŠ Nove Fužine'
 'OŠ Alojzija Šuštarja' 'OŠ Vižmarje - Brod' 'OŠ Vide Pregarc'
 'OŠ Valentina Vodnika' 'OŠ Koseze']


In [27]:
def data_from_fold(fold, features_fn, labels_fn):
    """Turn one cross-validation fold into CRF training and test sequences.

    Args:
        fold: a ``(train_frames, test_frames)`` pair; each element is an iterable of
            objects exposing ``.index`` (one per conversation).
        features_fn: per-message feature callback passed to ``F.conversation2features``.
        labels_fn: per-message label callback passed to ``F.conversation2labels``.

    Returns:
        ``(X_train, y_train, X_test, y_test)``, each a list with one entry per conversation.
    """
    train_frames, test_frames = fold

    # A conversation is represented by the list of index labels of its frame.
    train_convos = [list(frame.index) for frame in train_frames]
    test_convos = [list(frame.index) for frame in test_frames]

    def _features(convos):
        return [F.conversation2features(convo, features_fn) for convo in convos]

    def _labels(convos):
        return [F.conversation2labels(convo, labels_fn) for convo in convos]

    # Tuple elements are evaluated left to right: train features, train labels,
    # test features, test labels.
    return _features(train_convos), _labels(train_convos), _features(test_convos), _labels(test_convos)

**Majority classifier**

In [28]:
# Majority-class baseline: for every fold, predict the most frequent CategoryBroad
# of the training split for all test messages and record the resulting accuracy.
# (The former `labels_fn = F.get_label` assignment was removed: the loop never used it,
# and it rebound the global `labels_fn` to a function without `df` bound.)
maj_accuracies = []

for fold in tqdm(cross_val_dfs):
    train_dfs, test_dfs = fold

    train_dfs_all = pd.concat(train_dfs)
    test_dfs_all = pd.concat(test_dfs)

    # value_counts() sorts by descending frequency, so index[0] is the majority class.
    majority_class = train_dfs_all.CategoryBroad.value_counts().index[0]

    accuracy = np.mean(test_dfs_all.CategoryBroad == majority_class)
    maj_accuracies.append(accuracy)
    

100%|██████████| 8/8 [00:01<00:00,  5.12it/s]


In [29]:
# features_fn = get_features_length
# features_fn = functools.partial(get_features_bow, bow_values=bow)
# features_fn = functools.partial(get_features_bow, bow_values=bow_tfidf)
# features_fn = functools.partial(get_features_bow_length, bow_values=bow_tfidf)
def features_fn(message_id):
    """Build the CRF feature dict for a single message.

    Every ``F.*`` extractor below is called with ``message_id`` and must return a
    ``(values, names)`` pair. The pairs are merged, in the order listed, into one
    ``{name: value}`` dict; if two extractors emit the same name, the later one wins.

    Reads the notebook-level objects ``bow_tfidf``, ``messages_sent``, ``sa`` and ``df``.

    NOTE(review): this repeats the ``features_fn`` defined in the "CRF training"
    section with the same extractor list - consider defining it only once.
    """
    extracted = (
        F.message_bow(message_id, bow_values=bow_tfidf),
        F.sentiment(message_id, messages_sent, sa, normalize=True),
        F.recent_activity(message_id, df),
        F.length(message_id, df),
        F.wordcount(message_id, df),
    )

    merged = {}
    for values, keys in extracted:
        for value, key in zip(values, keys):
            merged[key] = value
    return merged

labels_fn = functools.partial(F.get_label, df=df)

model_name = 'models/cross_val.crfsuite'

**Our classifier**

In [30]:
# Cross-validated evaluation of the CRF: train one model per fold and accumulate
# the train/test confusion matrices over all folds.
categories = list(df.CategoryBroad.unique())
combine_matrix_test = np.zeros([len(categories), len(categories)], dtype=np.int64)
combine_matrix_train = np.zeros([len(categories), len(categories)], dtype=np.int64)

# Per-fold test accuracy. The summary cell at the end of the notebook averages
# `our_accuracies`, but no cell defined it, so it is collected here.
# NOTE(review): assumes the original list held per-fold *test* accuracies - confirm.
our_accuracies = []

# The model path is the same for every fold, so create its directory once, up front.
model_dir = os.path.dirname(model_name)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

for fold in tqdm(cross_val_dfs):
    X_train, y_train, X_test, y_test = data_from_fold(fold, features_fn, labels_fn)

    # Trainer
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        # include states and transitions that are possible, but not observed
        'feature.possible_states': True,
        'feature.possible_transitions': True
    })

    # Train (every fold overwrites the model file written by the previous one)
    trainer.train(model_name)

    # Load tagger. Tagger.open() returns contextlib.closing(tagger) (see the repr shown
    # in the Prediction section), so the `with` block closes the model at the end of
    # the fold instead of leaving one open tagger behind per fold.
    tagger = pycrfsuite.Tagger()
    with tagger.open(model_name):
        # TEST
        # Predictions and labels, flattened over all test conversations
        preds = [tag for convo in X_test for tag in tagger.tag(convo)]
        labels = [tag for convo in y_test for tag in convo]

        # Fraction of test messages tagged correctly in this fold
        our_accuracies.append(np.mean([pred == label for pred, label in zip(preds, labels)]))

        # Compute confusion matrix
        conf_mat = evaluate_solution(preds, labels, categories, verbose=False)
        combine_matrix_test += conf_mat

        # TRAIN
        # Predictions and labels, flattened over all training conversations
        preds = [tag for convo in X_train for tag in tagger.tag(convo)]
        labels = [tag for convo in y_train for tag in convo]

        # Compute confusion matrix
        conf_mat = evaluate_solution(preds, labels, categories, verbose=False)
        combine_matrix_train += conf_mat

100%|██████████| 8/8 [10:15<00:00, 76.92s/it]


In [70]:
acc_test = combine_matrix_test.diagonal().sum() / combine_matrix_test.sum()
acc_train = combine_matrix_train.diagonal().sum() / combine_matrix_train.sum()

In [71]:
print(f'Combined accuracy (test): {acc_test}')
print(f'Combined accuracy (train): {acc_train}')

Combined accuracy (test): 0.6527862829148806
Combined accuracy (train): 0.8164640013997025


In [97]:
print(f'Average accuracy (our): {np.mean(our_accuracies)}')
print(f'Average accuracy (maj): {np.mean(maj_accuracies)}')

Average accuracy (our): 0.645143221897118
Average accuracy (maj): 0.3932443041655893
