In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import random
import functools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pycrfsuite

from preprocessing import read_dataset
from preprocessing import Tokenization, StopWordsRemover, Lemmatization, RoofRemoval, SpellingCorrection
from preprocessing import GibberishDetector, TokenGrouping, TokenDictionary

from baseline import evaluate_solution
from csv_parser import split_train_test

## Read the data

In [2]:
# Load the annotated discussion messages into the notebook-wide DataFrame `df`
# (later cells read its `Message` and `CategoryBroad` columns) and preview the first rows.
dataset_path = 'data/discussion_data.csv'
df = read_dataset(dataset_path)
df.head()

Unnamed: 0,School,Cohort,Book ID,Topic,Bookclub,User ID,Name,Message,Translation,Message Time,Is Answer,Page,Book relevance,Type,Category,CategoryBroad
0,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo se pogovarjati,Let's talk,2019-06-18 05:16:16 AM,No,4,No,S,CE,C
1,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kip je to,This is a statue,2019-06-18 05:17:29 AM,No,4,No,S,CO,C
2,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kdo je to jaz sem tara,Who is this I am Tara (girl's name),2019-06-18 05:17:59 AM,No,4,No,Q,IQ,I
3,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,kaj kip,what statue,2019-06-18 05:18:58 AM,No,4,No,S,CO,C
4,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo ven,let's go outside,2019-06-18 05:19:24 AM,No,4,No,S,CO,C


In [3]:
# Relative frequency of each broad category over all messages
df.CategoryBroad.value_counts()/df.CategoryBroad.count()

C    0.414268
D    0.323944
I    0.123086
O    0.081751
M    0.047152
S    0.009798
Name: CategoryBroad, dtype: float64

In [4]:
# Distinct broad-category labels present in the data
list(df.CategoryBroad.unique())

['C', 'I', 'D', 'O', 'M', 'S']

## Preprocessing

In [5]:
# Instantiate the preprocessing components used by the pipeline cells that follow.
tokenizer = Tokenization()
stop_words_remover = StopWordsRemover('data/stopwords-sl-custom.txt')
lemmatizer = Lemmatization()

# roof_removal is shared: it is also passed to the spelling corrector and the gibberish detector.
roof_removal = RoofRemoval()
spelling_correction = SpellingCorrection('data/dict-sl.txt', roof_removal)

gibberish_detector = GibberishDetector(roof_removal)
# Train gibberish_detector from the dictionary plus the good/bad example files
gibberish_detector.train('data/dict-sl.txt', 'data/gibberish_good.txt', 'data/gibberish_bad.txt')

# Token grouping relies on the trained gibberish detector
token_grouping = TokenGrouping(gibberish_detector)

Correct good: 1.0
Correct bad: 1.0


In [6]:
# Tokenization: turn every raw message into its tokenizer output
messages = [tokenizer.tokenize(text) for text in df.Message]

In [7]:
# Stop-word removal, applied to each message's token list as a whole
messages = list(map(stop_words_remover.remove_stopwords, messages))

In [8]:
# Lemmatization, token by token within each message
messages = [list(map(lemmatizer.lemmatize, message)) for message in messages]

In [9]:
# Roof removal, token by token within each message
messages = [list(map(roof_removal.remove, message)) for message in messages]

In [10]:
# Spelling correction (currently disabled: the call below is left commented out)
# messages = [[spelling_correction.replace_if_close(token) for token in message] for message in messages]

In [11]:
# Token grouping, token by token within each message
messages = [list(map(token_grouping.group_tokens, message)) for message in messages]

In [12]:
# Create the BoW dictionary from the fully preprocessed messages;
# it is the object that produces the bag-of-words vectors.
token_dict = TokenDictionary(messages)

In [13]:
# Stack one bag-of-words vector per message into a matrix:
# `bow` holds the plain representation, `bow_tfidf` the tf-idf weighted one.
bow = np.stack(list(map(token_dict.bag_of_words, messages)))
bow_tfidf = np.stack(
    [token_dict.bag_of_words(tokens, tf_idf=True) for tokens in messages]
)

## CRF features construction

In [56]:
# 1. Define feature functions

# Only length as a feature
def get_features_length(message_i):
    """Build the length-only feature list for one message.

    Reads the `Message` text of the row labelled `message_i` from the
    notebook-global `df` and counts its whitespace-separated words.

    Returns a one-element list holding the string ``'length=<count>'``.
    """
    text = df.loc[message_i, 'Message']
    word_count = len(text.split())
    return [f'length={word_count}']

# BoW as features
def get_features_bow(message_i, bow_values):
    """Build the bag-of-words feature dict for one message.

    `bow_values[message_i]` is iterated in order; its i-th entry is stored
    under the key ``'bow_<i>'`` with the position zero-padded to three digits.

    Returns a dict mapping those keys to the corresponding values.
    """
    return {
        f'bow_{position:03d}': weight
        for position, weight in enumerate(bow_values[message_i])
    }

# BoW + length as features
def get_features_bow_length(message_i, bow_values):
    """Build the bag-of-words feature dict for one message plus its word count.

    The BoW part is delegated to `get_features_bow`, so both feature sets
    always use the same key scheme. The word count of the message text
    (whitespace-separated, read from the notebook-global `df`) is added
    under the key ``'length'``.

    NOTE(review): `message_i` is used both as a label in `df.index` and as a
    position in `bow_values`; this assumes the two line up. Confirm.

    Returns a dict with the ``'bow_<i>'`` entries followed by ``'length'``.
    """
    features = get_features_bow(message_i, bow_values)

    # Add length
    text = df.loc[message_i, 'Message']
    features['length'] = len(text.split())

    return features

def get_label(message_i):
    """Return the `CategoryBroad` value of the row labelled `message_i` in the global `df`."""
    return df.loc[message_i, 'CategoryBroad']

In [24]:
def conversation2features(conversation, feature_fn):
    """Apply `feature_fn` to every message index of a conversation.

    Returns a list with one feature object per message, in conversation order.
    """
    return list(map(feature_fn, conversation))
    
def conversation2labels(conversation, labels_fn):
    """Apply `labels_fn` to every message index of a conversation.

    Returns a list with one label per message, in conversation order.
    """
    return list(map(labels_fn, conversation))

## Split train, test

In [14]:
# NOTE(review): this unpacks the result as a single (train, test) pair, while the
# cross-validation section of this notebook iterates over the result of the same call
# as a sequence of (train_dfs, test_dfs) folds. Presumably the helper's return shape
# changed after this cell was last executed; confirm before re-running from a fresh kernel.
train_dfs, test_dfs = split_train_test(df) # deprecated

The class distributions of the train and test sets are very different:

In [15]:
# Concatenate all training conversations and show the relative frequency of each class
train_dfs_all = pd.concat(train_dfs)
train_dfs_all.CategoryBroad.value_counts()/train_dfs_all.CategoryBroad.count()

C    0.428960
D    0.289939
I    0.135282
O    0.090075
M    0.049626
S    0.006118
Name: CategoryBroad, dtype: float64

In [16]:
# Concatenate all test conversations and show the relative frequency of each class
test_dfs_all = pd.concat(test_dfs)
test_dfs_all.CategoryBroad.value_counts()/test_dfs_all.CategoryBroad.count()

D    0.632716
C    0.280864
S    0.043210
M    0.024691
I    0.012346
O    0.006173
Name: CategoryBroad, dtype: float64

In [17]:
# Majority class from train set: value_counts() orders labels by descending count,
# so the first index entry is the most frequent label.
majority_class = train_dfs_all.CategoryBroad.value_counts().index[0]
majority_class

'C'

**Prepare conversations**

In [18]:
# Represent each conversation by the list of index labels of its frame.
# The loop variable is deliberately not called `df`, to avoid confusion with the global frame.
conversation_list_train = [list(frame.index) for frame in train_dfs]
conversation_list_test = [list(frame.index) for frame in test_dfs]

In [19]:
# Number of conversations in the train and test splits
len(conversation_list_train), len(conversation_list_test)

(83, 21)

In [20]:
# Message indices that make up the first training conversation
conversation_list_train[0]

[1683, 1684, 1685, 1686, 1687, 1688]

## CRF training

In [23]:
# Select the feature extractor; the commented-out assignments are alternative feature sets.
# features_fn = get_features_length
# features_fn = functools.partial(get_features_bow, bow_values=bow)
features_fn = functools.partial(get_features_bow, bow_values=bow_tfidf)
labels_fn = get_label

# One feature sequence and one label sequence per conversation
X_train = [conversation2features(s, features_fn) for s in conversation_list_train]
y_train = [conversation2labels(s, labels_fn) for s in conversation_list_train]

X_test = [conversation2features(s, features_fn) for s in conversation_list_test]
y_test = [conversation2labels(s, labels_fn) for s in conversation_list_test]

In [24]:
# Feature dicts of the first training conversation (one dict per message)
X_train[0]

[{'bow_000': 0.0,
  'bow_001': 0.0,
  'bow_002': 0.5655558160696986,
  'bow_003': 0.0,
  'bow_004': 0.0,
  'bow_005': 0.0,
  'bow_006': 0.0,
  'bow_007': 0.0,
  'bow_008': 0.0,
  'bow_009': 0.0,
  'bow_010': 0.0,
  'bow_011': 0.0,
  'bow_012': 0.0,
  'bow_013': 0.8740385638344491,
  'bow_014': 0.0,
  'bow_015': 0.0,
  'bow_016': 0.0,
  'bow_017': 0.0,
  'bow_018': 0.0,
  'bow_019': 0.0,
  'bow_020': 0.0,
  'bow_021': 0.0,
  'bow_022': 0.0,
  'bow_023': 0.0,
  'bow_024': 0.0,
  'bow_025': 0.0,
  'bow_026': 0.0,
  'bow_027': 0.0,
  'bow_028': 0.0,
  'bow_029': 0.0,
  'bow_030': 0.0,
  'bow_031': 0.0,
  'bow_032': 0.0,
  'bow_033': 0.0,
  'bow_034': 0.0,
  'bow_035': 0.0,
  'bow_036': 0.0,
  'bow_037': 0.0,
  'bow_038': 0.0,
  'bow_039': 0.0,
  'bow_040': 0.0,
  'bow_041': 0.0,
  'bow_042': 0.0,
  'bow_043': 0.0,
  'bow_044': 0.0,
  'bow_045': 0.0,
  'bow_046': 0.0,
  'bow_047': 0.0,
  'bow_048': 0.0,
  'bow_049': 0.0,
  'bow_050': 0.0,
  'bow_051': 0.0,
  'bow_052': 0.0,
  'bow_053': 0.0

In [25]:
# Feature dicts of the first test conversation (one dict per message)
X_test[0]

[{'bow_000': 0.0,
  'bow_001': 0.8841266546286305,
  'bow_002': 0.0,
  'bow_003': 0.0,
  'bow_004': 0.0,
  'bow_005': 0.0,
  'bow_006': 0.0,
  'bow_007': 0.0,
  'bow_008': 0.0,
  'bow_009': 0.0,
  'bow_010': 0.0,
  'bow_011': 0.0,
  'bow_012': 0.0,
  'bow_013': 0.0,
  'bow_014': 0.0,
  'bow_015': 0.0,
  'bow_016': 0.0,
  'bow_017': 0.0,
  'bow_018': 0.0,
  'bow_019': 0.0,
  'bow_020': 0.0,
  'bow_021': 0.0,
  'bow_022': 0.0,
  'bow_023': 0.0,
  'bow_024': 0.0,
  'bow_025': 0.0,
  'bow_026': 0.0,
  'bow_027': 0.0,
  'bow_028': 0.0,
  'bow_029': 0.0,
  'bow_030': 0.0,
  'bow_031': 0.0,
  'bow_032': 0.0,
  'bow_033': 0.0,
  'bow_034': 0.0,
  'bow_035': 0.0,
  'bow_036': 0.0,
  'bow_037': 0.0,
  'bow_038': 0.0,
  'bow_039': 0.0,
  'bow_040': 0.0,
  'bow_041': 0.0,
  'bow_042': 0.0,
  'bow_043': 0.0,
  'bow_044': 0.0,
  'bow_045': 0.0,
  'bow_046': 0.0,
  'bow_047': 0.0,
  'bow_048': 0.0,
  'bow_049': 0.0,
  'bow_050': 0.0,
  'bow_051': 0.0,
  'bow_052': 0.0,
  'bow_053': 0.0,
  'bow_054': 

In [26]:
%%time
# Prepare the trainer: each training conversation is appended as one
# (feature sequence, label sequence) pair.
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 637 ms, sys: 25.5 ms, total: 662 ms
Wall time: 906 ms


In [27]:
# Training hyper-parameters; the same values are used again in the cross-validation loop.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
#     'max_iterations': 200,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [28]:
%%time
# Train the classifier; the fitted model is written to 'models/bow_tfidf.crfsuite'.
# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('models', exist_ok=True)

trainer.train('models/bow_tfidf.crfsuite')

CPU times: user 3.61 s, sys: 46.6 ms, total: 3.66 s
Wall time: 4.04 s


In [29]:
# Parsed log entry of the final training iteration
trainer.logparser.last_iteration

{'num': 90,
 'scores': {},
 'loss': 2021.861222,
 'feature_norm': 23.070673,
 'error_norm': 1.384544,
 'active_features': 663,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.035}

## Prediction

In [30]:
# Load the trained model for tagging.
# NOTE(review): open() returns a contextlib.closing wrapper (visible in this cell's output),
# and the tagger is never explicitly closed in this notebook.
tagger = pycrfsuite.Tagger()
tagger.open('models/bow_tfidf.crfsuite')

<contextlib.closing at 0x130892250>

In [31]:
# Show one test conversation: each message with its gold label,
# followed by the predicted and the correct label sequences.
example_convo = conversation_list_test[10]
for i in example_convo:
    msg = df.loc[i]
    print(f'{msg.CategoryBroad}: {msg.Message}')

# Features are recomputed for this conversation with the currently selected features_fn
print("Predicted:", ' '.join(tagger.tag(conversation2features(example_convo, features_fn))))
print("Correct:  ", ' '.join(conversation2labels(example_convo, labels_fn)))

D: če bi morala zapustiti svojo družino bi se počutila zelo žalostno.
D: malo bi bila žalostna
D: jaz pa zelo
D: Počutil bi se malo prestrašeno če je to 1, tudi žalostno, drugače pa ne.
D: o jaz pa zelo zelo zelo zelo zelo zalostno
D: vesela bi bila da bi spoznala nove prijatelje
D: a imate radi starše
D: jaz tudi ampak se vedno zelo zalostno
D: ja zelo
D: pa ti
D: Žalostno kaj pa ti ??
Predicted: D D D D D D D D C C C
Correct:   D D D D D D D D D D D


**Test accuracy**

In [32]:
# Flatten per-conversation predictions and gold labels of the test set into parallel lists
preds = [tag for convo in X_test for tag in tagger.tag(convo)]
labels = [tag for convo in y_test for tag in convo]
# Label set taken from the full dataset, not only from this split
tags = list(df.CategoryBroad.unique())

evaluate_solution(preds, labels, tags, majority_class=majority_class)

['C', 'I', 'D', 'O', 'M', 'S']
[[ 81   2   6   2   0   0]
 [  2   2   0   0   0   0]
 [126   1  67   3   8   0]
 [  2   0   0   0   0   0]
 [  6   0   0   0   2   0]
 [ 13   0   0   0   1   0]]
Our CA: 0.4691358024691358
Majority CA: 0.2808641975308642


**Train accuracy**

In [33]:
# Same evaluation on the training set (overwrites `preds`, `labels` and `tags` from the test cell)
preds = [tag for convo in X_train for tag in tagger.tag(convo)]
labels = [tag for convo in y_train for tag in convo]
tags = list(df.CategoryBroad.unique())

evaluate_solution(preds, labels, tags, majority_class=majority_class)

['C', 'I', 'D', 'O', 'M', 'S']
[[1103   36   65   50    8    0]
 [ 107  276   12    1    2    0]
 [ 140    6  685   15    7    0]
 [ 123    4   10  123    5    0]
 [  37    4   29    3   73    0]
 [   3    0    3    1    1   10]]
Our CA: 0.7715839564921821
Majority CA: 0.42895989123045547


## Cross validation

In [None]:
from tqdm import tqdm

In [22]:
# Cross-validation folds; the cells below unpack each fold as a (train_dfs, test_dfs) pair
cross_val_dfs = split_train_test(df)

In [25]:
def data_from_fold(fold, features_fn, labels_fn):
    """Build the CRF train/test datasets for one cross-validation fold.

    `fold` unpacks into two iterables of per-conversation frames. Each frame
    contributes one conversation, given by the list of its index labels.

    Returns the tuple ``(X_train, y_train, X_test, y_test)``, where every
    element is a list with one feature (or label) sequence per conversation.
    """
    train_frames, test_frames = fold

    def _to_sequences(frames):
        # List of conversations, each a list of the frame's index labels
        conversations = [list(frame.index) for frame in frames]
        feature_seqs = [conversation2features(convo, features_fn) for convo in conversations]
        label_seqs = [conversation2labels(convo, labels_fn) for convo in conversations]
        return feature_seqs, label_seqs

    X_train, y_train = _to_sequences(train_frames)
    X_test, y_test = _to_sequences(test_frames)

    return X_train, y_train, X_test, y_test

**Majority classifier**

In [34]:
# Baseline: for every fold, predict the training set's most frequent class for all
# test messages and record the resulting accuracy.
# NOTE: the loop rebinds the notebook-level names train_dfs, test_dfs, train_dfs_all,
# test_dfs_all and majority_class to the values of the last fold.
maj_accuracies = []

labels_fn = get_label
for fold in tqdm(cross_val_dfs):
    train_dfs, test_dfs = fold
    
    train_dfs_all = pd.concat(train_dfs)
    test_dfs_all = pd.concat(test_dfs)
    
    # Get majority class in train
    majority_class = train_dfs_all.CategoryBroad.value_counts().index[0]
    
    # Fraction of test messages whose label equals the majority class
    accuracy = np.mean(test_dfs_all.CategoryBroad == majority_class)
    maj_accuracies.append(accuracy)
    

100%|██████████| 12/12 [00:01<00:00,  6.32it/s]


In [57]:
# Feature extractor used for cross-validation; the commented-out lines are alternatives.
# features_fn = get_features_length
# features_fn = functools.partial(get_features_bow, bow_values=bow)
# features_fn = functools.partial(get_features_bow, bow_values=bow_tfidf)
features_fn = functools.partial(get_features_bow_length, bow_values=bow_tfidf)
labels_fn = get_label

# Every fold trains into this same file, overwriting the previous fold's model
model_name = 'models/cross_val.crfsuite'

**Our classifier**

In [58]:
# Cross-validate the CRF: train on each fold's training conversations, tag the
# fold's test conversations and record the per-message accuracy.
our_accuracies = []

# The output directory does not depend on the fold, so create it once up front.
os.makedirs('models', exist_ok=True)

for fold in tqdm(cross_val_dfs):
    X_train, y_train, X_test, y_test = data_from_fold(fold, features_fn, labels_fn)

    # Trainer: one (feature sequence, label sequence) pair per conversation
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    # Train (overwrites the model file written by the previous fold)
    trainer.train(model_name)

    # Load tagger and make sure the model handle is released again for this fold,
    # instead of leaving one open tagger behind per iteration.
    tagger = pycrfsuite.Tagger()
    tagger.open(model_name)
    try:
        # Flattened predictions over all test conversations
        preds = [tag for convo in X_test for tag in tagger.tag(convo)]
    finally:
        tagger.close()

    # Flattened gold labels, parallel to `preds`
    labels = [tag for convo in y_test for tag in convo]

    # Compute accuracy
    accuracy = np.mean([pred == label for pred, label in zip(preds, labels)])
    our_accuracies.append(accuracy)

100%|██████████| 12/12 [03:58<00:00, 19.87s/it]


In [59]:
# Mean accuracy over all folds: CRF model vs. majority-class baseline
print(f'Average accuracy (our): {np.mean(our_accuracies)}')
print(f'Average accuracy (maj): {np.mean(maj_accuracies)}')

Average accuracy (our): 0.6380940269980564
Average accuracy (maj): 0.3952239376898053
