In [69]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import random
import functools
from datetime import timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pycrfsuite

from preprocessing import read_dataset
from preprocessing import Tokenization, StopWordsRemover, Lemmatization, RoofRemoval, SpellingCorrection
from preprocessing import GibberishDetector, TokenGrouping, TokenDictionary, SentimentAnalysis

from baseline import evaluate_solution
from csv_parser import split_train_test

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read the data

In [2]:
# Read data
dataset_path = 'data/discussion_data.csv'
df = read_dataset(dataset_path)
df.head()

Unnamed: 0,School,Cohort,Book ID,Topic,Bookclub,User ID,Name,Message,Translation,Message Time,Is Answer,Page,Book relevance,Type,Category,CategoryBroad
0,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo se pogovarjati,Let's talk,2019-06-18 05:16:16 AM,No,4,No,S,CE,C
1,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kip je to,This is a statue,2019-06-18 05:17:29 AM,No,4,No,S,CO,C
2,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1392,MumaD18,Kdo je to jaz sem tara,Who is this I am Tara (girl's name),2019-06-18 05:17:59 AM,No,4,No,Q,IQ,I
3,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,kaj kip,what statue,2019-06-18 05:18:58 AM,No,4,No,S,CO,C
4,OŠ Ketteja in Murna,MumaD,8,Kako bi lahko Cefizelj pobegnil policistu še n...,Book Club One,1382,MumaD8,gremo ven,let's go outside,2019-06-18 05:19:24 AM,No,4,No,S,CO,C


In [66]:
df['Message Time'] = pd.to_datetime(df['Message Time'])

In [70]:
df['Message Time'][0] - timedelta(minutes=5)

Timestamp('2019-06-18 05:11:16')

In [None]:
df.School.unique()

In [None]:
df.CategoryBroad.value_counts()/df.CategoryBroad.count()

In [None]:
list(df.CategoryBroad.unique())

## Preprocessing

In [None]:
tokenizer = Tokenization()
stop_words_remover = StopWordsRemover('data/stopwords-sl-custom.txt')
lemmatizer = Lemmatization()

roof_removal = RoofRemoval()
spelling_correction = SpellingCorrection('data/dict-sl.txt', roof_removal)

gibberish_detector = GibberishDetector(roof_removal)
# Train gibberish_detector
gibberish_detector.train('data/dict-sl.txt', 'data/gibberish_good.txt', 'data/gibberish_bad.txt')

token_grouping = TokenGrouping(gibberish_detector)

In [None]:
# Tokenization
messages = df.Message
messages = [tokenizer.tokenize(message) for message in messages]

In [None]:
# Remove stop words
messages = [stop_words_remover.remove_stopwords(tokens) for tokens in messages]
messages_sent = messages

In [None]:
# Lemmatization
messages = [[lemmatizer.lemmatize(token) for token in message] for message in messages]

In [None]:
# Roof removal
messages = [[roof_removal.remove(token) for token in message] for message in messages]

In [None]:
# Spelling correction
# conversations = [[spelling_correction.replace_if_close(token) for token in tokens] for tokens in tqdm(conversations)]

In [None]:
# Token grouping
messages = [[token_grouping.group_tokens(token) for token in message] for message in messages]

In [None]:
# Create BoW dictionary
token_dict = TokenDictionary(messages)

In [None]:
# Get tf-idf weighted BoW representations
bow = np.stack([token_dict.bag_of_words(message) for message in messages])
bow_tfidf = np.stack([token_dict.bag_of_words(message, tf_idf=True) for message in messages])

## CRF features construction

In [15]:
sa = SentimentAnalysis('data/negative_words_Slolex.txt', 'data/positive_words_Slolex.txt', roof_removal)

In [84]:
# 1. Define feature functions

# Only length as a feature
def get_features_length(message_i):
    message = df.loc[message_i]['Message']
    features = {}
    
    count = len(message.split())
    
    count = len(message.split())
    features['words'] = count
    features['length'] = len(message)
    
    
    return features

# BoW as features
def get_features_bow(message_i, bow_values):
#     message = df.loc[message_i]
    message_bow = bow_values[message_i]
    
    features = {}
    for i, w in enumerate(message_bow):
        features[f'bow_{i:03d}'] = w
        
    return features

# BoW + length as features
def get_features_bow_length(message_i, bow_values):
    message = df.loc[message_i]['Message']
    message_bow = bow_values[message_i]
    
    features = {}
    for i, w in enumerate(message_bow):
        features[f'bow_{i:03d}'] = w
        
    # Add length
    count = len(message.split())
    features['words'] = count
    features['length'] = len(message)
        
    return features

def get_features_sent(message_id):
    message = messages_sent[message_id]
    sent = sa.sentiment(message)
    
    features = {'sentiment': sent}
    
    return features

def get_features_history(message_id):
    entry = df.loc[message_id]
    time = entry['Message Time']
    username = entry['Name']
    
    conversation = df[df['School'] == entry['School']]
    conversation = conversation[conversation['Bookclub'] == entry['Bookclub']]
    conversation = conversation[conversation['Topic'] == entry['Topic']]
    
    time_mask = (conversation['Message Time'] < time) & (conversation['Message Time'] > time - timedelta(minutes=5))
    username_mask = conversation['Name'] == username
    
    n_last_5min = len(conversation[time_mask & username_mask])
    n_posts_5min = len(conversation[time_mask])
    n_users_5min = len(conversation[time_mask]['Name'].unique())
    
    features = {
        'recent_user_posts': n_last_5min,
        'recent_posts': n_posts_5min,
        'recent_users': n_users_5min
    }
    
    return features

def get_label(message_i):
    message = df.loc[message_i]
    return message['CategoryBroad']

In [72]:
def conversation2features(conversation, feature_fn):
    features = [feature_fn(msg_i) for msg_i in conversation]
    
    return features
    
def conversation2labels(conversation, labels_fn):
    labels = [labels_fn(msg_i) for msg_i in conversation]
    return labels

## Split train, test

In [45]:
dfs_split = split_train_test(df)
train_dfs, test_dfs = dfs_split[0]

['OŠ Ketteja in Murna' 'OŠ Franca Rozmana Staneta' 'OŠ Nove Fužine'
 'OŠ Alojzija Šuštarja' 'OŠ Vižmarje - Brod' 'OŠ Vide Pregarc'
 'OŠ Valentina Vodnika' 'OŠ Koseze']


Distribution of train and test is very different

In [46]:
train_dfs_all = pd.concat(train_dfs)
train_dfs_all.CategoryBroad.value_counts()/train_dfs_all.CategoryBroad.count()

C    0.417851
D    0.345355
I    0.111111
O    0.060838
M    0.053188
S    0.011658
Name: CategoryBroad, dtype: float64

In [47]:
test_dfs_all = pd.concat(test_dfs)
test_dfs_all.CategoryBroad.value_counts()/test_dfs_all.CategoryBroad.count()

C    0.395393
D    0.211132
O    0.191939
I    0.186180
M    0.015355
S    0.000000
Name: CategoryBroad, dtype: float64

In [48]:
# Majority class from train set
majority_class = train_dfs_all.CategoryBroad.value_counts().index[0]
majority_class

'C'

**Prepare conversations**

In [49]:
# Get a list of conversations
conversation_list_train = [list(df.index) for df in train_dfs]
conversation_list_test = [list(df.index) for df in test_dfs]

In [50]:
len(conversation_list_train), len(conversation_list_test)

(85, 16)

In [51]:
conversation_list_train[0]

[1683, 1684, 1685, 1686, 1687, 1688]

## CRF training

In [85]:
# features_fn = get_features_length
# features_fn = functools.partial(get_features_bow, bow_values=bow)
# features_fn = functools.partial(get_features_bow_length, bow_values=bow_tfidf)
def features_fn(message_id):
    all_features = {}
    
    features1 = get_features_bow(message_id, bow_values=bow_tfidf)
    features2 = get_features_sent(message_id)
    features3 = get_features_history(message_id)
    features4 = get_features_length(message_id)
    
    all_features.update(features2)
    all_features.update(features3)
    all_features.update(features4)
    
    return all_features

labels_fn = get_label

X_train = [conversation2features(s, features_fn) for s in conversation_list_train]
y_train = [conversation2labels(s, labels_fn) for s in conversation_list_train]

X_test = [conversation2features(s, features_fn) for s in conversation_list_test]
y_test = [conversation2labels(s, labels_fn) for s in conversation_list_test]

In [86]:
X_train[1]

[{'sentiment': 0,
  'recent_user_posts': 0,
  'recent_posts': 0,
  'recent_users': 0,
  'words': 8,
  'length': 36},
 {'sentiment': 0,
  'recent_user_posts': 0,
  'recent_posts': 1,
  'recent_users': 1,
  'words': 10,
  'length': 56},
 {'sentiment': 0,
  'recent_user_posts': 1,
  'recent_posts': 2,
  'recent_users': 2,
  'words': 4,
  'length': 16},
 {'sentiment': 0,
  'recent_user_posts': 1,
  'recent_posts': 3,
  'recent_users': 2,
  'words': 14,
  'length': 66},
 {'sentiment': 0,
  'recent_user_posts': 2,
  'recent_posts': 4,
  'recent_users': 2,
  'words': 7,
  'length': 30},
 {'sentiment': 0,
  'recent_user_posts': 1,
  'recent_posts': 4,
  'recent_users': 2,
  'words': 5,
  'length': 21},
 {'sentiment': 0,
  'recent_user_posts': 1,
  'recent_posts': 3,
  'recent_users': 2,
  'words': 2,
  'length': 10},
 {'sentiment': 0,
  'recent_user_posts': 2,
  'recent_posts': 3,
  'recent_users': 2,
  'words': 2,
  'length': 9},
 {'sentiment': 0.2857142857142857,
  'recent_user_posts': 1,
  

In [25]:
X_test[0]

[{'bow_000': 0.0,
  'bow_001': 0.8841266546286305,
  'bow_002': 0.0,
  'bow_003': 0.0,
  'bow_004': 0.0,
  'bow_005': 0.0,
  'bow_006': 0.0,
  'bow_007': 0.0,
  'bow_008': 0.0,
  'bow_009': 0.0,
  'bow_010': 0.0,
  'bow_011': 0.0,
  'bow_012': 0.0,
  'bow_013': 0.0,
  'bow_014': 0.0,
  'bow_015': 0.0,
  'bow_016': 0.0,
  'bow_017': 0.0,
  'bow_018': 0.0,
  'bow_019': 0.0,
  'bow_020': 0.0,
  'bow_021': 0.0,
  'bow_022': 0.0,
  'bow_023': 0.0,
  'bow_024': 0.0,
  'bow_025': 0.0,
  'bow_026': 0.0,
  'bow_027': 0.0,
  'bow_028': 0.0,
  'bow_029': 0.0,
  'bow_030': 0.0,
  'bow_031': 0.0,
  'bow_032': 0.0,
  'bow_033': 0.0,
  'bow_034': 0.0,
  'bow_035': 0.0,
  'bow_036': 0.0,
  'bow_037': 0.0,
  'bow_038': 0.0,
  'bow_039': 0.0,
  'bow_040': 0.0,
  'bow_041': 0.0,
  'bow_042': 0.0,
  'bow_043': 0.0,
  'bow_044': 0.0,
  'bow_045': 0.0,
  'bow_046': 0.0,
  'bow_047': 0.0,
  'bow_048': 0.0,
  'bow_049': 0.0,
  'bow_050': 0.0,
  'bow_051': 0.0,
  'bow_052': 0.0,
  'bow_053': 0.0,
  'bow_054': 

In [26]:
%%time
# 4. Prepare trainer
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

CPU times: user 637 ms, sys: 25.5 ms, total: 662 ms
Wall time: 906 ms


In [27]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
#     'max_iterations': 200,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [28]:
%%time
# 4. Train a classifier (built classifier will be stored into a file "model.crf.tagger")
if not os.path.exists('models'):
    os.makedirs('models')
    
trainer.train('models/bow_tfidf.crfsuite')

CPU times: user 3.61 s, sys: 46.6 ms, total: 3.66 s
Wall time: 4.04 s


In [29]:
trainer.logparser.last_iteration

{'num': 90,
 'scores': {},
 'loss': 2021.861222,
 'feature_norm': 23.070673,
 'error_norm': 1.384544,
 'active_features': 663,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 0.035}

## Prediction

In [53]:
tagger = pycrfsuite.Tagger()
tagger.open('models/bow_tfidf.crfsuite')

<contextlib.closing at 0x12bf78518>

In [31]:
example_convo = conversation_list_test[10]
for i in example_convo:
    msg = df.loc[i]
    print(f'{msg.CategoryBroad}: {msg.Message}')

print("Predicted:", ' '.join(tagger.tag(conversation2features(example_convo, features_fn))))
print("Correct:  ", ' '.join(conversation2labels(example_convo, labels_fn)))

D: če bi morala zapustiti svojo družino bi se počutila zelo žalostno.
D: malo bi bila žalostna
D: jaz pa zelo
D: Počutil bi se malo prestrašeno če je to 1, tudi žalostno, drugače pa ne.
D: o jaz pa zelo zelo zelo zelo zelo zalostno
D: vesela bi bila da bi spoznala nove prijatelje
D: a imate radi starše
D: jaz tudi ampak se vedno zelo zalostno
D: ja zelo
D: pa ti
D: Žalostno kaj pa ti ??
Predicted: D D D D D D D D C C C
Correct:   D D D D D D D D D D D


**Test accuracy**

In [32]:
preds = [tag for convo in X_test for tag in tagger.tag(convo)]
labels = [tag for convo in y_test for tag in convo]
tags = list(df.CategoryBroad.unique())

evaluate_solution(preds, labels, tags, majority_class=majority_class)

['C', 'I', 'D', 'O', 'M', 'S']
[[ 81   2   6   2   0   0]
 [  2   2   0   0   0   0]
 [126   1  67   3   8   0]
 [  2   0   0   0   0   0]
 [  6   0   0   0   2   0]
 [ 13   0   0   0   1   0]]
Our CA: 0.4691358024691358
Majority CA: 0.2808641975308642


**Train accuracy**

In [33]:
preds = [tag for convo in X_train for tag in tagger.tag(convo)]
labels = [tag for convo in y_train for tag in convo]
tags = list(df.CategoryBroad.unique())

evaluate_solution(preds, labels, tags, majority_class=majority_class)

['C', 'I', 'D', 'O', 'M', 'S']
[[1103   36   65   50    8    0]
 [ 107  276   12    1    2    0]
 [ 140    6  685   15    7    0]
 [ 123    4   10  123    5    0]
 [  37    4   29    3   73    0]
 [   3    0    3    1    1   10]]
Our CA: 0.7715839564921821
Majority CA: 0.42895989123045547


## Cross validation

In [36]:
from tqdm import tqdm

In [54]:
cross_val_dfs = split_train_test(df)

['OŠ Ketteja in Murna' 'OŠ Franca Rozmana Staneta' 'OŠ Nove Fužine'
 'OŠ Alojzija Šuštarja' 'OŠ Vižmarje - Brod' 'OŠ Vide Pregarc'
 'OŠ Valentina Vodnika' 'OŠ Koseze']


In [87]:
def data_from_fold(fold, features_fn, labels_fn):
    train_dfs, test_dfs = fold
    
    # Get a list of conversations
    conversation_list_train = [list(df.index) for df in train_dfs]
    conversation_list_test = [list(df.index) for df in test_dfs]

    # Construct CRF datasets
    X_train = [conversation2features(s, features_fn) for s in conversation_list_train]
    y_train = [conversation2labels(s, labels_fn) for s in conversation_list_train]

    X_test = [conversation2features(s, features_fn) for s in conversation_list_test]
    y_test = [conversation2labels(s, labels_fn) for s in conversation_list_test]
    
    return X_train, y_train, X_test, y_test

**Majority classifier**

In [88]:
maj_accuracies = []

labels_fn = get_label
for fold in tqdm(cross_val_dfs):
    train_dfs, test_dfs = fold
    
    train_dfs_all = pd.concat(train_dfs)
    test_dfs_all = pd.concat(test_dfs)
    
    # Get majority class in train
    majority_class = train_dfs_all.CategoryBroad.value_counts().index[0]
    
    accuracy = np.mean(test_dfs_all.CategoryBroad == majority_class)
    maj_accuracies.append(accuracy)
    

100%|██████████| 8/8 [00:01<00:00,  4.87it/s]


In [95]:
# features_fn = get_features_length
# features_fn = functools.partial(get_features_bow, bow_values=bow)
# features_fn = functools.partial(get_features_bow, bow_values=bow_tfidf)
# features_fn = functools.partial(get_features_bow_length, bow_values=bow_tfidf)
def features_fn(message_id):
    all_features = {}
    
    features1 = get_features_bow(message_id, bow_values=bow_tfidf)
    features2 = get_features_sent(message_id)
    features3 = get_features_history(message_id)
    features4 = get_features_length(message_id)
    
    all_features.update(features1)
    all_features.update(features2)
    all_features.update(features3)
    all_features.update(features4)
    
    return all_features

labels_fn = get_label

model_name = 'models/cross_val.crfsuite'

**Our classifier**

In [96]:
our_accuracies = []

for fold in tqdm(cross_val_dfs):
    X_train, y_train, X_test, y_test = data_from_fold(fold, features_fn, labels_fn)
    
    # Trainer
    trainer = pycrfsuite.Trainer(verbose=False)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)
        
    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    
    if not os.path.exists('models'):
        os.makedirs('models')

    # Train
    trainer.train(model_name)
    
    # Load tagger
    tagger = pycrfsuite.Tagger()
    tagger.open(model_name)
    
    # Predictions and labels
    preds = [tag for convo in X_test for tag in tagger.tag(convo)]
    labels = [tag for convo in y_test for tag in convo]
    
    # Compute accuracy
    accuracy = np.mean([pred == label for pred, label in zip(preds, labels)])
    our_accuracies.append(accuracy)

100%|██████████| 8/8 [05:11<00:00, 38.89s/it]


In [97]:
print(f'Average accuracy (our): {np.mean(our_accuracies)}')
print(f'Average accuracy (maj): {np.mean(maj_accuracies)}')

Average accuracy (our): 0.645143221897118
Average accuracy (maj): 0.3932443041655893
