In [1]:
from collections import defaultdict
import pickle 

import numpy as np
from bokeh.plotting import figure, show, output_notebook

output_notebook()

In [2]:
NTOPICS = 40  # Number of topics {6, 40}
#datapath = 'data/aud_subspace_mbn_4g_gamma_dirichlet_process_ldim40.trans'
#datapath = 'data/aud_subspace_mbn_4g_gamma_dirichlet_process_ldim100.trans'
#datapath = 'data/aud_mbn_4g_gamma_dirichlet_process.trans'
datapath = 'data/aud_subspace_mfcc_8k_4g_gamma_dirichlet_process_ldim40.trans'

with open(f'data/fisher_{NTOPICS}c_train.flist', 'r') as f:
    train_docs = [line.strip() for line in f]
    
with open(f'data/fisher_{NTOPICS}c_test.flist', 'r') as f:
    test_docs = [line.strip() for line in f]
        
with open('data/tID_tName.pkl', 'rb') as f:
    topic_names = pickle.load(f)
        
document2topic = {}                                                            
with open('data/fe_03_p1_calldata.tbl', 'r') as f:                                            
    next(f) # skip the first line.                                             
    for line in f:                                                             
        tokens = line.strip().split(',')                                       
        docid, raw_topic_name = tokens[0], tokens[2]
        if raw_topic_name != '' and (docid in train_docs or docid in test_docs):      
            topicid = int(raw_topic_name[3:])
            document2topic[docid] = topic_names[topicid]

document2topic_MIT = {}
with open('data/fID_tID.pkl', 'rb') as f:
    labels = pickle.load(f)
for doc, topicid in labels.items():
    document2topic_MIT[doc] = topic_names[topicid]
    

diff = 0
for i, docid in enumerate(document2topic):
    if document2topic[docid] != document2topic_MIT[docid]:
        diff += 1
diff / len(document2topic)

0.07456140350877193

## Data preparation

In [12]:
NTOPICS = 40  # Number of topics {6, 40}
#datapath = 'data/aud_subspace_mbn_4g_gamma_dirichlet_process_ldim40.trans'
#datapath = 'data/aud_subspace_mbn_4g_gamma_dirichlet_process_ldim100.trans'
#datapath = 'data/aud_mbn_4g_gamma_dirichlet_process.trans'
datapath = 'data/aud_mfcc_8k_4g_gamma_dirichlet_process.trans'


with open(f'data/fisher_{NTOPICS}c_train.flist', 'r') as f:
    train_docs = [line.strip() for line in f]
    
with open(f'data/fisher_{NTOPICS}c_test.flist', 'r') as f:
    test_docs = [line.strip() for line in f]
    
with open('data/tID_tName.pkl', 'rb') as f:
    topic_names = pickle.load(f)
        
# Load all documents with their associated topic label.
#document2topic = {}                                                            
#with open('data/fe_03_p1_calldata.tbl', 'r') as f:                                            
#    next(f) # skip the first line.                                             
#    for line in f:                                                             
#        tokens = line.strip().split(',')                                       
#        docid, topicid = tokens[0], tokens[2]             
#        if topicid != '' and (docid in train_docs or docid in test_docs):                              
#            document2topic[docid] = topicid     
            
document2topic = {}
with open('data/fID_tID.pkl', 'rb') as f:
    labels = pickle.load(f)
for doc, topicid in labels.items():
    document2topic[doc] = topic_names[topicid]
    
        
train_documents = []
test_documents = []
for docid in document2topic:
    if docid in train_docs:
        train_documents.append(docid)
    elif docid in test_docs:
        test_documents.append(docid)
            
# Build the reverse mapping topic -> documents
topic2document = defaultdict(list)
for doc, topic in document2topic.items():
    topic2document[topic].append(doc)
topics = sorted(list(topic2document.keys()))

topic2document_train = defaultdict(list)
for doc, topic in document2topic.items():
    if doc in train_docs:
        topic2document_train[topic].append(doc)

topic2document_test = defaultdict(list)
for doc, topic in document2topic.items():
    if doc in test_docs:
        topic2document_test[topic].append(doc)

# Load the raw data
rawdata = defaultdict(list)                                                     
with open(datapath, 'r') as f:                                                  
    for line in f:                                                          
        tokens = line.strip().split()                                       
        docid = tokens[0].replace('fe_03_', '')[:5]                        
        rawdata[docid].append(' '.join(tokens[1:]))

In [23]:
topic_names

{1: 'Sports on TV',
 2: 'Pets',
 3: 'Life Partners',
 4: 'Minimum Wage',
 5: 'Comedy',
 6: 'Perjury',
 7: 'Money to Leave US',
 8: 'Opening Own Business',
 9: 'Time Travel',
 10: 'Anonymous Benefactor',
 11: 'US Public Schools',
 12: 'Affirmative Action',
 13: 'Movies',
 14: 'Computer Games',
 15: 'Current Events',
 16: 'Hobbies',
 17: 'Smoking',
 18: 'Terrorism',
 19: 'Televised Criminal Trials',
 20: 'Drug Testing',
 21: 'Family Values',
 22: 'Censorship',
 23: 'Health and Fitness',
 24: 'September 11',
 25: 'Strikes by Athletes',
 26: 'Airport Security',
 27: 'Middle East Issues',
 28: 'Foreign Relations',
 29: 'Education',
 30: 'Family',
 31: 'Corporate Conduct',
 32: 'Outdoor Activities',
 33: 'Friends',
 34: 'Food',
 35: 'Illness',
 36: 'Personal Habits',
 37: 'Reality TV',
 38: 'Iraq Arms Inspections',
 39: 'Holidays',
 40: 'Bioterrorism'}

## Features selection

The vocabulary is selected per topic by choosing the $K$ n-gram $w$ with the highest following propbability:

$$
p(t | w ) = \frac{f_{wt} + |T|p(t)}{f_w + |T|}
$$

In [13]:
p_t = np.array([float(len(topic2document_train[topic])) for topic in topics]) 
p_t /= p_t.sum()
ngram_order = 3

class NGramCounter:
    
    def __init__(self, order=3, prior_count=0.):
        self.order = order
        self.counts = defaultdict(lambda: prior_count)
        
    def add(self, doc):
        for utt in doc:
            new_utt = '<s> ' * (self.order - 1)  + utt
            tokens = new_utt.split()
            for i in range(len(tokens) - self.order):
                ngram = tuple(tokens[i:i+self.order])
                self.counts[ngram] += 1
                
    def get_counts(self, vocab=None):
        if vocab is None:
            vocab = sorted(list(self.counts.keys()))
        return [self.counts[word] for word in vocab]
        
# Evaluate the counts. 
global_counter = NGramCounter(order=ngram_order, prior_count=NTOPICS)
topic_counters = {}
for p_topic, topic in zip(p_t, topics):
    topic_counters[topic] = NGramCounter(order=ngram_order, prior_count=NTOPICS * p_topic)
    for doc in topic2document_train[topic]:
        global_counter.add(rawdata[doc])
        topic_counters[topic].add(rawdata[doc])

In [14]:
topic_ranked_ngrams = {}
for topic in topics:
    ranked_ngrams = []
    for ngram in global_counter.counts:
        score = topic_counters[topic].counts[ngram] / global_counter.counts[ngram]
        ranked_ngrams.append((ngram, score))
    topic_ranked_ngrams[topic]  = list(reversed(sorted(ranked_ngrams, key=lambda x: x[1])))

In [15]:
nbest = 100
vocab = set()
for ngrams in topic_ranked_ngrams.values():
    vocab = vocab.union([ngram for ngram, score in ngrams[:nbest]])
vocab = sorted(list(vocab))
len(vocab), vocab[:100]

(3976,
 [('<s>', 'sil', 'au55'),
  ('<s>', 'sil', 'au74'),
  ('<s>', 'sil', 'au86'),
  ('au1', 'au100', 'au89'),
  ('au1', 'au11', 'au86'),
  ('au1', 'au17', 'au44'),
  ('au1', 'au17', 'au8'),
  ('au1', 'au22', 'au61'),
  ('au1', 'au25', 'au2'),
  ('au1', 'au33', 'au65'),
  ('au1', 'au35', 'au16'),
  ('au1', 'au35', 'au64'),
  ('au1', 'au35', 'au84'),
  ('au1', 'au37', 'au55'),
  ('au1', 'au39', 'au59'),
  ('au1', 'au55', 'au35'),
  ('au1', 'au60', 'au1'),
  ('au1', 'au61', 'au31'),
  ('au1', 'au61', 'au65'),
  ('au1', 'au61', 'au81'),
  ('au1', 'au63', 'au65'),
  ('au1', 'au70', 'au78'),
  ('au1', 'au77', 'au19'),
  ('au1', 'au81', 'au82'),
  ('au1', 'au82', 'au19'),
  ('au1', 'au85', 'au83'),
  ('au1', 'au89', 'au23'),
  ('au1', 'au92', 'au83'),
  ('au1', 'au93', 'au36'),
  ('au10', 'au12', 'au41'),
  ('au10', 'au16', 'au73'),
  ('au10', 'au17', 'au44'),
  ('au10', 'au2', 'au14'),
  ('au10', 'au2', 'au99'),
  ('au10', 'au21', 'au1'),
  ('au10', 'au22', 'au78'),
  ('au10', 'au26', 'au

## Features

Using the selected ngrams, we represent each document as a bag-of-ngrams

In [16]:
train_X = np.zeros((len(train_documents), len(vocab)), dtype=float)
train_y = np.zeros((len(train_documents)))
for i, docid in enumerate(train_documents):
    counter = NGramCounter(order=ngram_order)
    counter.add(rawdata[docid])
    train_X[i] = counter.get_counts(vocab)
    train_y[i] = topics.index(document2topic[docid])
    
test_X = np.zeros((len(test_documents), len(vocab)), dtype=float)
test_y = np.zeros((len(test_documents)))
for i, docid in enumerate(test_documents):
    counter = NGramCounter(order=ngram_order)
    counter.add(rawdata[docid])
    test_X[i] = counter.get_counts(vocab)
    test_y[i] = topics.index(document2topic[docid])

In [17]:
train_X.sum(), test_X.sum(), len(train_X), len(test_X)
train_X[:10].sum(axis=1)

array([ 97., 132., 119., 124., 107.,  78.,  90., 107., 108., 113.])

## Topic classification

Using "sklearn" to build the pipeline

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


clf = Pipeline([
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=13, max_iter=20, tol=1e-1)),
])


parameters = {
}

gs_clf = GridSearchCV(clf, parameters, cv=5, iid=False)
gs_clf.fit(train_X, train_y)


predicted = gs_clf.predict(test_X)
print(f'Accuracy: {np.mean(predicted == test_y) * 100:.3f} %')

Accuracy: 18.513 %


accuracy for the AUD HMM without features selection: 37.755 % <br>
accuracy for the AUD subspace HMM (dim 40): 48.834 %

In [20]:
train_corpus = ['\n'.join(rawdata[doc]) for doc in train_documents]
test_corpus = ['\n'.join(rawdata[doc]) for doc in test_documents]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(ngram_range=(3, 3))
train_X = count_vect.fit_transform(train_corpus)
test_X = count_vect.transform(test_corpus)

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

tf_transformer = TfidfTransformer(use_idf=True).fit(train_X)
train_X_tfidf = tf_transformer.transform(train_X)
test_X_tfidf = tf_transformer.transform(test_X)

clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=13,
                    max_iter=20, tol=1e-3).fit(train_X_tfidf, train_y)

predicted = clf.predict(train_X_tfidf)
np.mean(predicted == train_y)

predicted = clf.predict(test_X_tfidf)
print(f'Accuracy: {np.mean(predicted == test_y) * 100:.3f} %')

Accuracy: 16.108 %
