## Preprocess the dataset

In [None]:
# read the training file line by line, removing the newline characters from each line
with open("C:/ML/training.data") as f:
    raw_data = [line.strip('\n') for line in f]

# show the first raw line (label and text are still joined at this point)
print(raw_data[0])

In [None]:
# number of lines read from the training file
len(raw_data)

In [None]:
# split every line at the first occurrence of a space into [label, data]
label_data = [line.split(' ', maxsplit=1) for line in raw_data]
len(label_data)

In [None]:
# inspect the first example: the whole [label, data] pair, then each element
print(label_data[0])
print(label_data[0][0])
print(label_data[0][1])

In [None]:
# collect the full label (first element) of every [label, data] pair
label = [pair[0] for pair in label_data]

In [None]:
# full label of the first example
label[0]

In [None]:
# collect the distinct full labels and report how many classes there are
unique_label = list(set(label))
unique_label_count = len(unique_label)
print(unique_label_count, unique_label, sep="\n")

In [None]:
# split every full label at ':' and keep the first two parts in separate lists
label_part = [full_label.split(':') for full_label in label]
label_part_0 = [parts[0] for parts in label_part]
label_part_1 = [parts[1] for parts in label_part]

In [None]:
# inspect the first example: first label part, second label part, and both together
print(label_part_0[0])
print(label_part_1[0])
print(label_part[0])

In [None]:
# count unique label_part_0 and label_part_1 classes
# The position of a value in these lists is later used as its numerical label.
# Iteration order of a set of strings can change between interpreter runs, so the
# values are sorted to make the numerical encoding reproducible.
unique_label_0 = sorted(set(label_part_0))
unique_label_count_0 = len(unique_label_0)
print(unique_label_count_0)
print(unique_label_0)

unique_label_1 = sorted(set(label_part_1))
unique_label_count_1 = len(unique_label_1)
print(unique_label_count_1)
print(unique_label_1)

In [None]:
# generate numerical labels instead of word labels by indexes in unique labels above
# Map each label part to its position in the corresponding unique list once, so every
# example is encoded with two dictionary lookups instead of scanning all (j, k) pairs.
index_0 = {name: position for position, name in enumerate(unique_label_0)}
index_1 = {name: position for position, name in enumerate(unique_label_1)}

# [j, k] per example, where j indexes unique_label_0 and k indexes unique_label_1
label_part_num = [[index_0[parts[0]], index_1[parts[1]]] for parts in label_part]
label_part_0_num = [pair[0] for pair in label_part_num]
label_part_1_num = [pair[1] for pair in label_part_num]

In [None]:
# compare the word labels of the first example ...
print(label_part_0[0])
print(label_part_1[0])
print(label_part[0])

# ... with their numerical counterparts
print(label_part_0_num[0])
print(label_part_1_num[0])
print(label_part_num[0])

In [None]:
# generate unique numerical labels
# this is the total space of classes, possibly useful in training/test
# Note to self: during test, need to check if a numerical label was present in training set - if not
# classifier has not seen this label during training, and cannot classify
unique_label_part_num = []
seen = set()

# keep the first occurrence of every [j, k] pair, in order of appearance
for pair in label_part_num:
    key = tuple(pair)
    if key in seen:
        continue
    seen.add(key)
    unique_label_part_num.append(pair)

unique_label_part_num_count = len(unique_label_part_num)

print(unique_label_part_num_count, unique_label_part_num, sep="\n")

#### Observation: there are 50 different classes in total, formed from combinations of 6 label_part_0 values and 47 label_part_1 values

In [None]:
# collect the text (second element) of every [label, data] pair
data = [pair[1] for pair in label_data]

In [None]:
# text of the first example, without its label
data[0]

In [None]:
# number of examples
len(data)

#### Labels and data are now stored in label_part_num, label_part_0_num, label_part_1_num, and data lists

In [None]:
# copy/rename data and labels to X and Y lists (for convention reasons)
# Note that elements of X list are strings! - this is required for CountVectorizer input data format

# shallow copies: the new lists share their elements with the originals
X = list(data)
Y = list(label_part)
Y_num = list(label_part_num)

# one string label per example, built from the two numerical label parts
Y_str = [",".join((str(pair[0]), str(pair[1]))) for pair in Y_num]

In [None]:
# combined string label of the first example: the two numerical label parts joined by a comma
Y_str[0]

In [None]:
# check the element type of X (the comment on X above states that strings are required)
type(X[0])

In [None]:
# check list lengths: X, Y and Y_num should all contain one entry per example
for seq in (X, Y, Y_num):
    print(len(seq))

#### Data and labels are now stored in X, Y, Y_num, and Y_str lists

## Divide dataset into training and test sets

In [None]:
# split all datasets above into training/test sets with ratio 80/20%
from sklearn.model_selection import train_test_split

# A single call splits every list with the same shuffled indices, so X, Y, Y_num and
# Y_str stay aligned by construction instead of relying on repeated calls that happen
# to share the same random_state and length. The outputs come back as
# (train, test) pairs in the order the inputs were given.
(
    train_set_X, test_set_X,
    train_set_Y, test_set_Y,
    train_set_Y_num, test_set_Y_num,
    train_set_Y_str, test_set_Y_str,
) = train_test_split(X, Y, Y_num, Y_str, test_size=0.2, random_state=42)

In [None]:
# sizes of the training and test sets, followed by their total
n_train = len(train_set_X)
n_test = len(test_set_Y)
print(n_train)
print(n_test)
print(str(n_train + n_test))

In [None]:
# pick examples in test set, for which there are labels in the training set

test_set_X_filt = []
test_set_Y_str_filt = []

# build the set of training labels once: membership tests against a set are O(1),
# whereas testing against the training list rescans it for every test example
train_labels_seen = set(train_set_Y_str)

for test_text, test_label in zip(test_set_X, test_set_Y_str):
    if test_label in train_labels_seen:
        test_set_Y_str_filt.append(test_label)
        test_set_X_filt.append(test_text)

# number of test examples that remain after filtering
len(test_set_Y_str_filt)

In [None]:
# number of labels in the filtered test set
unique = len(set(test_set_Y_str_filt))
print(unique)
# number of labels in unfiltered/original test set
unique = len(set(test_set_Y_str))
print(unique)

#### Since the length of the filtered test set is unchanged (3091), by chance all 50 of the labels in the test set are present in the training set.

## Tokenize text with CountVectorizer

In [None]:
# Note! CountVectorizer expects input data as sequence of strings, i.e. nparray will NOT work!
from sklearn.feature_extraction.text import CountVectorizer

# The training texts belong in fit_transform(), not in the constructor: the first
# constructor parameter is `input` (the kind of input: 'content', 'filename' or 'file'),
# and recent scikit-learn versions reject positional constructor arguments altogether.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_set_X)
# (number of training examples, vocabulary size)
X_train_counts.shape

In [None]:
# test: look up index value for individual word (e.g."when")
# .get() returns None when the word is not in the fitted vocabulary
count_vect.vocabulary_.get('when')

In [None]:
# test: sparse count row of the first training example (train_set_X[0], not X[0])
print(X_train_counts[0])

## Compute TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# learn the IDF weights from the training counts, then re-weight those same counts
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

## Train a Naive Bayes classifier

In [None]:
# Fit a multinomial Naive Bayes model on the TF-IDF features of the training set
from sklearn.naive_bayes import MultinomialNB
import numpy as np

clf = MultinomialNB()
clf.fit(X_train_tfidf, train_set_Y_str)

# Accuracy on the training set: fraction of examples whose prediction equals the true label
predicted = clf.predict(X_train_tfidf)
np.mean(predicted == train_set_Y_str)

## Build a pipeline

In [None]:
# build intent classifier pipeline
from sklearn.pipeline import Pipeline

# word counts -> TF-IDF weighting -> multinomial Naive Bayes, applied in this order
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
intent_clf = Pipeline(pipeline_steps)

In [None]:
# train classifier in one line, using pipeline
# the pipeline takes the raw training strings directly; vectorizing and TF-IDF happen inside it
intent_clf.fit(train_set_X, train_set_Y_str)

In [None]:
# check prediction on the training set
# result: fraction of training examples whose predicted label equals the true label
predicted = intent_clf.predict(train_set_X)
np.mean(predicted == train_set_Y_str)

In [None]:
# check prediction on the test set
# uses the filtered test set, i.e. only examples whose label also occurs in the training set
predicted = intent_clf.predict(test_set_X_filt)
np.mean(predicted == test_set_Y_str_filt)

## Train an SVM classifier

In [None]:
# Note! the n_iter parameter was renamed max_iter in sklearn v.0.19 and has since been removed;
# max_iter=5 with tol=None (no early stopping) runs exactly 5 passes over the data, as n_iter=5 did
from sklearn.linear_model import SGDClassifier

# linear SVM (hinge loss, L2 penalty) trained with SGD on TF-IDF features
intent_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
])

In [None]:
# fit the SVM pipeline on the training set, then check prediction on the training set
intent_clf.fit(train_set_X, train_set_Y_str)
predicted = intent_clf.predict(train_set_X)
# fraction of training examples whose predicted label equals the true label
np.mean(predicted == train_set_Y_str)

In [None]:
# check prediction on the test set
# uses the filtered test set, i.e. only examples whose label also occurs in the training set
predicted = intent_clf.predict(test_set_X_filt)
np.mean(predicted == test_set_Y_str_filt)

### Performance analysis of the results

In [None]:
from sklearn import metrics

# per-class report for the filtered test set
# NOTE: `predicted` is reassigned in several cells; this expects the filtered test-set
# predictions from the cell directly above
print(metrics.classification_report(test_set_Y_str_filt, predicted))

In [None]:
# confusion matrix of true labels versus predictions on the filtered test set
# NOTE: expects `predicted` to still hold the filtered test-set predictions
metrics.confusion_matrix(test_set_Y_str_filt, predicted)

### Parameter tuning using grid search

In [None]:
from sklearn.model_selection import GridSearchCV
from time import time
from pprint import pprint

# candidate values per pipeline step; keys have the form "<step name>__<parameter name>"
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3, 1e-4),
}
gs_clf = GridSearchCV(intent_clf, parameters, n_jobs=-1) # n_jobs = -1 uses all available cores

print("Performing grid search...")
print("pipeline:", [name for name, _ in intent_clf.steps])
print("parameters:")
pprint(parameters)
t0 = time()

gs_clf.fit(train_set_X, train_set_Y_str)

print("done in %0.3fs" % (time() - t0))
print()

# best_score_ is the mean cross-validated score of the best parameter combination,
# i.e. it is measured on held-out folds of the training set
print("Best score: %0.3f" % gs_clf.best_score_)
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# accuracy of the refitted best model on the data it was trained on; this is a
# training-set figure, not the cross-validated score printed above
predicted = gs_clf.predict(train_set_X)
train_accuracy = np.mean(predicted == train_set_Y_str)

print("Accuracy on training set: %0.3f" % train_accuracy)

#### The model with the optimized parameter set can fit the training set (almost) perfectly, i.e. it is easy to overfit this set

In [None]:
# per-class report on the training set
# NOTE: expects `predicted` to hold the training-set predictions from the grid-search cell
print(metrics.classification_report(train_set_Y_str, predicted))

In [None]:
# predict with best parameters found by grid search above
best_clf = gs_clf.best_estimator_

# predict on the filtered test examples so that every prediction lines up one-to-one
# with its label in test_set_Y_str_filt; predicting on the unfiltered test_set_X is only
# correct when filtering happens to remove nothing
predicted = best_clf.predict(test_set_X_filt)
test_accuracy = np.mean(predicted == test_set_Y_str_filt)

print("Accuracy on test set: %0.3f" % test_accuracy)

### Test set accuracy is 97.2%: the model will overfit the (small) test set, if parameters are optimized further

In [None]:
# per-class report on the filtered test set
# NOTE: expects `predicted` to hold the test-set predictions from the cell above
print(metrics.classification_report(test_set_Y_str_filt, predicted))