In [None]:
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from pprint import pprint
from nltk.corpus import stopwords

In [None]:
# Function to show the feature weights of a document (to be explained later)
def feature_values(doc, representer):
    """Return the non-zero feature weights of a single document.

    Parameters
    ----------
    doc : str
        Raw text of the document to represent.
    representer : fitted vectorizer
        Object exposing ``transform`` and either ``get_feature_names_out``
        (current scikit-learn) or the legacy ``get_feature_names``.

    Returns
    -------
    list of (feature, weight) tuples
        One pair per non-zero entry in the document's representation,
        in the column order reported by ``nonzero()``.
    """
    doc_representation = representer.transform([doc])
    # ``get_feature_names`` was deprecated and later removed from scikit-learn
    # vectorizers; prefer its replacement and fall back for older versions.
    if hasattr(representer, "get_feature_names_out"):
        features = representer.get_feature_names_out()
    else:
        features = representer.get_feature_names()
    return [(features[index], doc_representation[0, index])
            for index in doc_representation.nonzero()[1]]

# Binary text classification problem

In [None]:
# Artificial (and small) dataset. Sports/Non-sports
# Every document needs its own trailing comma: adjacent string literals without
# one are silently concatenated by Python into a single document.
train_data = ['Football: a great sport', 'The referee has been very bad this season', 'Our team scored 5 goals', 'I love tenis',
              'Politics is in decline in the UK', 'Brexit means Brexit', 'The parlament wants to create new legislation',
              'I so want to travel the world']

test_data = ['Swimming is a great sport',
             'A lot of policy changes will happen after Brexit',
             'The table tenis team will travel to the UK soon for the European Championship']

# One label per training document, in the same order as train_data
train_labels = ["Sports", "Sports", "Sports", "Sports",
                "Non Sports", "Non Sports", "Non Sports", "Non Sports"]
test_labels = ["Sports", "Non Sports", "Sports"]

# Guard against documents and labels drifting out of sync
assert len(train_data) == len(train_labels), "each training document needs exactly one label"
assert len(test_data) == len(test_labels), "each test document needs exactly one label"

# Representation of the data using TF-IDF
# (the vocabulary is learnt from the training documents only)
vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# Train the classifier given the training data
classifier = LinearSVC()
classifier.fit(vectorised_train_data, train_labels)

# Predict the labels for the test documents (not used for training)
print(classifier.predict(vectorised_test_data))

# Why is the last item predicted as "Non Sports"?

In [None]:
# Show the (term, weight) pairs that the current vectorizer assigns to each test document
pprint([
    feature_values(text, vectorizer)
    for text in test_data
])

In [None]:
# Second attempt: same pipeline, but English stop words are removed before weighting
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Learn the vocabulary from the training documents, then project the test documents onto it
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# fit() returns the fitted estimator, so construction and training can be chained
classifier = LinearSVC().fit(vectorised_train_data, train_labels)

print(classifier.predict(vectorised_test_data))
print()

# Per-document (term, weight) pairs after stop-word removal
pprint([
    feature_values(text, vectorizer)
    for text in test_data
])

# Multi-Class classification problem

In [None]:
# Artificial (and small) dataset: short texts written in Spanish, English and French
train_data = [
    'PyCon es una gran conferencia',
    'Aprendizaje automatico esta listo para dominar el mundo dentro de poco',
    'This is a great conference with a lot of amazing talks',
    'AI will dominate the world in the near future',
    'Dix chiffres por resumer le feuilleton de la loi travail',
]

test_data = [
    'Estoy preparandome para dominar las olimpiadas',
    'Me gustaria mucho aprender el lenguage de programacion Scala',
    'Machine Learning is amazing',
]

# Language codes used as class labels: "SP" = Spanish, "EN" = English, "FR" = French
train_labels = ["SP", "SP", "EN", "EN", "FR"]
test_labels = ["SP", "SP", "EN"]

# TF-IDF representation learnt from the training documents
vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# fit() returns the fitted estimator, so construction and training can be chained
classifier = LinearSVC().fit(vectorised_train_data, train_labels)
predictions = classifier.predict(vectorised_test_data)

pprint(predictions)
print()

# Per-document (term, weight) pairs for the test documents
pprint([
    feature_values(text, vectorizer)
    for text in test_data
])

# Multi-label Problem

In [None]:
# Artificial (and small) dataset. Sports and Politics
# Every document needs its own trailing comma: adjacent string literals without
# one are silently concatenated by Python into a single document.
train_data = ['Football: a great sport', 'The referee has been very bad this season', 'Our team scored 5 goals', 'I love tenis',
              'Politics is in decline in the UK', 'Brexit means Brexit', 'The parlament wants to create new legislation',
              'I so want to travel the world',
              'The goverment will increase the budget for sports in the UK after the victories in the Olimpic Games']

test_data = ['Swimming is a great sport',
             'A lot of policy changes will happen after Brexit',
             'The table tenis team will travel to the UK soon for the European Championship',
             'The goverment will increase the budget for sports in the UK after the victories in the Olimpic Games']

# One label set per training document, in the same order as train_data.
# NOTE(review): the travel sentence is neither Sports nor Politics, so it is
# given an empty label set here -- confirm this is the intended labelling.
train_labels = [["Sports"], ["Sports"], ["Sports"], ["Sports"],
                ["Politics"], ["Politics"], ["Politics"],
                [],
                ["Politics", "Sports"]]
test_labels = [["Sports"], ["Politics"], ["Sports"], ["Politics", "Sports"]]

# Guard against documents and label sets drifting out of sync
assert len(train_data) == len(train_labels), "each training document needs exactly one label set"
assert len(test_data) == len(test_labels), "each test document needs exactly one label set"

# Change the representation: one binary indicator column per label
mlb = MultiLabelBinarizer()
binary_train_labels = mlb.fit_transform(train_labels)
binary_test_labels = mlb.transform(test_labels)

binary_train_labels

In [None]:
# TF-IDF representation with the stop-word list defined earlier in the notebook
vectorizer = TfidfVectorizer(stop_words=stop_words)
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# One LinearSVC per label column; fit() returns the fitted estimator
classifier = OneVsRestClassifier(LinearSVC()).fit(vectorised_train_data, binary_train_labels)
predictions = classifier.predict(vectorised_test_data)

# Raw indicator matrix first ...
print(predictions)
print()

# ... then the same predictions mapped back to label names
print(mlb.inverse_transform(predictions))

# Evaluation

In [None]:
# Binary problem with integer labels
binary_labels = [1, 0, 1]
binary_predictions = [1, 0, 0]

# Scores are reported for the positive class, which is 1 unless pos_label is given
print("Binary quality")
print("Precision: {}, Recall: {}, F1-measure: {}".format(
    *(metric(binary_labels, binary_predictions)
      for metric in (precision_score, recall_score, f1_score))
))

# Same problem with string labels
binary_labels = ["A", "B", "A"]
binary_predictions = ["A", "B", "B"]

# The positive class has to be named explicitly here: scores are for class "A"
print("Precision: {}, Recall: {}, F1-measure: {}".format(
    *(metric(binary_labels, binary_predictions, pos_label="A")
      for metric in (precision_score, recall_score, f1_score))
))

In [None]:
# Multi-class: true and predicted categories drawn from 0, 1 and 2
multi_class_labels = [0, 0, 0, 0, 0, 1, 1, 2]
multi_class_predictions = [0, 0, 0, 0, 0, 1, 2, 1]

# With more than two classes, quality is reported either per category
# (average=None) or aggregated across categories (micro / macro average)
print("Precision per category (0, 1, 2) {}".format(
    precision_score(multi_class_labels, multi_class_predictions, average=None)))
print("Micro-average Precision {}".format(
    precision_score(multi_class_labels, multi_class_predictions, average='micro')))
print("Macro-average Precision {}".format(
    precision_score(multi_class_labels, multi_class_predictions, average='macro')))
print()


# Precision, recall and F1 under the same micro-averaging scheme
print("Micro-average quality numbers")
print("Precision: {}, Recall: {}, F1-measure: {}".format(
    *(metric(multi_class_labels, multi_class_predictions, average='micro')
      for metric in (precision_score, recall_score, f1_score))
))


In [None]:
# Multi-label: every sample carries a list of categories (the last prediction has two)
multi_class_labels = [[0], [0], [0], [0], [0], [1], [1], [2]]
multi_class_predictions = [[0], [0], [0], [0], [0], [1], [2], [1, 2]]

# Convert both to indicator matrices; fit_transform refits the existing `mlb`
# binarizer on these labels
binarised_labels = mlb.fit_transform(multi_class_labels)
binarised_decisions = mlb.transform(multi_class_predictions)

print("Micro-average Precision {}".format(
    precision_score(binarised_labels, binarised_decisions, average='micro')))
print("Macro-average Precision {}".format(
    precision_score(binarised_labels, binarised_decisions, average='macro')))
print()

# Precision, recall and F1 under the same micro-averaging scheme
print("Micro-average quality numbers")
print("Precision: {}, Recall: {}, F1-measure: {}".format(
    *(metric(binarised_labels, binarised_decisions, average='micro')
      for metric in (precision_score, recall_score, f1_score))
))
