In [1]:
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from pprint import pprint
from nltk.corpus import stopwords

In [2]:
# Function to show the feature weights of a document (to be explained later)
def feature_values(doc, representer):
    doc_representation = representer.transform([doc])
    features = representer.get_feature_names()
    return [(features[index], doc_representation[0, index]) for index in doc_representation.nonzero()[1]]

# (Test) Binary text classification problem

In [25]:
# Artificial (and small) dataset. Sports/Non-sports
train_data = ['Football: a great sport', 'The referee has been very bad this season', 'Our team scored 5 goals', 'I love tenis',
              'Politics is in decline in the UK', 'Brexit means Brexit', 'The parlament wants to create new legislation'
              'I so want to travel the world']

test_data = ['Swimming is a great sport', 
             'A lot of policy changes will happen after Brexit', 
             'The table tenis team will travel to the UK soon for the European Championship']

#1: sports, 0 other
train_labels = [1,1,1,1,0,0,0]
test_labels = [1,0,1]

# Representation of the data using TF-IDF
vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# Train the classifier given the training data
classifier = LinearSVC()
classifier.fit(vectorised_train_data, train_labels)

# Predict the labels for the test documents (not used for training)
predictions = classifier.predict(vectorised_test_data)

pprint(predictions)
print()

# Why is the prediction of the last item "non-sport"?

array([1, 0, 0])



In [27]:
pprint([feature_values(doc, vectorizer) for doc in test_data])

[[('sport', 0.57735026918962573),
  ('is', 0.57735026918962573),
  ('great', 0.57735026918962573)],
 [('brexit', 1.0)],
 [('uk', 0.3239165812325589),
  ('travel', 0.3239165812325589),
  ('to', 0.3239165812325589),
  ('the', 0.68948549079226851),
  ('tenis', 0.3239165812325589),
  ('team', 0.3239165812325589)]]


In [28]:
# Lets try again... with stop-word removal
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(stop_words=stop_words)

vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

classifier = LinearSVC()
classifier.fit(vectorised_train_data, train_labels)

predictions = classifier.predict(vectorised_test_data)

pprint(predictions)
print()

pprint([feature_values(doc, vectorizer) for doc in test_data])

array([1, 0, 1])

[[('sport', 0.70710678118654757), ('great', 0.70710678118654757)],
 [('brexit', 1.0)],
 [('uk', 0.5), ('travel', 0.5), ('tenis', 0.5), ('team', 0.5)]]


# (Test) Multi-Class classification problem

In [38]:
# Artificial (and small) dataset. Spanish,English,French texts
train_data = ['PyCon es una gran conferencia', 'Aprendizaje automatico esta listo para dominar el mundo dentro de poco',
             'This is a great conference with a lot of amazing talks', 'AI will dominate the world in the near future',
             'Dix chiffres por resumer le feuilleton de la loi travail']

test_data = ['Estoy preparandome para dominar las olimpiadas', 'Me gustaria mucho aprender el lenguage de programacion Scala',
             'Machine Learning is amazing']

#1: sports, 0 other
train_labels = [0,0,1,1,2]
test_labels = [0,0,1]

# Representation of the data using TF-IDF (keeping the stop-words!!)
vectorizer = TfidfVectorizer()
vectorised_train_data = vectorizer.fit_transform(train_data)
vectorised_test_data = vectorizer.transform(test_data)

# Train the classifier given the training data
classifier = LinearSVC()
classifier.fit(vectorised_train_data, train_labels)

# Predict the labels for the test documents (not used for training)
predictions = classifier.predict(vectorised_test_data)

pprint(predictions)
print()

array([0, 0, 1])



# Data Collection Stats

In [None]:
# List of documents
documents = reuters.fileids()
print("Documents: {}".format(len(documents)))

In [None]:
train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
print("Total train documents: {}".format(len(train_docs)))

test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
print("Total test documents: {}".format(len(test_docs)))

In [None]:
# List of categories 
categories = reuters.categories();
print("Number of categories: {}".format(len(categories)))

categories

In [None]:
# Documents per category. TODO: Lambdas too complex?
category_distribution = sorted([[category, len(reuters.fileids(category))] for category in categories], 
                               key=lambda item:item[1], 
                               reverse=True)

category_distribution[:10]

In [None]:
category_distribution[-10:]

In [None]:
# Number of labels (different than number of documents...)
sum([distribution for category, distribution in category_distribution])

In [None]:
# Documents with multiple labels
doc = 'training/9865'
print(reuters.raw(doc))

reuters.categories(doc)

In [None]:
# Tokenisation 
vectorizer = TfidfVectorizer()

# Learn and transform train documents
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)

# Transform multilabel labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(document) for document in train_docs])
test_labels = mlb.fit_transform([reuters.categories(document) for document in test_docs])

# Classifier 
classifier = OneVsRestClassifier(LinearSVC())
classifier.fit(vectorised_train_documents, train_labels)
predictions = classifier.predict(vectorised_test_documents)

sum([sum(prediction) for prediction in predictions])

In [None]:
y = mlb.fit_transform(all_labels)
    classifier = OneVsRestClassifier(LinearSVC())

    scores = cross_val_score(classifier, X, y=y, cv=10, scoring='f1_micro')
    print("Average F1: {}".format(np.mean(scores)))